In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm
from collections import Counter, defaultdict
from utils.imdbmovie import get_movies
from statsmodels.tools import add_constant
import statsmodels.api as sm
import pickle
import torch

In [13]:
profession_frequency_df = pd.read_csv("data/analysis_data/top500_merged_profession_frequency.csv", index_col=None)
soc_frequency_df = pd.read_csv("data/analysis_data/soc_frequency.csv", index_col=None)
frequency_df = pd.read_csv("data/mentions/frequency.csv", index_col=None)
imdb_df = pd.read_csv("data/imdb/imdb.ngram.csv", index_col=None)
employment_df = pd.read_csv("data/employment/emp.csv", index_col=None)
mentions_df = pd.read_csv("data/mentions/mentions.word_filtered.sense_filtered.soc_mapped.merged.csv", index_col=None, dtype={"soc_code":str, "soc_name":str})

In [4]:
profession_frequency_df.columns, profession_frequency_df.shape

(Index(['profession_merge', '1950', '1951', '1952', '1953', '1954', '1955',
        '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964',
        '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973',
        '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982',
        '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991',
        '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000',
        '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
        '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017'],
       dtype='object'),
 (500, 69))

In [5]:
profession_frequency_df.iloc[0]

profession_merge     accountant
1950                1.39417e-05
1951                2.91238e-06
1952                7.98228e-06
1953                2.95388e-06
                       ...     
2013                5.34861e-06
2014                4.94726e-06
2015                4.64196e-06
2016                6.72435e-06
2017                4.12004e-06
Name: 0, Length: 69, dtype: object

In [6]:
soc_frequency_df.columns, soc_frequency_df.shape

(Index(['soc_code', 'soc_name', '1950', '1951', '1952', '1953', '1954', '1955',
        '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964',
        '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973',
        '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982',
        '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991',
        '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000',
        '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
        '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017'],
       dtype='object'),
 (23, 70))

In [7]:
frequency_df.columns

Index(['profession', 'no_pos_sense', '1950', '1951', '1952', '1953', '1954',
       '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963',
       '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972',
       '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981',
       '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990',
       '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999',
       '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017'],
      dtype='object')

In [8]:
frequency_df.shape

(4336, 70)

In [9]:
imdb_df.columns

Index(['year', 'imdb_ID', 'xml_ID', 'n_sentences', 'n_words',
       'n_sentences_title', 'imdb_year', 'imdb_genres', 'imdb_countries',
       'imdb_kind', 'imdb_languages', 'imdb_title', 'first_imdb_country',
       'first_imdb_language', '1_gram_count', '2_gram_count', '3_gram_count',
       '4_gram_count', '5_gram_count'],
      dtype='object')

In [11]:
frequency_df["profession"].unique().size

4073

In [14]:
mentions_df.shape

(3657827, 19)

In [15]:
mentions_df.columns

Index(['profession', 'imdb', 'sent', 'rsi', 'left', 'mention', 'right',
       'start', 'end', 'pos', 'ner', 'sense', 'no_pos_sense', 'is_profession',
       'is_nopos_profession', 'is_person', 'profession_merge', 'soc_code',
       'soc_name'],
      dtype='object')

In [16]:
mentions_df["profession_merge"]

0             1st lieutenant
1             1st lieutenant
2             1st lieutenant
3             1st lieutenant
4             1st lieutenant
                 ...        
3657822            zoologist
3657823            zoologist
3657824    zoology professor
3657825    zoology professor
3657826    zoology professor
Name: profession_merge, Length: 3657827, dtype: object

In [17]:
imdb_df.shape

(135998, 19)

In [18]:
imdb_df.columns

Index(['year', 'imdb_ID', 'xml_ID', 'n_sentences', 'n_words',
       'n_sentences_title', 'imdb_year', 'imdb_genres', 'imdb_countries',
       'imdb_kind', 'imdb_languages', 'imdb_title', 'first_imdb_country',
       'first_imdb_language', '1_gram_count', '2_gram_count', '3_gram_count',
       '4_gram_count', '5_gram_count'],
      dtype='object')

In [19]:
professions_df = pd.read_csv("data/mentions/professions.word_filtered.sense_filtered.merged.csv", index_col=None)

In [21]:
professions_df.columns

Index(['profession', 'n_mentions', 'n_words', 'n_senses', 'n_noun_senses',
       'n_professional_senses', 'n_non_professional_senses',
       'profession_merge'],
      dtype='object')

In [25]:
movie_frequency_df = imdb_df[["imdb_ID", "year", "imdb_genres", "imdb_countries", "imdb_kind", "1_gram_count", "2_gram_count", "3_gram_count", "4_gram_count", "5_gram_count"]].copy()

In [30]:
movie_frequency_df.index = movie_frequency_df["imdb_ID"]

In [32]:
movie_frequency_df.drop(columns="imdb_ID", inplace=True)

In [23]:
professions = professions_df["profession_merge"].unique()[:500]

In [26]:
for profession in professions:
    movie_frequency_df[profession] = 0

In [41]:
movie_profession_frequency_dict = defaultdict(lambda: defaultdict(int))

In [43]:
for imdb_ID, movie_df in tqdm(mentions_df.groupby("imdb"), total=mentions_df["imdb"].unique().size, desc="imdb"):
    movie_ngram_counts = movie_frequency_df.loc[imdb_ID, ["1_gram_count", "2_gram_count", "3_gram_count", "4_gram_count", "5_gram_count"]].values
    for profession, movie_profession_df in movie_df.groupby("profession_merge"):
        if profession in professions:
            ngram = len(profession.split())
            frequency = len(movie_profession_df)/movie_ngram_counts[ngram - 1]
            movie_profession_frequency_dict[imdb_ID][profession] = frequency

imdb: 100%|██████████| 133133/133133 [14:52<00:00, 149.17it/s]


In [48]:
set(list(movie_profession_frequency_dict.keys())).issubset(set(movie_frequency_df.index))

True

In [56]:
imdb_profession_frequency_data = []
for imdb, profession_frequency_dict in tqdm(movie_profession_frequency_dict.items()):
    record = [imdb]
    for profession in professions:
        record.append(profession_frequency_dict[profession])
    imdb_profession_frequency_data.append(record)

100%|██████████| 135998/135998 [00:16<00:00, 8302.89it/s]


In [58]:
len(imdb_profession_frequency_data)

135998

In [59]:
len(imdb_profession_frequency_data[0])

501

In [62]:
len(list(professions) + ["imdb_ID"])

501

In [63]:
imdb_frequency_df = pd.DataFrame(data=imdb_profession_frequency_data, columns=["imdb_ID"] + professions.tolist())

In [64]:
imdb_frequency_df

Unnamed: 0,imdb_ID,doctor,police,captain,cop,officer,president,detective,teacher,lawyer,...,watcher,medical officer,border patrol,state trooper,gold digger,department head,chamberlain,cinematographer,chief executive,sketch artist
0,35423,0.000000,0.000000,0.000000,0.0,0.000000,0.000181,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,36574,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,38089,0.000000,0.000000,0.000772,0.0,0.000386,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40284,0.000000,0.000363,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,40300,0.000472,0.000945,0.000787,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135993,2140039813,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135994,2140039815,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135995,2140039822,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135996,2140039828,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
xdf = imdb_df[["imdb_ID", "imdb_kind", "year", "imdb_genres", "imdb_countries"]].merge(imdb_frequency_df, on="imdb_ID", how="left")

In [67]:
xdf.columns

Index(['imdb_ID', 'imdb_kind', 'year', 'imdb_genres', 'imdb_countries',
       'doctor', 'police', 'captain', 'cop', 'officer',
       ...
       'watcher', 'medical officer', 'border patrol', 'state trooper',
       'gold digger', 'department head', 'chamberlain', 'cinematographer',
       'chief executive', 'sketch artist'],
      dtype='object', length=505)

In [71]:
xdf.iloc[:,5:].isna().sum().sum()

0

In [73]:
xdf["year"].describe()

count    135998.000000
mean       2002.273394
std          15.091057
min        1950.000000
25%        1997.000000
50%        2008.000000
75%        2013.000000
max        2018.000000
Name: year, dtype: float64

In [74]:
xdf.sort_values(by=["year","imdb_ID"])

Unnamed: 0,imdb_ID,imdb_kind,year,imdb_genres,imdb_countries,doctor,police,captain,cop,officer,...,watcher,medical officer,border patrol,state trooper,gold digger,department head,chamberlain,cinematographer,chief executive,sketch artist
3,40284,movie,1950,Drama;History,France;Italy,0.000000,0.000363,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,40300,movie,1950,Drama,Mexico,0.000472,0.000945,0.000787,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,41134,movie,1950,Drama;Romance,United Kingdom,0.000328,0.000000,0.000000,0.000000,0.000164,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,41181,movie,1950,Crime;Film-Noir;Thriller,United States,0.000702,0.001545,0.000000,0.000421,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,41227,movie,1950,Action;Adventure;Romance;Thriller,United States,0.000644,0.000184,0.004327,0.000000,0.000460,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58973,902246,,2018,,,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85493,1825683,movie,2018,Action;Adventure;Sci-Fi,United States,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122827,4881806,movie,2018,Action;Adventure;Sci-Fi,United States,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127249,5463162,movie,2018,Action;Adventure;Comedy;Sci-Fi,United States,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
imdb_frequency_df = pd.read_csv("data/analysis_data/imdb_frequency.csv", index_col=None)

In [76]:
imdb_frequency_df.columns

Index(['imdb_ID', 'imdb_kind', 'year', 'imdb_genres', 'imdb_countries',
       'doctor', 'police', 'captain', 'cop', 'officer',
       ...
       'watcher', 'medical officer', 'border patrol', 'state trooper',
       'gold digger', 'department head', 'chamberlain', 'cinematographer',
       'chief executive', 'sketch artist'],
      dtype='object', length=505)

In [77]:
imdb_frequency_df.shape

(135993, 505)

In [78]:
imdb_frequency_df["imdb_ID"].unique().size

135993

In [79]:
imdb_frequency_df["imdb_kind"].unique()

array(['movie', 'tv series', 'episode', 'tv short', nan, 'tv mini series',
       'tv movie', 'video movie', 'video game'], dtype=object)

In [80]:
imdb_frequency_df["year"].unique()

array([1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960,
       1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017])

In [82]:
movie_data = get_movies(imdb_frequency_df["imdb_ID"][:100])

In [83]:
len(movie_data)

100

In [84]:
movie_data.keys()

dict_keys(['0042498', '0042509', '0042513', '0042003', '0042005', '0042519', '0042524', '0042530', '0042531', '0042541', '0042030', '0042542', '0042546', '0042039', '0042052', '0042579', '0042591', '0042593', '0042604', '0042610', '0042106', '0042116', '0042629', '0042639', '0042643', '0042644', '0042646', '0042648', '0041640', '0042664', '0042665', '0042669', '0041134', '0042176', '0042179', '0042192', '0042194', '0042195', '0042198', '0042200', '0041181', '0041694', '0042206', '0042208', '0042209', '0042219', '0042226', '0042229', '0041719', '0042234', '0042235', '0041227', '0042256', '0042265', '0042274', '0042275', '0042276', '0042279', '0042280', '0042281', '0042285', '0042286', '0042287', '0042289', '0042295', '0042296', '0042301', '0042313', '0042323', '0041303', '0042327', '0040284', '0042332', '0042338', '0042343', '0042344', '0040300', '0042352', '0042355', '0042367', '0042369', '0042372', '0042376', '0042383', '0042393', '0042395', '0042397', '0041387', '0042426', '0042428',

In [85]:
movie_data["0040284"]

<Movie id:0040284[http] title:_Sins of Pompeii (1950)_>

In [87]:
movie_data["0040284"].keys()

['cast',
 'genres',
 'runtimes',
 'countries',
 'country codes',
 'language codes',
 'color info',
 'aspect ratio',
 'sound mix',
 'certificates',
 'original air date',
 'rating',
 'votes',
 'cover url',
 'plot outline',
 'languages',
 'title',
 'year',
 'kind',
 'directors',
 'writers',
 'producers',
 'composers',
 'cinematographers',
 'editors',
 'editorial department',
 'production designers',
 'set decorators',
 'costume designers',
 'make up department',
 'production managers',
 'assistant directors',
 'art department',
 'sound department',
 'special effects',
 'camera department',
 'music department',
 'miscellaneous',
 'akas',
 'writer',
 'director',
 'production companies',
 'distributors',
 'plot',
 'canonical title',
 'long imdb title',
 'long imdb canonical title',
 'smart canonical title',
 'smart long imdb canonical title',
 'full-size cover url']

In [90]:
bourne_data = get_movies(["0440963"])

In [91]:
bourne_data.keys()

dict_keys(['0440963'])

In [96]:
bourne_data["0440963"]["box office"]

{'Budget': '$110,000,000 (estimated)',
 'Opening Weekend United States': '$69,283,690, 05 Aug 2007',
 'Cumulative Worldwide Gross': '$442,824,138, 29 Nov 2007'}

In [97]:
bourne_data["0440963"]["box office"]["Opening Weekend United States"].split()[0].lstrip("$").replace(",", "")

'69283690'

In [95]:
movie_data = get_movies(imdb_df["imdb_ID"], verbose=True)

Getting IMDb data...: 100%|██████████| 135998/135998 [12:50<00:00, 176.41it/s] 


In [98]:
for data in movie_data.values():
    try:
        box = data["box office"]["Opening Weekend United States"].split()[0].lstrip("$").replace(",", "")
        print(box)
    except Exception:
        pass

6003
99462
11020798
5900000
14506
6281415
13335
19001
16349565
53452
27652
53003468
6518
51185
68108790
17135
6878437
17275239
5310711
52823
15831700
5254
927161
1573454
11277367
7009513
74618
37607
13351
7004254
17728313
30082000
17128062
57640
198461
18931
4208655
59215365
10422
3467
346999
3629
294056
1714000
8007059
9307394
30181877
193021
11102948
29027348
27000
6003806
15077
11554015
46353
11208851
321515
23618556
15491
2874
30514
66286
13455016
1561949
6616571
16021684
276981
4770360
18730762
6041521
90589
12289375
12001256
5345250
35201
18510
10177257
49038712
260865
197773
60551
11161074
44523
263002
15210156
12691415
11019224
10749
114530
5554594
16687773
140822
22089322
2416
24660
10367
1461
1340
21689125
6773870
6575000
18129
5064077
3251884
217332
54607747
4750894
17816230
20300
1402823
3447
19112404
7702439
54155312
2054
384478
103629
21577049
72611427
46312454
18554948
83200
89499
22610437
14506464
108966307
2501096
7366207
30097040
8935426
7720942
13482638
12562
20828
3

In [99]:
imdb_df["imdb_ID"].dtype

dtype('int64')

In [100]:
imdb_frequency_df = pd.read_csv("data/analysis_data/imdb_frequency.csv", index_col=None)

In [110]:
genres = [genre for genre_list in imdb_frequency_df["imdb_genres"].dropna().str.split(";") for genre in genre_list]

In [111]:
len(genres)

379332

In [112]:
len(set(genres))

28

In [113]:
Counter(genres)

Counter({'Drama': 77459,
         'History': 4871,
         'Romance': 23314,
         'Crime': 27052,
         'Film-Noir': 172,
         'Thriller': 26600,
         'Action': 25559,
         'Adventure': 19603,
         'Comedy': 50867,
         'Family': 13000,
         'Fantasy': 15538,
         'War': 3942,
         'Biography': 2724,
         'Sport': 1674,
         'Mystery': 24113,
         'Musical': 1874,
         'Western': 1766,
         'Animation': 11726,
         'Short': 3072,
         'Sci-Fi': 19216,
         'Music': 3231,
         'Documentary': 7099,
         'Horror': 10105,
         'News': 401,
         'Talk-Show': 348,
         'Adult': 204,
         'Game-Show': 1441,
         'Reality-TV': 2361})

In [115]:
countries = [country for country_list in imdb_frequency_df["imdb_countries"].dropna().str.split(";") for country in country_list]

In [116]:
len(set(countries))

179

In [119]:
country_items = sorted(Counter(countries).items(), key=lambda x: x[1], reverse=True)

In [132]:
top10_countries = [country for country, _ in country_items[:10]]
top20_countries = [country for country, _ in country_items[:20]]
top25_countries = [country for country, _ in country_items[:25]]
top30_countries = [country for country, _ in country_items[:30]]

In [121]:
top10_countries

['United States',
 'United Kingdom',
 'Canada',
 'France',
 'Japan',
 'Italy',
 'Germany',
 'India',
 'Hong Kong',
 'Spain']

In [141]:
top25_countries

['United States',
 'United Kingdom',
 'Canada',
 'France',
 'Japan',
 'Italy',
 'Germany',
 'India',
 'Hong Kong',
 'Spain',
 'South Korea',
 'Australia',
 'Sweden',
 'Denmark',
 'West Germany',
 'Mexico',
 'Belgium',
 'China',
 'Netherlands',
 'Norway',
 'Poland',
 'Brazil',
 'Argentina',
 'Soviet Union',
 'Thailand']

In [122]:
imdb_frequency_df["imdb_countries"].dropna().size

100347

In [125]:
imdb_frequency_df["imdb_countries"].dropna().str.split(";").apply(lambda x: len(set(x).intersection(set(top10_countries))) > 0).sum()

88573

In [126]:
imdb_frequency_df["imdb_countries"].dropna().str.split(";").apply(lambda x: len(set(x).intersection(set(top10_countries))) > 0).sum()/imdb_frequency_df["imdb_countries"].dropna().size

0.8826671450068263

In [129]:
imdb_frequency_df["imdb_countries"].dropna().str.split(";").apply(lambda x: len(set(x).intersection(set(top20_countries))) > 0).sum()/imdb_frequency_df["imdb_countries"].dropna().size

0.9408253360837893

In [131]:
imdb_frequency_df["imdb_countries"].dropna().str.split(";").apply(lambda x: len(set(x).intersection(set(top30_countries))) > 0).sum()/imdb_frequency_df["imdb_countries"].dropna().size

0.9718277576808475

In [133]:
imdb_frequency_df["imdb_countries"].dropna().str.split(";").apply(lambda x: len(set(x).intersection(set(top25_countries))) > 0).sum()/imdb_frequency_df["imdb_countries"].dropna().size

0.9611448274487528

In [139]:
np.array(["x", "t", "w"]) + np.array(["w"])

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U1'), dtype('<U1')) -> dtype('<U1')

In [140]:
imdb_frequency_df["imdb_kind"].dropna().unique()

array(['movie', 'tv series', 'episode', 'tv short', 'tv mini series',
       'tv movie', 'video movie', 'video game'], dtype=object)

In [142]:
imdb_frequency_df.columns

Index(['imdb_ID', 'imdb_kind', 'year', 'imdb_genres', 'imdb_countries',
       'doctor', 'police', 'captain', 'cop', 'officer',
       ...
       'watcher', 'medical officer', 'border patrol', 'state trooper',
       'gold digger', 'department head', 'chamberlain', 'cinematographer',
       'chief executive', 'sketch artist'],
      dtype='object', length=505)

In [143]:
imdb_kind_set = imdb_frequency_df["imdb_kind"].dropna().unique().tolist()
imdb_kind_data = np.zeros((len(imdb_frequency_df), len(imdb_kind_set)), dtype=np.int)
for i, imdb_kind in tqdm(enumerate(imdb_frequency_df["imdb_kind"]), desc="imdb_kind"):
    if pd.notna(imdb_kind):
        imdb_kind_data[i, imdb_kind_set.index(imdb_kind)] = 1
imdb_kind_header = [f"kind_{kind.replace(' ','_')}" for kind in imdb_kind_set]

imdb_genre_set = list(set([genre for genre_list in imdb_frequency_df["imdb_genres"].dropna().str.split(";") for genre in genre_list]))
imdb_genre_data = np.zeros((len(imdb_frequency_df), len(imdb_genre_set)), dtype=np.int)
for i, imdb_genres in tqdm(enumerate(imdb_frequency_df["imdb_genres"]), desc="imdb_genre"):
    if pd.notna(imdb_genres):
        for genre in imdb_genres.split(";"):
            imdb_genre_data[i, imdb_genre_set.index(genre)] = 1
imdb_genre_header = [f"genre_{genre.replace('-','_')}" for genre in imdb_genre_set]

imdb_country_list = [country for country_list in imdb_frequency_df["imdb_countries"].dropna().str.split(";") for country in country_list]
imdb_country_items = sorted(Counter(imdb_country_list).items(), key = lambda x: x[1], reverse=True)
imdb_country_set = [country for country, _ in imdb_country_items[:25]]
imdb_country_data = np.zeros((len(imdb_frequency_df), len(imdb_country_set)), dtype=np.int)
for i, imdb_countries in tqdm(enumerate(imdb_frequency_df["imdb_countries"]), desc="imdb_country"):
    if pd.notna(imdb_countries):
        for country in imdb_countries.split(";"):
            if country in imdb_country_set:
                imdb_country_data[i, imdb_country_set.index(country)] = 1
imdb_country_header = [f"country_{country.replace(' ','_')}" for country in imdb_country_set]

imdb_kind: 135993it [00:00, 166600.21it/s]
imdb_genre: 135993it [00:01, 78652.49it/s]
imdb_country: 135993it [00:01, 117767.85it/s]


In [144]:
imdb_frequency_df = imdb_frequency_df.drop(columns=["imdb_kind", "imdb_genres", "imdb_countries"])
imdb_frequency_df = pd.concat([imdb_frequency_df, pd.DataFrame(data=imdb_kind_data, columns=imdb_kind_header)], axis=1)
imdb_frequency_df = pd.concat([imdb_frequency_df, pd.DataFrame(data=imdb_genre_data, columns=imdb_genre_header)], axis=1)
imdb_frequency_df = pd.concat([imdb_frequency_df, pd.DataFrame(data=imdb_country_data, columns=imdb_country_header)], axis=1)

In [148]:
imdb_frequency_df = pd.read_csv("data/analysis_data/imdb_frequency.csv", index_col=None)

In [150]:
imdb_frequency_df.columns.tolist()

['imdb_ID',
 'year',
 'kind_movie',
 'kind_tv_series',
 'kind_episode',
 'kind_tv_short',
 'kind_tv_mini_series',
 'kind_tv_movie',
 'kind_video_movie',
 'kind_video_game',
 'genre_Adult',
 'genre_Fantasy',
 'genre_Game_Show',
 'genre_Action',
 'genre_Adventure',
 'genre_News',
 'genre_Film_Noir',
 'genre_Short',
 'genre_Western',
 'genre_Thriller',
 'genre_Horror',
 'genre_Crime',
 'genre_Sci_Fi',
 'genre_Comedy',
 'genre_Mystery',
 'genre_Talk_Show',
 'genre_Family',
 'genre_Animation',
 'genre_Musical',
 'genre_War',
 'genre_Reality_TV',
 'genre_Music',
 'genre_History',
 'genre_Sport',
 'genre_Biography',
 'genre_Romance',
 'genre_Drama',
 'genre_Documentary',
 'country_United_States',
 'country_United_Kingdom',
 'country_Canada',
 'country_France',
 'country_Japan',
 'country_Italy',
 'country_Germany',
 'country_India',
 'country_Hong_Kong',
 'country_Spain',
 'country_South_Korea',
 'country_Australia',
 'country_Sweden',
 'country_Denmark',
 'country_West_Germany',
 'country_Me

In [153]:
(imdb_frequency_df.iloc[:,-500:]==0).sum().sum()

65189651

In [155]:
professions

array(['doctor', 'police', 'captain', 'cop', 'officer', 'president',
       'detective', 'teacher', 'lawyer', 'soldier', 'general',
       'lieutenant', 'professor', 'director', 'judge', 'sergeant',
       'commander', 'guard', 'artist', 'manager', 'inspector', 'minister',
       'nurse', 'district attorney', 'coach', 'chef', 'sheriff', 'actor',
       'priest', 'mayor', 'major', 'secretary', 'attorney', 'scientist',
       'writer', 'reporter', 'governor', 'pilot', 'spy', 'hunter',
       'senator', 'singer', 'policeman', 'farmer', 'maid', 'dancer',
       'deputy', 'commissioner', 'bobby', 'producer', 'engineer',
       'surgeon', 'police officer', 'chairman', 'actress', 'cook', 'monk',
       'ranger', 'journalist', 'cowboy', 'prosecutor', 'principal',
       'editor', 'admiral', 'designer', 'poet', 'sailor', 'ambassador',
       'musician', 'marshall', 'counsellor', 'photographer', 'nanny',
       'waiter', 'dealer', 'bishop', 'marine', 'scout', 'nun',
       'investigator', 'thera

In [156]:
imdb_frequency_df[professions]

Unnamed: 0,doctor,police,captain,cop,officer,president,detective,teacher,lawyer,soldier,...,watcher,medical officer,border patrol,state trooper,gold digger,department head,chamberlain,cinematographer,chief executive,sketch artist
0,0.000000,0.000000,0.000000,0.0,0.000000,0.000181,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000772,0.0,0.000386,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000363,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000472,0.000945,0.000787,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135993,,,,,,,,,,,...,,,,,,,,,,
135994,,,,,,,,,,,...,,,,,,,,,,
135995,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135996,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000097,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [160]:
imdb_frequency_df[professions].isna().all(axis=1).sum()

3057

In [161]:
imdb_frequency_df[professions] = imdb_frequency_df[professions].fillna(0)

In [163]:
imdb_frequency_df.isna().sum().sum()

130569

In [164]:
imdb_frequency_df[professions].isna().sum().sum()

0

In [165]:
imdb_frequency_df.columns

Index(['imdb_ID', 'year', 'kind_movie', 'kind_tv_series', 'kind_episode',
       'kind_tv_short', 'kind_tv_mini_series', 'kind_tv_movie',
       'kind_video_movie', 'kind_video_game',
       ...
       'watcher', 'medical officer', 'border patrol', 'state trooper',
       'gold digger', 'department head', 'chamberlain', 'cinematographer',
       'chief executive', 'sketch artist'],
      dtype='object', length=564)

In [166]:
imdb_frequency_df.columns[1:-500]

Index(['year', 'kind_movie', 'kind_tv_series', 'kind_episode', 'kind_tv_short',
       'kind_tv_mini_series', 'kind_tv_movie', 'kind_video_movie',
       'kind_video_game', 'genre_Adult', 'genre_Fantasy', 'genre_Game_Show',
       'genre_Action', 'genre_Adventure', 'genre_News', 'genre_Film_Noir',
       'genre_Short', 'genre_Western', 'genre_Thriller', 'genre_Horror',
       'genre_Crime', 'genre_Sci_Fi', 'genre_Comedy', 'genre_Mystery',
       'genre_Talk_Show', 'genre_Family', 'genre_Animation', 'genre_Musical',
       'genre_War', 'genre_Reality_TV', 'genre_Music', 'genre_History',
       'genre_Sport', 'genre_Biography', 'genre_Romance', 'genre_Drama',
       'genre_Documentary', 'country_United_States', 'country_United_Kingdom',
       'country_Canada', 'country_France', 'country_Japan', 'country_Italy',
       'country_Germany', 'country_India', 'country_Hong_Kong',
       'country_Spain', 'country_South_Korea', 'country_Australia',
       'country_Sweden', 'country_Denmark', 'c

In [168]:
variables = imdb_frequency_df.columns[1:-500]

In [167]:
imdb_frequency_df.columns[1:-500].shape

(63,)

In [170]:
imdb_frequency_df[variables].isna().sum().sum()/imdb_frequency_df[variables].size

0.015238786191300199

In [171]:
X = imdb_frequency_df[variables]

In [172]:
Y = imdb_frequency_df["doctor"]

In [178]:
X = sm.add_constant(X)

In [187]:
ols = sm.OLS(Y, X, missing="drop")

In [188]:
results = ols.fit()

In [189]:
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 doctor   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     1.571
Date:                Wed, 17 Feb 2021   Prob (F-statistic):            0.00274
Time:                        10:09:14   Log-Likelihood:                 33983.
No. Observations:                5744   AIC:                        -6.784e+04
Df Residuals:                    5680   BIC:                        -6.741e+04
Df Model:                          63                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0

In [190]:
imdb_frequency_df.columns[:-500]

Index(['imdb_ID', 'year', 'kind_movie', 'kind_tv_series', 'kind_episode',
       'kind_tv_short', 'kind_tv_mini_series', 'kind_tv_movie',
       'kind_video_movie', 'kind_video_game', 'genre_Adult', 'genre_Fantasy',
       'genre_Game_Show', 'genre_Action', 'genre_Adventure', 'genre_News',
       'genre_Film_Noir', 'genre_Short', 'genre_Western', 'genre_Thriller',
       'genre_Horror', 'genre_Crime', 'genre_Sci_Fi', 'genre_Comedy',
       'genre_Mystery', 'genre_Talk_Show', 'genre_Family', 'genre_Animation',
       'genre_Musical', 'genre_War', 'genre_Reality_TV', 'genre_Music',
       'genre_History', 'genre_Sport', 'genre_Biography', 'genre_Romance',
       'genre_Drama', 'genre_Documentary', 'country_United_States',
       'country_United_Kingdom', 'country_Canada', 'country_France',
       'country_Japan', 'country_Italy', 'country_Germany', 'country_India',
       'country_Hong_Kong', 'country_Spain', 'country_South_Korea',
       'country_Australia', 'country_Sweden', 'country_D

In [191]:
basterds_movie = get_movies(["0361748"])["0361748"]

In [192]:
basterds_movie

<Movie id:0361748[http] title:_Inglourious Basterds (2009)_>

In [193]:
basterds_movie.keys()

['cast',
 'genres',
 'runtimes',
 'countries',
 'country codes',
 'language codes',
 'color info',
 'aspect ratio',
 'sound mix',
 'box office',
 'certificates',
 'original air date',
 'rating',
 'votes',
 'cover url',
 'plot outline',
 'languages',
 'title',
 'year',
 'kind',
 'directors',
 'writers',
 'producers',
 'cinematographers',
 'editors',
 'editorial department',
 'production designers',
 'art directors',
 'set decorators',
 'costume designers',
 'make up department',
 'production managers',
 'assistant directors',
 'art department',
 'sound department',
 'special effects',
 'visual effects',
 'stunts',
 'camera department',
 'casting department',
 'costume departmen',
 'location management',
 'music department',
 'script department',
 'transportation department',
 'miscellaneous',
 'thanks',
 'akas',
 'writer',
 'director',
 'top 250 rank',
 'production companies',
 'distributors',
 'special effects companies',
 'other companies',
 'plot',
 'synopsis',
 'canonical title',
 '

In [194]:
basterds_movie["kind"]

'movie'

In [195]:
len(professions)

500

In [196]:
sum([len(profession.split()) == 1 for profession in professions])

409

In [197]:
soc_df = pd.read_csv("data/gazetteer/soc.csv", index_col=None)

In [198]:
soc_df.head()

Unnamed: 0,2018 SOC Code,2018 SOC Title,2018 SOC Direct Match Title,Illustrative Example
0,11-1011,Chief Executives,Admiral,x
1,11-1011,Chief Executives,CEO,
2,11-1011,Chief Executives,Chief Executive Officer,
3,11-1011,Chief Executives,Chief Financial Officer,x
4,11-1011,Chief Executives,Chief Operating Officer,x


In [199]:
soc_df["2018 SOC Direct Match Title"].dropna().unique().size

6520

In [203]:
(soc_df["2018 SOC Direct Match Title"].dropna().drop_duplicates().str.split().agg(len) == 1).sum()

426

In [204]:
sum([len(profession.split()) == 1 for profession in professions])

409

In [205]:
mentionsI = pd.read_csv("data/mentions/mentions.word_filtered.csv", index_col=None)
mentionsII = pd.read_csv("data/mentions/mentions.word_filtered.prediction_added.csv", index_col=None)
mentionsIII = pd.read_csv("data/mentions/mentions.word_filtered.sense_filtered.csv", index_col=None)
mentionsIV = pd.read_csv("data/mentions/mentions.word_filtered.sense_filtered.soc_mapped.merged.csv", index_col=None)

  interactivity=interactivity, compiler=compiler, result=result)


In [206]:
mentionsI.shape, mentionsII.shape, mentionsIII.shape, mentionsIV.shape

((4462624, 13), (4462624, 16), (3657827, 16), (3657827, 19))

In [207]:
mentionsII["is_nopos_profession"].sum()

3657827

In [208]:
mentionsII["is_profession"].sum()

3679183

In [209]:
mentionsII.columns

Index(['profession', 'imdb', 'sent', 'rsi', 'left', 'mention', 'right',
       'start', 'end', 'pos', 'ner', 'sense', 'no_pos_sense', 'is_profession',
       'is_nopos_profession', 'is_person'],
      dtype='object')

In [210]:
mentionsII["profession"]

0             1st lieutenant
1             1st lieutenant
2             1st lieutenant
3             1st lieutenant
4             1st lieutenant
                 ...        
4462619            zoologist
4462620            zoologist
4462621    zoology professor
4462622    zoology professor
4462623    zoology professor
Name: profession, Length: 4462624, dtype: object

In [212]:
mentions_df.shape

(3657827, 19)

In [213]:
mentions_df["left"] = mentions_df["left"].fillna("").str.strip()
mentions_df["right"] = mentions_df["right"].fillna("").str.strip()
mentions_df["mention"] = mentions_df["mention"].str.strip()
data = mentions_df[["left","mention","right"]].copy().values

In [214]:
data

array([['', '1st Lieutenant', 'Evan Connors .'],
       ['', '1st Lieutenant', 'Hegger .'],
       ['', '1st Lieutenant', 'Hegger will lead the operation .'],
       ...,
       ["You two may be wondering that , though I 'm a",
        'zoology professor', '. .'],
       ['A boy first advocated it , then a', 'zoology professor',
        'dictated it .'],
       ['I wish to inquire what possible relevance the testimony of a',
        'zoology professor', 'can have in this trial']], dtype=object)

In [215]:
data.shape

(3657827, 3)

In [217]:
len(data)

3657827

In [228]:
subdata = data[:1000]

In [229]:
res = np.array_split(subdata, len(subdata)//64 + 1)

In [230]:
len(res)

16

In [231]:
for x in res:
    print(x.shape)

(63, 3)
(63, 3)
(63, 3)
(63, 3)
(63, 3)
(63, 3)
(63, 3)
(63, 3)
(62, 3)
(62, 3)
(62, 3)
(62, 3)
(62, 3)
(62, 3)
(62, 3)
(62, 3)


In [232]:
subdata[990:1020]

array([['You know , I think that bizarro', 'accountant',
        'is greasing his own palms .'],
       ['Probably two', 'accountants', 'in the safe area itself .'],
       ['The', 'accountant', 'is arriving soon .'],
       ["I 'll leave him to you , I 'm off to the", 'accountant', '.'],
       ["Carmine O ' Brien is an", 'accountant', 'from St. Louis .'],
       ['Frederic ,', 'accountant', '.'],
       ['- Now , after we see the', 'accountants', ','],
       ['" According to my', 'accountant',
        ", you , my children , have for some time now exhausted every rightful claim to the estate I have administered since your mother 's death ."],
       ['- Sir ,', 'Accountant', '...'],
       ['With clutches like yours you were an', 'accountant', '?']],
      dtype=object)