In [36]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter
import itertools
import statsmodels.formula.api as smf

In [2]:
# Load the profession-level and SOC-level media attribute tables in one pass.
profession_media_df, soc_media_df = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in (
        '../data/analysis_data/profession.media_attribute.csv',
        '../data/analysis_data/soc.media_attribute.csv',
    )
)

In [3]:
# Display the profession-level table loaded above.
profession_media_df

Unnamed: 0,profession,year,kind,genres,countries,n_titles,n_mentions,n_total_mentions,n_pos_mentions,n_neg_mentions
0,accountant,1950,movie,Adventure;Drama,Italy,1,15,7219,1,0
1,accountant,1950,movie,Comedy;Drama;Romance,Greece,1,1,12555,0,0
2,accountant,1950,movie,Comedy;Drama;Romance,United States,1,2,18053,0,0
3,accountant,1950,movie,Comedy;Romance,United States,1,1,16417,0,0
4,accountant,1950,movie,Crime,West Germany,1,1,13039,0,0
...,...,...,...,...,...,...,...,...,...,...
830318,writer,2017,tv movie,Documentary;Music,United States,1,2,10737,2,0
830319,writer,2017,tv movie,Drama;Thriller,United States,1,1,10305,1,0
830320,writer,2017,tv movie,Thriller,Canada,1,4,9485,1,1
830321,writer,2017,tv series,Drama,United States,1,5,6742,3,0


In [4]:
# Distinct profession names, in order of first appearance in the table.
professions = profession_media_df['profession'].unique()

In [5]:
# Number of distinct professions.
len(professions)

500

In [6]:
# Word count (whitespace-separated tokens) of every profession name.
profession_ngram_sizes = np.array(list(map(len, map(str.split, professions))))

In [8]:
# Tally how many profession names have each word count.
Counter(profession_ngram_sizes)

Counter({1: 409, 2: 86, 3: 4, 5: 1})

In [19]:
def find_ntitles(columns, df=None, min_titles=30):
    """Count the (profession, *columns) groups that are backed by enough titles.

    Parameters
    ----------
    columns : iterable of str
        Attribute columns to group by in addition to ``profession``.
        Lists and tuples are both accepted.
    df : pandas.DataFrame, optional
        Table holding ``profession``, ``n_titles`` and the requested columns.
        When omitted, the notebook-level ``profession_media_df`` is used.
    min_titles : int, default 30
        Minimum summed ``n_titles`` a group needs in order to be counted.

    Returns
    -------
    int
        Number of groups whose total ``n_titles`` is at least ``min_titles``.
    """
    if df is None:
        df = profession_media_df
    # list(...) lets callers pass tuples, e.g. straight from itertools.combinations.
    group_keys = ['profession'] + list(columns)
    # dropna=False keeps rows with a missing attribute as a group of their own.
    # Calling .sum() directly avoids handing the builtin `sum` to agg, which
    # newer pandas versions warn about.
    titles_per_group = df.groupby(group_keys, dropna=False)['n_titles'].sum()
    return int((titles_per_group >= min_titles).sum())

In [39]:
# Sweep over every non-empty combination of the attribute columns and, for each,
# aggregate the table per (profession, combination) keeping only well-supported groups.
columns = ['year','kind','genres','countries']
min_titles = 30  # a group must cover at least this many titles to be kept

columns_list = []  # grouping columns used for each entry of `dfs`
dfs = []           # filtered aggregate table for each grouping

# Derive the upper bound from `columns` so adding an attribute needs no other change.
for i in range(1, len(columns) + 1):
    for column_tuple in itertools.combinations(columns, i):
        column_list = list(column_tuple)
        column_text = ' '.join(column_list)

        # numeric_only=True: sum only the numeric columns. Without it, newer
        # pandas also "sums" the string attributes that are not grouping keys
        # by concatenating them, which is slow and meaningless.
        df = (
            profession_media_df
            .groupby(['profession'] + column_list, dropna=True)
            .sum(numeric_only=True)
        )
        df = df[df['n_titles'] >= min_titles]

        n_samples = len(df)

        print('{:50s} #samples = {:5d}'.format(column_text, n_samples))

        columns_list.append(column_list)
        dfs.append(df)

year                                               #samples =  8107
kind                                               #samples =  1622
genres                                             #samples =  6363
countries                                          #samples =  3054
year kind                                          #samples =  8556
year genres                                        #samples =  2756
year countries                                     #samples =  4595
kind genres                                        #samples =  6172
kind countries                                     #samples =  3798
genres countries                                   #samples =  3582
year kind genres                                   #samples =  1904
year kind countries                                #samples =  3692
year genres countries                              #samples =   657
kind genres countries                              #samples =  3232
year kind genres countries                      

In [56]:
# Continue with the last grouping produced by the sweep and its column list.
df, variables = dfs[-1], columns_list[-1]

In [72]:
# Report how many distinct values each index level takes in the selected table.
index_levels = df.index
for var in [*variables, 'profession']:
    values = index_levels.get_level_values(var).unique()
    print('{:20s} #values = {:3d}'.format(var, values.size))

year                 #values =  40
kind                 #values =   2
genres               #values =  25
countries            #values =   4
profession           #values =  35


In [69]:
# Move the grouping keys out of the index into ordinary columns.
rdf = df.reset_index()

In [70]:
# Number of distinct professions left in the flattened table.
rdf.profession.unique().size

35

In [71]:
# List the distinct professions left in the flattened table.
rdf.profession.unique()

array(['attorney', 'captain', 'chef', 'commander', 'commissioner', 'cook',
       'cop', 'counsellor', 'detective', 'director', 'district attorney',
       'doctor', 'guard', 'hacker', 'inspector', 'judge', 'lawyer',
       'lieutenant', 'manager', 'mayor', 'minister', 'nurse', 'officer',
       'operative', 'police', 'president', 'professor', 'ranger',
       'secretary', 'sergeant', 'sheriff', 'soldier', 'surgeon',
       'teacher', 'therapist'], dtype=object)

In [76]:
# Row count per profession, then summary statistics of those counts.
nsamples_per_group = rdf.groupby('profession').size().to_numpy()
median_size, mean_size, std_size = (
    stat(nsamples_per_group) for stat in (np.median, np.mean, np.std)
)
print('median #samples/group = {:.3f}'.format(median_size))
print('mean #samples/group = {:.3f}'.format(mean_size))
print('std.dev #samples/group = {:.3f}'.format(std_size))

median #samples/group = 4.000
mean #samples/group = 15.229
std.dev #samples/group = 26.969


In [77]:
# Rows per profession, labelled like the n_mentions column for display.
rdf.groupby('profession').size().rename('n_mentions')

profession
attorney              24
captain               16
chef                   3
commander              4
commissioner           2
cook                   1
cop                   53
counsellor             3
detective             32
director               4
district attorney      8
doctor               142
guard                  3
hacker                 1
inspector              6
judge                 16
lawyer                44
lieutenant             8
manager                2
mayor                  1
minister               1
nurse                  6
officer               45
operative              1
police                63
president             10
professor              1
ranger                 3
secretary              3
sergeant               5
sheriff                1
soldier                3
surgeon                8
teacher                8
therapist              2
Name: n_mentions, dtype: int64

In [78]:
# How many professions have at least 10 rows.
rdf.groupby('profession').size().ge(10).sum()

10

In [84]:
# Display the flattened table.
rdf

Unnamed: 0,profession,year,kind,genres,countries,n_titles,n_mentions,n_total_mentions,n_pos_mentions,n_neg_mentions
0,attorney,1958,episode,Crime;Drama;Mystery,United States,31,145,250969,10,0
1,attorney,2004,episode,Crime;Drama;Mystery;Thriller,United States,30,59,218784,3,3
2,attorney,2005,episode,Crime;Drama;Mystery;Thriller,United States,35,62,253541,2,0
3,attorney,2006,episode,Crime;Drama;Mystery;Thriller,United States,50,120,361678,9,2
4,attorney,2007,episode,Crime;Drama;Mystery,United States,44,154,321749,11,8
...,...,...,...,...,...,...,...,...,...,...
528,teacher,2013,episode,Comedy,United States,31,52,156560,9,4
529,teacher,2014,episode,Comedy,United States,38,73,183644,10,10
530,teacher,2016,episode,Comedy,United States,37,112,170925,18,12
531,therapist,2011,episode,Comedy;Drama;Romance,United States,33,72,253658,10,5


In [85]:
# freq: share of all mentions that refer to the profession.
# sentiment: share of positive mentions among positive + negative mentions.
# NOTE(review): the 1e-23 term only matters when there are no positive or negative
# mentions, in which case sentiment evaluates to 0.0 rather than NaN — confirm this is intended.
n_polar_mentions = rdf['n_pos_mentions'] + rdf['n_neg_mentions']
rdf['freq'] = rdf['n_mentions'].div(rdf['n_total_mentions'])
rdf['sentiment'] = rdf['n_pos_mentions'].div(n_polar_mentions + 1e-23)

In [86]:
# Display the table again, now including the freq and sentiment columns.
rdf

Unnamed: 0,profession,year,kind,genres,countries,n_titles,n_mentions,n_total_mentions,n_pos_mentions,n_neg_mentions,freq,sentiment
0,attorney,1958,episode,Crime;Drama;Mystery,United States,31,145,250969,10,0,0.000578,1.000000
1,attorney,2004,episode,Crime;Drama;Mystery;Thriller,United States,30,59,218784,3,3,0.000270,0.500000
2,attorney,2005,episode,Crime;Drama;Mystery;Thriller,United States,35,62,253541,2,0,0.000245,1.000000
3,attorney,2006,episode,Crime;Drama;Mystery;Thriller,United States,50,120,361678,9,2,0.000332,0.818182
4,attorney,2007,episode,Crime;Drama;Mystery,United States,44,154,321749,11,8,0.000479,0.578947
...,...,...,...,...,...,...,...,...,...,...,...,...
528,teacher,2013,episode,Comedy,United States,31,52,156560,9,4,0.000332,0.692308
529,teacher,2014,episode,Comedy,United States,38,73,183644,10,10,0.000398,0.500000
530,teacher,2016,episode,Comedy,United States,37,112,170925,18,12,0.000655,0.600000
531,therapist,2011,episode,Comedy;Drama;Romance,United States,33,72,253658,10,5,0.000284,0.666667


In [88]:
# Mixed linear model of mention frequency on the media attributes,
# with observations grouped by profession.
freq_formula = 'freq ~ year + kind + genres + countries'
freq_mixedlm = smf.mixedlm(freq_formula, data=rdf, groups=rdf['profession'])
model = freq_mixedlm.fit()



In [89]:
# Render the fitted model's summary tables.
model.summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,freq
No. Observations:,533,Method:,REML
No. Groups:,35,Scale:,0.0000
Min. group size:,1,Likelihood:,3086.8785
Max. group size:,142,Converged:,Yes
Mean group size:,15.2,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.018,0.007,2.613,0.009,0.004,0.031
kind[T.movie],-0.001,0.000,-2.615,0.009,-0.001,-0.000
genres[T.Action;Adventure;Drama;Sci-Fi],0.000,0.000,0.022,0.982,-0.001,0.001
genres[T.Action;Adventure;Family;Fantasy;Sci-Fi],0.007,0.001,7.142,0.000,0.005,0.009
genres[T.Action;Adventure;Sci-Fi],0.002,0.000,4.438,0.000,0.001,0.003
genres[T.Action;Crime;Drama],-0.000,0.000,-0.601,0.548,-0.001,0.001
genres[T.Action;Crime;Drama;Mystery;Sci-Fi;Thriller],0.000,0.001,0.125,0.901,-0.001,0.001
genres[T.Action;Crime;Drama;Mystery;Thriller],-0.000,0.000,-0.427,0.669,-0.001,0.001
genres[T.Action;Crime;Drama;Thriller],-0.000,0.000,-0.396,0.692,-0.001,0.001


In [90]:
# (rows, columns) of the full profession-level table.
profession_media_df.shape

(830323, 10)

In [91]:
# Display the full profession-level table again before further processing.
profession_media_df

Unnamed: 0,profession,year,kind,genres,countries,n_titles,n_mentions,n_total_mentions,n_pos_mentions,n_neg_mentions
0,accountant,1950,movie,Adventure;Drama,Italy,1,15,7219,1,0
1,accountant,1950,movie,Comedy;Drama;Romance,Greece,1,1,12555,0,0
2,accountant,1950,movie,Comedy;Drama;Romance,United States,1,2,18053,0,0
3,accountant,1950,movie,Comedy;Romance,United States,1,1,16417,0,0
4,accountant,1950,movie,Crime,West Germany,1,1,13039,0,0
...,...,...,...,...,...,...,...,...,...,...
830318,writer,2017,tv movie,Documentary;Music,United States,1,2,10737,2,0
830319,writer,2017,tv movie,Drama;Thriller,United States,1,1,10305,1,0
830320,writer,2017,tv movie,Thriller,Canada,1,4,9485,1,1
830321,writer,2017,tv series,Drama,United States,1,5,6742,3,0


In [92]:
# Independent copy of the loaded table for the steps that follow.
prof_df = profession_media_df.copy()

In [93]:
# Row count for every value of the kind column.
Counter(prof_df['kind'])

Counter({'movie': 455840,
         'episode': 291473,
         'tv mini series': 9972,
         'tv movie': 33953,
         'tv short': 1673,
         'tv series': 15023,
         'video movie': 18038,
         nan: 3266,
         'video game': 1085})

In [94]:
# Row count for every value of the genres column.
Counter(prof_df['genres'])

Counter({'Adventure;Drama': 1586,
         'Comedy;Drama;Romance': 18100,
         'Comedy;Romance': 17797,
         'Crime': 1462,
         'Crime;Drama;Film-Noir;Romance;Thriller': 151,
         'Documentary;Short': 2785,
         'Drama': 62584,
         'Comedy': 47640,
         'Comedy;Drama': 29857,
         'Comedy;Drama;Fantasy': 1475,
         'Crime;Drama;Mystery': 16661,
         'Crime;Drama;Mystery;Thriller': 19657,
         'Drama;Film-Noir;Romance': 22,
         'Crime;Drama;Film-Noir': 424,
         'Comedy;Family': 4202,
         'Adventure;Western': 242,
         'Crime;Drama;Film-Noir;Thriller': 588,
         'Crime;Horror;Sci-Fi;Thriller': 56,
         'Drama;Film-Noir': 219,
         'Drama;Romance': 24567,
         'Comedy;Crime;Drama;Horror;Mystery;Thriller': 535,
         'Comedy;Musical;Romance': 1019,
         'Drama;Family': 2696,
         'Adventure;Drama;War': 384,
         'Drama;Romance;Musical': 22,
         'Action;Mystery': 67,
         'Comedy;Crime':

In [95]:
# Number of rows with a missing kind.
prof_df['kind'].isnull().sum()

3266

In [96]:
# Number of rows with missing genres.
prof_df['genres'].isnull().sum()

5707

In [97]:
# Number of rows with missing countries.
prof_df['countries'].isnull().sum()

110112

In [None]:
# NOTE(review): unfinished cell — this only references the `drop` method without
# calling it, so nothing is removed from the table. Complete or delete it.
profession_media_df.drop