In [35]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange, notebook
from collections import Counter
from time import sleep
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import re

In [21]:
# Load the doctor.csv profession table; index_col=None means no CSV column
# is used as the row index.
doctor_df = pd.read_csv(
    filepath_or_buffer="../data/analysis_data/media_data/profession/doctor.csv",
    index_col=None,
)

In [22]:
# Preview the first rows of the freshly loaded frame.
doctor_df.head()

Unnamed: 0,year,kind,Genre:Action,Genre:Adult,Genre:Adventure,Genre:Animation,Genre:Biography,Genre:Comedy,Genre:Crime,Genre:Documentary,...,Country:Yugoslavia,Country:Zambia,Country:Zimbabwe,n_titles,n_total_mentions,n_mentions,n_pos_mentions,n_neg_mentions,freq,sentiment
0,1950,episode,0,0,0,0,0,0,0,0,...,0,0,0,2,18462,8,1,0,0.000433,1.0
1,1950,movie,0,0,0,0,0,0,0,0,...,0,0,0,7,71729,6,0,1,8.4e-05,0.0
2,1950,movie,0,0,0,0,0,0,0,0,...,0,0,0,1,7720,10,0,0,0.001295,0.0
3,1950,movie,0,0,0,0,0,0,0,0,...,0,0,0,1,648,0,0,0,0.0,0.0
4,1950,movie,0,0,0,0,0,0,0,0,...,0,0,0,3,27058,23,3,0,0.00085,1.0


In [40]:
# Replace every non-word character in the column names with "_" so the
# columns can be referenced by name inside the model formula string.
# The pattern is a raw string: "\W" in a normal string literal is an invalid
# escape sequence that Python warns about.
# Re-running this cell is harmless: already-sanitized names are left unchanged.
doctor_df.columns = [re.sub(r"\W", "_", name) for name in doctor_df.columns]

In [41]:
# Collect the genre and country columns by their name prefix.
genre_columns = [name for name in doctor_df.columns if name.startswith("Genre")]
country_columns = [name for name in doctor_df.columns if name.startswith("Country")]

In [42]:
# Inspect the genre column names that were selected by prefix.
genre_columns

['Genre_Action',
 'Genre_Adult',
 'Genre_Adventure',
 'Genre_Animation',
 'Genre_Biography',
 'Genre_Comedy',
 'Genre_Crime',
 'Genre_Documentary',
 'Genre_Drama',
 'Genre_Family',
 'Genre_Fantasy',
 'Genre_Film_Noir',
 'Genre_Game_Show',
 'Genre_History',
 'Genre_Horror',
 'Genre_Music',
 'Genre_Musical',
 'Genre_Mystery',
 'Genre_News',
 'Genre_Reality_TV',
 'Genre_Romance',
 'Genre_Sci_Fi',
 'Genre_Short',
 'Genre_Sport',
 'Genre_Talk_Show',
 'Genre_Thriller',
 'Genre_War',
 'Genre_Western']

In [43]:
# Inspect the country column names that were selected by prefix.
country_columns

['Country_Afghanistan',
 'Country_Albania',
 'Country_Algeria',
 'Country_American_Samoa',
 'Country_Andorra',
 'Country_Angola',
 'Country_Argentina',
 'Country_Armenia',
 'Country_Aruba',
 'Country_Australia',
 'Country_Austria',
 'Country_Azerbaijan',
 'Country_Bahamas',
 'Country_Bahrain',
 'Country_Bangladesh',
 'Country_Belarus',
 'Country_Belgium',
 'Country_Benin',
 'Country_Bermuda',
 'Country_Bhutan',
 'Country_Bolivia',
 'Country_Bosnia_and_Herzegovina',
 'Country_Botswana',
 'Country_Brazil',
 'Country_British_Virgin_Islands',
 'Country_Bulgaria',
 'Country_Burkina_Faso',
 'Country_Burma',
 'Country_Burundi',
 'Country_Cambodia',
 'Country_Cameroon',
 'Country_Canada',
 'Country_Cayman_Islands',
 'Country_Chad',
 'Country_Chile',
 'Country_China',
 'Country_Colombia',
 'Country_Congo',
 'Country_Costa_Rica',
 'Country_Croatia',
 'Country_Cuba',
 'Country_Cyprus',
 'Country_Czech_Republic',
 'Country_Czechoslovakia',
 'Country_Côte_d_Ivoire',
 'Country_Denmark',
 'Country_Do

In [44]:
# Re-check the frame after the rename: non-word characters in the column
# names are now "_".
doctor_df.head()

Unnamed: 0,year,kind,Genre_Action,Genre_Adult,Genre_Adventure,Genre_Animation,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Documentary,...,Country_Yugoslavia,Country_Zambia,Country_Zimbabwe,n_titles,n_total_mentions,n_mentions,n_pos_mentions,n_neg_mentions,freq,sentiment
0,1950,episode,0,0,0,0,0,0,0,0,...,0,0,0,2,18462,8,1,0,0.000433,1.0
1,1950,movie,0,0,0,0,0,0,0,0,...,0,0,0,7,71729,6,0,1,8.4e-05,0.0
2,1950,movie,0,0,0,0,0,0,0,0,...,0,0,0,1,7720,10,0,0,0.001295,0.0
3,1950,movie,0,0,0,0,0,0,0,0,...,0,0,0,1,648,0,0,0,0.0,0.0
4,1950,movie,0,0,0,0,0,0,0,0,...,0,0,0,3,27058,23,3,0,0.00085,1.0


In [45]:
# Build the model formula: freq explained by year, kind, and one categorical
# C(...) term per genre column and per country column.
formula = " + ".join([
    "freq ~ year + kind",
    " + ".join(f"C({genre})" for genre in genre_columns),
    " + ".join(f"C({country})" for country in country_columns),
])

In [47]:
# Binomial GLM on the mention frequency, with each row's variance weight
# taken from its n_total_mentions value.
model = smf.glm(
    formula=formula,
    data=doctor_df,
    family=sm.families.Binomial(),
    var_weights=doctor_df["n_total_mentions"],
)

In [48]:
# Fit the var-weighted binomial GLM.
result = model.fit()

In [49]:
# Print the plain-text regression summary of the var-weighted fit.
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                   freq   No. Observations:                43408
Model:                            GLM   Df Residuals:                    43192
Model Family:                Binomial   Df Model:                          215
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -2.9156e+06
Date:                Thu, 03 Jun 2021   Deviance:                   4.6520e+05
Time:                        15:44:45   Pearson chi2:                 8.17e+05
No. Iterations:                    32                                         
Covariance Type:            nonrobust                                         
                                                       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------

In [50]:
# AIC of the var-weighted fit.
result.aic

5831701.296242083

In [51]:
# BIC of the var-weighted fit.
# NOTE(review): this value is on a very different scale from the AIC shown
# for the same fit; statsmodels' GLM `bic` may be deviance-based rather than
# likelihood-based depending on version - confirm before comparing the two.
result.bic

3975.6234499410493

In [52]:
# Estimated coefficients of the var-weighted fit, indexed by term name.
result.params

Intercept                     17.450373
kind[T.movie]                 -0.359000
kind[T.tv mini series]        -0.444761
kind[T.tv movie]              -0.170026
kind[T.tv series]             -0.348517
                                ...    
C(Country_Yemen)[T.1]        -25.524087
C(Country_Yugoslavia)[T.1]     0.145589
C(Country_Zambia)[T.1]       -25.750032
C(Country_Zimbabwe)[T.1]      -0.618585
year                          -0.012558
Length: 216, dtype: float64

In [53]:
# Same formula, data and family as the weighted model, but without
# var_weights.
model2 = smf.glm(
    formula=formula,
    data=doctor_df,
    family=sm.families.Binomial(),
)

In [54]:
# Fit the unweighted variant of the model.
result2 = model2.fit()

In [55]:
# Print the plain-text regression summary of the unweighted fit.
print(result2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                   freq   No. Observations:                43408
Model:                            GLM   Df Residuals:                    43192
Model Family:                Binomial   Df Model:                          215
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -124.55
Date:                Thu, 03 Jun 2021   Deviance:                       34.418
Time:                        15:56:26   Pearson chi2:                     69.4
No. Iterations:                    23                                         
Covariance Type:            nonrobust                                         
                                                       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------

In [56]:
# P-values of the var-weighted fit (`result`), not of the unweighted
# `result2` fitted in the preceding cells.
result.pvalues

Intercept                      0.000000e+00
kind[T.movie]                  0.000000e+00
kind[T.tv mini series]        4.508608e-112
kind[T.tv movie]               3.536256e-63
kind[T.tv series]             9.527007e-105
                                  ...      
C(Country_Yemen)[T.1]          9.998471e-01
C(Country_Yugoslavia)[T.1]     3.883142e-05
C(Country_Zambia)[T.1]         9.998595e-01
C(Country_Zimbabwe)[T.1]       3.821689e-01
year                           0.000000e+00
Length: 216, dtype: float64

In [60]:
# Side-by-side table of each term's coefficient and p-value from the
# var-weighted fit.
coeff = pd.concat(
    [result.params, result.pvalues],
    axis=1,
    keys=["coefficient", "pvalue"],
)

In [61]:
# Preview the first rows of the coefficient / p-value table.
coeff.head()

Unnamed: 0,coefficient,pvalue
Intercept,17.450373,0.0
kind[T.movie],-0.359,0.0
kind[T.tv mini series],-0.444761,4.508608e-112
kind[T.tv movie],-0.170026,3.536256e-63
kind[T.tv series],-0.348517,9.527007e-105


In [63]:
# Allow up to 500 rows to be rendered so the full table is visible.
pd.set_option("display.max_rows", 500)
# Terms with p-value below 0.05, ordered from most negative to most positive
# coefficient.
coeff.loc[coeff["pvalue"] < 0.05].sort_values("coefficient")

Unnamed: 0,coefficient,pvalue
C(Country_Georgia)[T.1],-1.678664,3.951039e-05
C(Country_Jamaica)[T.1],-1.432489,0.01310637
C(Genre_Reality_TV)[T.1],-1.424785,0.0
C(Country_Bangladesh)[T.1],-1.25947,0.001147582
C(Country_Montenegro)[T.1],-1.221011,0.03559537
C(Country_Vietnam)[T.1],-1.195413,9.858299e-06
C(Country_Isle_Of_Man)[T.1],-1.157982,0.0001233123
C(Genre_Game_Show)[T.1],-1.14858,1.181097e-144
C(Country_Kazakhstan)[T.1],-1.004659,1.851715e-05
C(Country_Guadeloupe)[T.1],-0.961508,1.258523e-09


In [64]:
# Term names whose p-value is above 0.05.
non_significant_coefficients = coeff.loc[coeff["pvalue"] > 0.05].index

In [66]:
# Print each non-significant term without its "[T.1]" level suffix.
# str.rstrip("[T.1]") is not used here because it strips any trailing run of
# the characters "[", "T", ".", "1", "]" rather than the literal suffix, so a
# name such as "kind[T.tv movie]" would lose its closing bracket. The anchored
# regex removes only an exact trailing "[T.1]".
for c in non_significant_coefficients:
    print(re.sub(r"\[T\.1\]$", "", c))

C(Genre_Film_Noir)
C(Genre_War)
C(Country_American_Samoa)
C(Country_Andorra)
C(Country_Angola)
C(Country_Armenia)
C(Country_Azerbaijan)
C(Country_Bahrain)
C(Country_Belarus)
C(Country_Belgium)
C(Country_Benin)
C(Country_Bhutan)
C(Country_Botswana)
C(Country_British_Virgin_Islands)
C(Country_Bulgaria)
C(Country_Burma)
C(Country_Burundi)
C(Country_Cameroon)
C(Country_Chad)
C(Country_Congo)
C(Country_Costa_Rica)
C(Country_Czechoslovakia)
C(Country_Côte_d_Ivoire)
C(Country_El_Salvador)
C(Country_Estonia)
C(Country_Faroe_Islands)
C(Country_French_Polynesia)
C(Country_Gabon)
C(Country_Ghana)
C(Country_Greenland)
C(Country_Guinea)
C(Country_Guinea_Bissau)
C(Country_Hungary)
C(Country_Iraq)
C(Country_Israel)
C(Country_Jordan)
C(Country_Kenya)
C(Country_Kosovo)
C(Country_Kuwait)
C(Country_Kyrgyzstan)
C(Country_Laos)
C(Country_Latvia)
C(Country_Lebanon)
C(Country_Liberia)
C(Country_Libya)
C(Country_Madagascar)
C(Country_Malaysia)
C(Country_Malta)
C(Country_Martinique)
C(Country_Mauritania)
C(Cou

In [67]:
# Show the full formula string used for the fits above.
formula

'freq ~ year + kind + C(Genre_Action) + C(Genre_Adult) + C(Genre_Adventure) + C(Genre_Animation) + C(Genre_Biography) + C(Genre_Comedy) + C(Genre_Crime) + C(Genre_Documentary) + C(Genre_Drama) + C(Genre_Family) + C(Genre_Fantasy) + C(Genre_Film_Noir) + C(Genre_Game_Show) + C(Genre_History) + C(Genre_Horror) + C(Genre_Music) + C(Genre_Musical) + C(Genre_Mystery) + C(Genre_News) + C(Genre_Reality_TV) + C(Genre_Romance) + C(Genre_Sci_Fi) + C(Genre_Short) + C(Genre_Sport) + C(Genre_Talk_Show) + C(Genre_Thriller) + C(Genre_War) + C(Genre_Western) + C(Country_Afghanistan) + C(Country_Albania) + C(Country_Algeria) + C(Country_American_Samoa) + C(Country_Andorra) + C(Country_Angola) + C(Country_Argentina) + C(Country_Armenia) + C(Country_Aruba) + C(Country_Australia) + C(Country_Austria) + C(Country_Azerbaijan) + C(Country_Bahamas) + C(Country_Bahrain) + C(Country_Bangladesh) + C(Country_Belarus) + C(Country_Belgium) + C(Country_Benin) + C(Country_Bermuda) + C(Country_Bhutan) + C(Country_Boliv

In [71]:
# Reduced formula: the original formula with every non-significant term
# subtracted (" - term").
# The "[T.1]" level suffix is removed with an anchored regex instead of
# str.rstrip("[T.1]"), which strips a set of characters rather than the
# literal suffix and can mangle names that do not end in ")".
# Each term carries its own " - " prefix, so an empty list of non-significant
# terms leaves the formula unchanged instead of appending a dangling " - ".
new_formula = formula + "".join(
    " - " + re.sub(r"\[T\.1\]$", "", term)
    for term in non_significant_coefficients
)

In [72]:
# Show the reduced formula string.
new_formula

'freq ~ year + kind + C(Genre_Action) + C(Genre_Adult) + C(Genre_Adventure) + C(Genre_Animation) + C(Genre_Biography) + C(Genre_Comedy) + C(Genre_Crime) + C(Genre_Documentary) + C(Genre_Drama) + C(Genre_Family) + C(Genre_Fantasy) + C(Genre_Film_Noir) + C(Genre_Game_Show) + C(Genre_History) + C(Genre_Horror) + C(Genre_Music) + C(Genre_Musical) + C(Genre_Mystery) + C(Genre_News) + C(Genre_Reality_TV) + C(Genre_Romance) + C(Genre_Sci_Fi) + C(Genre_Short) + C(Genre_Sport) + C(Genre_Talk_Show) + C(Genre_Thriller) + C(Genre_War) + C(Genre_Western) + C(Country_Afghanistan) + C(Country_Albania) + C(Country_Algeria) + C(Country_American_Samoa) + C(Country_Andorra) + C(Country_Angola) + C(Country_Argentina) + C(Country_Armenia) + C(Country_Aruba) + C(Country_Australia) + C(Country_Austria) + C(Country_Azerbaijan) + C(Country_Bahamas) + C(Country_Bahrain) + C(Country_Bangladesh) + C(Country_Belarus) + C(Country_Belgium) + C(Country_Benin) + C(Country_Bermuda) + C(Country_Bhutan) + C(Country_Boliv

In [73]:
# Refit the var-weighted binomial GLM using the reduced formula.
model3 = smf.glm(
    formula=new_formula,
    data=doctor_df,
    family=sm.families.Binomial(),
    var_weights=doctor_df["n_total_mentions"],
)
result3 = model3.fit()

In [74]:
# Print the plain-text regression summary of the reduced-formula fit.
print(result3.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                   freq   No. Observations:                43408
Model:                            GLM   Df Residuals:                    43285
Model Family:                Binomial   Df Model:                          122
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -2.9158e+06
Date:                Thu, 03 Jun 2021   Deviance:                   4.6551e+05
Time:                        16:28:17   Pearson chi2:                 8.18e+05
No. Iterations:                    13                                         
Covariance Type:            nonrobust                                         
                                                     coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------

In [75]:
# Coefficient / p-value table for the reduced-formula fit.
coeff3 = pd.concat(
    [result3.params, result3.pvalues],
    axis=1,
    keys=["coefficient", "pvalue"],
)

In [76]:
# Preview the first rows of the reduced-fit coefficient table.
coeff3.head()

Unnamed: 0,coefficient,pvalue
Intercept,17.509118,0.0
kind[T.movie],-0.359865,0.0
kind[T.tv mini series],-0.448466,4.423779e-115
kind[T.tv movie],-0.170569,5.156798e-64
kind[T.tv series],-0.349991,6.052222000000001e-106


In [80]:
# Terms of the reduced fit with p-value below 0.05, ordered by coefficient.
coeff3.loc[coeff3["pvalue"] < 0.05].sort_values("coefficient")

Unnamed: 0,coefficient,pvalue
C(Country_Georgia)[T.1],-1.689491,3.513174e-05
C(Country_Jamaica)[T.1],-1.431773,0.01315175
C(Genre_Reality_TV)[T.1],-1.423627,0.0
C(Country_Bangladesh)[T.1],-1.313822,0.0005600414
C(Country_Montenegro)[T.1],-1.215706,0.03638527
C(Country_Vietnam)[T.1],-1.186197,1.055318e-05
C(Country_Isle_Of_Man)[T.1],-1.165276,0.0001114562
C(Genre_Game_Show)[T.1],-1.147845,1.2335840000000001e-144
C(Country_Kazakhstan)[T.1],-0.986512,2.196755e-05
C(Country_Guadeloupe)[T.1],-0.968353,9.457557e-10


In [83]:
# Keep rows with n_titles of at least 30 whose positive and negative mention
# counts sum to more than zero.
doctor_df2 = doctor_df.loc[
    (doctor_df["n_titles"] >= 30)
    & ((doctor_df["n_pos_mentions"] + doctor_df["n_neg_mentions"]) > 0)
]

In [90]:
# List the numeric columns that still vary within the filtered frame.
# numeric_only=True is passed explicitly: the frame also holds the string
# column `kind`, and relying on var() to drop it silently is deprecated
# behaviour that raises a TypeError on newer pandas. The variance is computed
# once and then filtered, instead of calling var() twice.
doctor_df2.var(numeric_only=True).loc[lambda variance: variance > 0].index

Index(['year', 'Genre_Action', 'Genre_Adventure', 'Genre_Animation',
       'Genre_Biography', 'Genre_Comedy', 'Genre_Crime', 'Genre_Documentary',
       'Genre_Drama', 'Genre_Family', 'Genre_Fantasy', 'Genre_Game_Show',
       'Genre_History', 'Genre_Horror', 'Genre_Music', 'Genre_Mystery',
       'Genre_News', 'Genre_Reality_TV', 'Genre_Romance', 'Genre_Sci_Fi',
       'Genre_Short', 'Genre_Sport', 'Genre_Thriller', 'Genre_War',
       'Genre_Western', 'Country_Canada', 'Country_United_Kingdom',
       'Country_United_States', 'n_titles', 'n_total_mentions', 'n_mentions',
       'n_pos_mentions', 'n_neg_mentions', 'freq', 'sentiment'],
      dtype='object')