In [1]:
import os
import pandas as pd
import janitor
import numpy as np
pd.set_option('display.max_columns', 100)

from utils.data_utils import pandas_to_tex

DATAPATH = "../data/individual_browsing_data_piedomains.csv"
FIGSAVEDIR = "../figs"

# Load the individual-level browsing data and rescale the adult / non-adult
# duration columns from seconds to hours. Only these two columns are rescaled;
# the total `duration` column is left exactly as read from the CSV.
df_ind = pd.read_csv(DATAPATH).assign(
    **{
        col: (lambda df, col=col: df[col] / 3600)
        for col in ("duration_adult", "duration_nonadult")
    }
)
df_ind

Unnamed: 0,caseid,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,party,visits_adult,visits_nonadult,visits,prop_adult_visits,rep,duration_adult,duration_nonadult,duration,prop_adult_duration,yes_visit_adults,race2,cohort,age,age2,educ2
0,200661421,1963,2,1,4,3,3,1,39,2,D,0.0,1728.0,1728.0,0.000000,0.0,0.000000,21.567222,77642.0,0.000000,0.0,1,1960,60,3600,4
1,200686597,1992,2,6,5,5,8,-1,48,3,,178.0,4059.0,4237.0,4.201086,,5.727778,66.535556,260148.0,7.926257,1.0,5,1990,31,961,4
2,200953869,1959,2,1,5,2,7,2,42,1,R,0.0,20.0,20.0,0.000000,1.0,0.000000,0.328333,1182.0,0.000000,0.0,1,1960,64,4096,4
3,201302005,1966,2,2,3,5,8,1,12,3,,20.0,527.0,547.0,3.656307,,1.712778,6.626389,30021.0,20.538956,1.0,2,1970,57,3249,3
4,201590505,1977,1,4,5,3,3,1,6,4,D,0.0,11086.0,11086.0,0.000000,0.0,0.000000,106.433611,383161.0,0.000000,0.0,4,1980,46,2116,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1802885,1935,1,1,6,4,5,2,12,3,R,,,,,1.0,,,,,,1,1940,88,7744,4
1196,1809723,1943,1,1,2,1,1,1,51,3,D,0.0,9805.0,9805.0,0.000000,0.0,0.000000,353.503611,1272613.0,0.000000,0.0,1,1940,80,6400,2
1197,1827351,1947,1,1,5,3,4,1,31,2,I,0.0,543.0,543.0,0.000000,,0.000000,6.646944,23929.0,0.000000,0.0,1,1950,76,5776,4
1198,1924249,1980,1,7,4,2,7,2,6,4,R,28.0,21533.0,21561.0,0.129864,1.0,0.171111,86.487222,311970.0,0.197455,1.0,5,1980,43,1849,4


### Distribution of visit duration on adult sites, by partisanship

In [2]:
# Outcome analysed in this section: the `duration_adult` column, which the
# loading cell rescales from seconds to hours.
outcome_var = "duration_adult"

In [3]:
# (fold cell) Prep data
# Restrict to respondents with a strictly positive outcome and tag each one
# with its decile (labelled 10, 20, ..., 100) within that restricted sample.
# NOTE(review): ten fixed labels are passed together with duplicates="drop";
# if pandas ever drops a duplicated bin edge the label count would presumably
# no longer match the number of bins -- confirm this cannot happen for the data.
qcut_opts = dict(q=10, precision=1, labels=range(10,101,10), duplicates="drop")

_df = df_ind.loc[df_ind[outcome_var] > 0].pipe(
    lambda positive: positive.assign(
        **{f"decile_{outcome_var}": pd.qcut(positive[outcome_var], **qcut_opts)}
    )
)
print(f"N = {len(_df)}")
_df.head(3)

N = 385


Unnamed: 0,caseid,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,party,visits_adult,visits_nonadult,visits,prop_adult_visits,rep,duration_adult,duration_nonadult,duration,prop_adult_duration,yes_visit_adults,race2,cohort,age,age2,educ2,decile_duration_adult
1,200686597,1992,2,6,5,5,8,-1,48,3,,178.0,4059.0,4237.0,4.201086,,5.727778,66.535556,260148.0,7.926257,1.0,5,1990,31,961,4,90
3,201302005,1966,2,2,3,5,8,1,12,3,,20.0,527.0,547.0,3.656307,,1.712778,6.626389,30021.0,20.538956,1.0,2,1970,57,3249,3,80
5,201631749,1996,1,3,5,3,4,1,6,4,I,547.0,16921.0,17468.0,3.13144,,5.008333,118.046111,442996.0,4.070014,1.0,3,2000,27,729,4,90


In [4]:
# (fold cell) Get table of percentiles
# Quantile levels: every decile plus a finer grid in the upper tail.
ntiles = [.0,.1,.2,.3,.4,.5,.6,.7,.8,.9,.95,.96,.97,.98,.99,1]

# One two-column frame per party: the quantile level ("Percentile") and the
# value of the outcome at that level, suffixed with the party code.
_tab_R, _tab_D = (
    _df.loc[_df["party"] == party_code, outcome_var]
    .quantile(ntiles)
    .rename(f"{outcome_var}_{party_code}")
    .rename_axis("Percentile")
    .reset_index()
    for party_code in ("R", "D")
)

# Put the two parties side by side and render the values as strings with one
# decimal place and thousands separators for the exported table.
tab_perc = (
    _tab_R
    .merge(_tab_D, on="Percentile", how="left", validate="1:1")
    .pipe(lambda merged: merged.assign(**{
        col: merged[col].map('{:,.1f}'.format)
        for col in (f"{outcome_var}_R", f"{outcome_var}_D")
    }))
)
pandas_to_tex(tab_perc, "../tabs/piedomains_percentiles_duration_adultsites_by_individuals_by_party")
tab_perc

  tex_table = df.to_latex(index=index, header=False)


Unnamed: 0,Percentile,duration_adult_R,duration_adult_D
0,0.0,0.0,0.0
1,0.1,0.0,0.0
2,0.2,0.0,0.0
3,0.3,0.1,0.0
4,0.4,0.2,0.1
5,0.5,0.6,0.2
6,0.6,1.1,0.4
7,0.7,1.8,0.9
8,0.8,3.4,2.0
9,0.9,6.4,5.0


### Distribution of proportion of browsing duration on adult sites, by partisanship

In [5]:
# Outcome analysed in this section: the `prop_adult_duration` column.
# NOTE(review): the rows displayed by the loading cell suggest this column is
# expressed in percent (0-100) rather than as a 0-1 fraction -- confirm.
outcome_var = "prop_adult_duration"

In [6]:
# (fold cell) Prep data
# Restrict to respondents with a strictly positive outcome and tag each one
# with its decile (labelled 10, 20, ..., 100) within that restricted sample.
# NOTE(review): ten fixed labels are passed together with duplicates="drop";
# if pandas ever drops a duplicated bin edge the label count would presumably
# no longer match the number of bins -- confirm this cannot happen for the data.
qcut_opts = dict(q=10, precision=1, labels=range(10,101,10), duplicates="drop")

_df = df_ind.loc[df_ind[outcome_var] > 0].pipe(
    lambda positive: positive.assign(
        **{f"decile_{outcome_var}": pd.qcut(positive[outcome_var], **qcut_opts)}
    )
)
print(f"N = {len(_df)}")
_df.head(3)

N = 385


Unnamed: 0,caseid,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,party,visits_adult,visits_nonadult,visits,prop_adult_visits,rep,duration_adult,duration_nonadult,duration,prop_adult_duration,yes_visit_adults,race2,cohort,age,age2,educ2,decile_prop_adult_duration
1,200686597,1992,2,6,5,5,8,-1,48,3,,178.0,4059.0,4237.0,4.201086,,5.727778,66.535556,260148.0,7.926257,1.0,5,1990,31,961,4,80
3,201302005,1966,2,2,3,5,8,1,12,3,,20.0,527.0,547.0,3.656307,,1.712778,6.626389,30021.0,20.538956,1.0,2,1970,57,3249,3,90
5,201631749,1996,1,3,5,3,4,1,6,4,I,547.0,16921.0,17468.0,3.13144,,5.008333,118.046111,442996.0,4.070014,1.0,3,2000,27,729,4,70


In [7]:
# (fold cell) Get table of percentiles
# Quantile levels: every decile plus a finer grid in the upper tail.
ntiles = [.0,.1,.2,.3,.4,.5,.6,.7,.8,.9,.95,.96,.97,.98,.99,1]

# One two-column frame per party: the quantile level ("Percentile") and the
# value of the outcome at that level, suffixed with the party code.
_tab_R, _tab_D = (
    _df.loc[_df["party"] == party_code, outcome_var]
    .quantile(ntiles)
    .rename(f"{outcome_var}_{party_code}")
    .rename_axis("Percentile")
    .reset_index()
    for party_code in ("R", "D")
)

# Put the two parties side by side and render the values as strings with one
# decimal place and thousands separators for the exported table.
tab_perc = (
    _tab_R
    .merge(_tab_D, on="Percentile", how="left", validate="1:1")
    .pipe(lambda merged: merged.assign(**{
        col: merged[col].map('{:,.1f}'.format)
        for col in (f"{outcome_var}_R", f"{outcome_var}_D")
    }))
)
# NOTE(review): this file stem has no underscore between "piedomains" and
# "percentiles", unlike the duration table exported earlier in the notebook.
# Left unchanged because downstream documents may reference this exact name --
# confirm before renaming.
pandas_to_tex(tab_perc, "../tabs/piedomainspercentiles_proportion_duration_adultsites_by_individuals_by_party")
tab_perc

  tex_table = df.to_latex(index=index, header=False)


Unnamed: 0,Percentile,prop_adult_duration_R,prop_adult_duration_D
0,0.0,0.0,0.0
1,0.1,0.0,0.0
2,0.2,0.1,0.0
3,0.3,0.2,0.1
4,0.4,0.5,0.3
5,0.5,1.1,0.7
6,0.6,3.4,1.4
7,0.7,5.4,3.0
8,0.8,10.8,7.1
9,0.9,33.4,18.2


In [8]:
# Share of party "R" respondents with more than 1 hour on adult sites,
# among those whose adult-site duration is not missing.
party = "R"
df_ind.loc[df_ind["party"] == party, "duration_adult"].pipe(
    lambda hours: len(hours[hours > 1]) / len(hours.dropna())
)

0.13513513513513514

In [9]:
# Share of party "D" respondents with more than 1 hour on adult sites,
# among those whose adult-site duration is not missing.
party = "D"
df_ind.loc[df_ind["party"] == party, "duration_adult"].pipe(
    lambda hours: len(hours[hours > 1]) / len(hours.dropna())
)

0.10379241516966067

In [10]:
# Share of party "R" respondents with more than 1 hour on adult sites,
# among those with any (strictly positive) adult-site duration.
party = "R"
df_ind.loc[df_ind["party"] == party, "duration_adult"].pipe(
    lambda hours: len(hours[hours > 1]) / len(hours[hours > 0])
)

0.4205607476635514

In [11]:
# Share of party "D" respondents with more than 1 hour on adult sites,
# among those with any (strictly positive) adult-site duration.
party = "D"
df_ind.loc[df_ind["party"] == party, "duration_adult"].pipe(
    lambda hours: len(hours[hours > 1]) / len(hours[hours > 0])
)

0.29545454545454547