In [1]:
import os
import pandas as pd
import numpy as np
import janitor
from tableone import TableOne

# CSV of individual-level browsing data loaded by the prep cell below.
# Relative path: it resolves against the kernel's working directory.
DATAPATH = "../../data/individual_browsing_data.csv"

In [6]:
# (fold cell) Prep data
# Build the respondent-level frame used by the tables below: flag respondents
# without browsing data, recode numeric survey codes into labelled columns, and
# give the outcome columns presentation-ready names.


def _recode(source, labels):
    """Return an ``assign`` callable that maps ``df[source]`` codes to labels.

    Codes that are not keys of ``labels`` (missing values included) become NaN.
    """
    return lambda df: df[source].map(labels)


def _valid_pid7(df):
    """Return ``pid7`` with -1 and anything above 7 blanked to NaN."""
    pid7 = df["pid7"]
    return pid7.mask((pid7 == -1) | (pid7 > 7))


def _label_presvote(df):
    """Label ``presvote20post``: 1 -> Biden, 2 -> Trump, -1 -> NaN, rest -> other."""
    labels = {1: "Vote Biden", 2: "Vote Trump", -1: np.nan}
    # NOTE(review): an earlier draft also blanked code 6; it currently falls
    # into "Other/No vote" like every other unlisted code -- confirm intended.
    return df["presvote20post"].map(lambda code: labels.get(code, "Other/No vote"))


df_ind = (
    pd.read_csv(DATAPATH)
    # Respondents whose `duration` is missing are the "No data" group.
    .assign(nob=lambda df: np.where(df["duration"].isna(), "No data", "In sample"))
    # Divide by 60 so the column matches its "Minutes" label below
    # (assumes the raw column is in seconds -- TODO confirm).
    .assign(duration_adult=lambda df: df["duration_adult"] / 60)
    # Labelled covariates, appended in the order the tables list them.
    .assign(**{
        "Party (7-point)": _valid_pid7,
        "Gender": _recode("gender", {1: "Male", 2: "Female"}),
        "Race": _recode(
            "race2",
            {1: "White", 2: "Black", 3: "Hispanic", 4: "Asian", 5: "Others"},
        ),
        "Education": _recode(
            "educ2",
            {1: "No HS", 2: "HS", 3: "Some college", 4: "College"},
        ),
        "Region": _recode(
            "region",
            {1: "Northeast", 2: "Midwest", 3: "South", 4: "West"},
        ),
        "2020 Pres. election": _label_presvote,
        "Consume porn": _recode("yes_visit_adults", {1: "Yes", 0: "No"}),
    })
    # Presentation names; errors="raise" keeps the fail-fast behaviour if any
    # expected source column is absent from the CSV.
    .rename(
        columns={
            "age": "Age",
            "duration_adult": "Minutes",
            "prop_adult_duration": "% of time",
            "visits_adult": "Visits",
            "prop_adult_visits": "% of visits",
            "party": "Party",
        },
        errors="raise",
    )
)
df_ind.head(3)

Unnamed: 0,caseid,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,...,age2,educ2,nob,Party (7-point),Gender,Race,Education,Region,2020 Pres. election,Consume porn
0,200661421,1963,2,1,4,3,3,1,39,2,...,3600,4,In sample,3.0,Female,White,College,Midwest,Vote Biden,No
1,200686597,1992,2,6,5,5,8,-1,48,3,...,961,4,In sample,,Female,Others,College,South,,Yes
2,200953869,1959,2,1,5,2,7,2,42,1,...,4096,4,In sample,7.0,Female,White,College,Northeast,Vote Trump,No


In [7]:
# (fold cell) Individuals with no browsing data (dropped from sample) vs others
# Covariate balance table, split by the `nob` flag created in the prep cell.
covariates = ["Party (7-point)", "Party", "2020 Pres. election",
              "Age", "Gender", "Race", "Education", "Region"]

# Report p-values and standardized mean differences between the two groups.
# (tableone's `htest_name=True` option is left off here.)
tab_covariates = TableOne(df_ind, columns=covariates, groupby="nob",
                          pval=True, smd=True)
tab_covariates

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by nob,Grouped by nob,Grouped by nob,Grouped by nob,Grouped by nob,Grouped by nob
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,In sample,No data,P-Value,"SMD (In sample,No data)"
n,,,1200,1135,65,,
"Party (7-point), mean (SD)",,120.0,3.6 (2.2),3.6 (2.1),3.9 (2.3),0.33,0.133
"Party, n (%)",D,120.0,530 (49.1),501 (49.2),29 (47.5),0.656,0.122
"Party, n (%)",I,,194 (18.0),185 (18.2),9 (14.8),,
"Party, n (%)",R,,356 (33.0),333 (32.7),23 (37.7),,
"2020 Pres. election, n (%)",Other/No vote,170.0,270 (26.2),255 (26.3),15 (25.4),0.987,0.022
"2020 Pres. election, n (%)",Vote Biden,,419 (40.7),395 (40.7),24 (40.7),,
"2020 Pres. election, n (%)",Vote Trump,,341 (33.1),321 (33.1),20 (33.9),,
"Age, mean (SD)",,0.0,49.5 (18.1),49.6 (18.0),47.7 (20.5),0.452,-0.102
"Gender, n (%)",Female,0.0,635 (52.9),596 (52.5),39 (60.0),0.294,0.151
