In [1]:
import pandas as pd
# NOTE(review): `janitor` is never referenced by name below. It is presumably
# imported for its side effect of registering the `.rename_column` and
# `.case_when` DataFrame methods used in the data-prep chain -- do not remove.
import janitor
import numpy as np

# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)
from tableone import TableOne
# NOTE: a function with this same name is defined again later in this notebook
# (under the "Medians" header); once that cell runs, it rebinds this import.
from utils.data_utils import tableone_to_texfrag

# Relative path of the CSV that the data-prep cell reads.
DATAPATH = "../data/individual_browsing_data.csv"

In [2]:
# (fold cell) Prep data
# Read the individual-level browsing CSV and derive labelled, display-ready
# columns from the coded survey variables. Each `.case_when(...)` call below is
# written as (condition, value) pairs followed by one trailing fallback value,
# and writes its result to the new column named by `column_name`.
# NOTE(review): `.rename_column` / `.case_when` are presumably provided by
# pyjanitor (`import janitor`); confirm the positional-fallback form is still
# supported by the installed version.
df_ind = (
    pd.read_csv(DATAPATH)
    # Divide duration_adult by 60. The column is renamed to "Minutes" at the
    # end of this chain, so this is presumably a seconds -> minutes conversion
    # (not hours) -- TODO confirm the raw unit.
    .assign(duration_adult=lambda df: df.duration_adult / 60)
    .rename_column("age", "Age")
    # Format pid7: -1 and any code above 7 become NaN; every other row keeps
    # its raw pid7 value (the trailing lambda is the fallback).
    .case_when(
        lambda df: df["pid7"] == -1,
        np.nan,
        lambda df: df["pid7"] > 7,
        np.nan,
        lambda df: df["pid7"],
        column_name="Party (7-point)",
    )
    # Format gender: 1 -> "Male", 2 -> "Female", anything else -> NaN
    .case_when(
        lambda df: df["gender"] == 1,
        "Male",
        lambda df: df["gender"] == 2,
        "Female",
        np.nan,
        column_name="Gender",
    )
    # Format race2: codes 1-5 get a label, anything else -> NaN
    .case_when(
        lambda df: df["race2"] == 1,
        "White",
        lambda df: df["race2"] == 2,
        "Black",
        lambda df: df["race2"] == 3,
        "Hispanic",
        lambda df: df["race2"] == 4,
        "Asian",
        lambda df: df["race2"] == 5,
        "Others",
        np.nan,
        column_name="Race",
    )
    # Format educ2: codes 1-4 get a label, anything else -> NaN
    .case_when(
        lambda df: df["educ2"] == 1,
        "No HS",
        lambda df: df["educ2"] == 2,
        "HS",
        lambda df: df["educ2"] == 3,
        "Some college",
        lambda df: df["educ2"] == 4,
        "College",
        np.nan,
        column_name="Education",
    )
    # Format region: codes 1-4 get a label, anything else -> NaN
    .case_when(
        lambda df: df["region"] == 1,
        "Northeast",
        lambda df: df["region"] == 2,
        "Midwest",
        lambda df: df["region"] == 3,
        "South",
        lambda df: df["region"] == 4,
        "West",
        np.nan,
        column_name="Region",
    )
    # Format presvote20post: 1 -> "Vote Biden", 2 -> "Vote Trump", -1 -> NaN;
    # every remaining code falls through to "Other/No vote".
    .case_when(
        lambda df: df["presvote20post"] == 1,
        "Vote Biden",
        lambda df: df["presvote20post"] == 2,
        "Vote Trump",
        lambda df: df["presvote20post"] == -1,
        np.nan,
        # Disabled rule: would also map code 6 to NaN instead of "Other/No vote".
        #               lambda df: df["presvote20post"]==6, np.nan,
        "Other/No vote",
        column_name="2020 Pres. election",
    )
    # Format yes_visit_adults: 1 -> "Yes", 0 -> "No", anything else -> NaN
    .case_when(
        lambda df: df["yes_visit_adults"] == 1,
        "Yes",
        lambda df: df["yes_visit_adults"] == 0,
        "No",
        np.nan,
        column_name="Consume porn",
    )
    # Format other outcomes: give the raw outcome columns the display names
    # used as table row labels. These renames come last because the steps
    # above still refer to the original column names.
    .rename_column("duration_adult", "Minutes")
    .rename_column("prop_adult_duration", "% of time")
    .rename_column("visits_adult", "Visits")
    .rename_column("prop_adult_visits", "% of visits")
    .rename_column("party", "Party")
)
# Preview the first four rows of the prepared frame.
df_ind.head(4)

  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)


Unnamed: 0,caseid,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,Party,Visits,visits_nonadult,visits,% of visits,rep,Minutes,duration_nonadult,duration,% of time,yes_visit_adults,race2,cohort,Age,age2,educ2,Party (7-point),Gender,Race,Education,Region,2020 Pres. election,Consume porn
0,200661421,1963,2,1,4,3,3,1,39,2,D,0.0,1728.0,1728.0,0.0,0.0,0.0,77642.0,77642.0,0.0,0.0,1,1960,60,3600,4,3.0,Female,White,College,Midwest,Vote Biden,No
1,200686597,1992,2,6,5,5,8,-1,48,3,,243.0,3994.0,4237.0,5.73519,,432.533333,234196.0,260148.0,9.97586,1.0,5,1990,31,961,4,,Female,Others,College,South,,Yes
2,200953869,1959,2,1,5,2,7,2,42,1,R,0.0,20.0,20.0,0.0,1.0,0.0,1182.0,1182.0,0.0,0.0,1,1960,64,4096,4,7.0,Female,White,College,Northeast,Vote Trump,No
3,201302005,1966,2,2,3,5,8,1,12,3,,20.0,527.0,547.0,3.656307,,102.766667,23855.0,30021.0,20.538956,1.0,2,1970,57,3249,3,,Female,Black,Some college,South,Vote Biden,Yes


### Splits by party

In [3]:
# (fold cell) Tabulate outcomes
# Summarise the outcome columns split by the `rep` indicator, with p-values
# and standardized mean differences, then export the table body to LaTeX.
outcomes = ["Consume porn", "Minutes", "% of time", "Visits", "% of visits"]
tab_outcomes = TableOne(df_ind, columns=outcomes, groupby="rep", pval=True, smd=True)
tableone_to_texfrag(tab_outcomes, "../tabs/balance_outcomes_by_party")
tab_outcomes

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by rep,Grouped by rep,Grouped by rep,Grouped by rep,Grouped by rep,Grouped by rep
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,0.0,1.0,P-Value,"SMD (0.0,1.0)"
n,,,1200,530,356,,
"Consume porn, n (%)",No,65.0,774 (68.2),343 (68.5),235 (70.6),0.569,0.046
"Consume porn, n (%)",Yes,,361 (31.8),158 (31.5),98 (29.4),,
"Minutes, mean (SD)",,65.0,73.4 (342.1),58.8 (331.7),75.8 (277.4),0.423,0.056
"% of time, mean (SD)",,65.0,3.4 (11.2),2.9 (10.7),3.5 (11.1),0.486,0.049
"Visits, mean (SD)",,65.0,74.3 (328.9),59.9 (298.9),73.7 (271.1),0.489,0.048
"% of visits, mean (SD)",,65.0,2.2 (7.1),1.7 (6.1),2.3 (7.1),0.238,0.085


In [4]:
# (fold cell) Tabulate covariates
# Covariate balance table split by the `rep` indicator, with p-values and
# standardized mean differences; the table body is also exported to LaTeX.
covariates = [
    "Party (7-point)", "2020 Pres. election",
    "Age", "Gender", "Race", "Education", "Region",
]
tab_covariates = TableOne(df_ind, columns=covariates, groupby="rep", pval=True, smd=True)
tableone_to_texfrag(tab_covariates, "../tabs/balance_covariates_by_party")
tab_covariates

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by rep,Grouped by rep,Grouped by rep,Grouped by rep,Grouped by rep,Grouped by rep
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,0.0,1.0,P-Value,"SMD (0.0,1.0)"
n,,,1200,530,356,,
"Party (7-point), mean (SD)",,120.0,3.6 (2.2),1.7 (0.8),6.3 (0.8),<0.001,5.67
"2020 Pres. election, n (%)",Other/No vote,170.0,270 (26.2),97 (20.2),47 (14.1),<0.001,3.296
"2020 Pres. election, n (%)",Vote Biden,,419 (40.7),369 (76.9),8 (2.4),,
"2020 Pres. election, n (%)",Vote Trump,,341 (33.1),14 (2.9),278 (83.5),,
"Age, mean (SD)",,0.0,49.5 (18.1),48.7 (17.8),55.4 (18.0),<0.001,0.373
"Gender, n (%)",Female,0.0,635 (52.9),312 (58.9),174 (48.9),0.004,0.201
"Gender, n (%)",Male,,565 (47.1),218 (41.1),182 (51.1),,
"Race, n (%)",Asian,0.0,49 (4.1),31 (5.8),6 (1.7),<0.001,0.747
"Race, n (%)",Black,,152 (12.7),96 (18.1),7 (2.0),,


### Splits by whether they consumed porn

In [5]:
# (fold cell) Tabulate outcomes
# Summarise the remaining outcomes split by "Consume porn", with p-values and
# standardized mean differences, then export the table body to LaTeX.
# The grouping variable is excluded from `columns` by name rather than by
# position (`outcomes[1:]`): the medians cell later rebinds `outcomes` to a
# list that no longer starts with "Consume porn", and a positional slice would
# then silently drop "Minutes" if this cell were re-run.
tab_outcomes = TableOne(
    df_ind,
    columns=[col for col in outcomes if col != "Consume porn"],
    smd=True,
    groupby="Consume porn",
    pval=True,
)
tableone_to_texfrag(tab_outcomes, "../tabs/balance_outcomes_by_porn_consumers")
tab_outcomes

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Consume porn,Grouped by Consume porn,Grouped by Consume porn,Grouped by Consume porn,Grouped by Consume porn,Grouped by Consume porn
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,No,Yes,P-Value,"SMD (No,Yes)"
n,,,1200,774,361,,
"Minutes, mean (SD)",,65.0,73.4 (342.1),0.0 (0.0),230.8 (576.3),<0.001,0.566
"% of time, mean (SD)",,65.0,3.4 (11.2),0.0 (0.0),10.6 (17.9),<0.001,0.833
"Visits, mean (SD)",,65.0,74.3 (328.9),0.0 (0.0),233.5 (550.8),<0.001,0.599
"% of visits, mean (SD)",,65.0,2.2 (7.1),0.0 (0.0),6.9 (11.2),<0.001,0.87


In [6]:
# (fold cell) Tabulate covariates
# Covariate balance table split by "Consume porn". This rebinds `covariates`
# to a list that also contains the categorical "Party" column.
covariates = [
    "Party (7-point)", "Party", "2020 Pres. election",
    "Age", "Gender", "Race", "Education", "Region",
]
tab_covariates = TableOne(
    df_ind, columns=covariates, groupby="Consume porn", pval=True, smd=True
)
tableone_to_texfrag(tab_covariates, "../tabs/balance_covariates_by_porn_consumers")
tab_covariates

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Consume porn,Grouped by Consume porn,Grouped by Consume porn,Grouped by Consume porn,Grouped by Consume porn,Grouped by Consume porn
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,No,Yes,P-Value,"SMD (No,Yes)"
n,,,1200,774,361,,
"Party (7-point), mean (SD)",,120.0,3.6 (2.2),3.6 (2.2),3.6 (2.1),0.580,-0.037
"Party, n (%)",D,120.0,530 (49.1),343 (49.4),158 (48.8),0.226,0.115
"Party, n (%)",I,,194 (18.0),117 (16.8),68 (21.0),,
"Party, n (%)",R,,356 (33.0),235 (33.8),98 (30.2),,
"2020 Pres. election, n (%)",Other/No vote,170.0,270 (26.2),145 (22.1),110 (34.9),<0.001,0.287
"2020 Pres. election, n (%)",Vote Biden,,419 (40.7),281 (42.8),114 (36.2),,
"2020 Pres. election, n (%)",Vote Trump,,341 (33.1),230 (35.1),91 (28.9),,
"Age, mean (SD)",,0.0,49.5 (18.1),51.3 (18.2),46.1 (17.1),<0.001,-0.295
"Gender, n (%)",Female,0.0,635 (52.9),487 (62.9),109 (30.2),<0.001,0.695


### Medians

In [7]:
def tableone_to_texfrag(tableone, texfile):
    """Write the body rows of a table to a LaTeX fragment file.

    The table is rendered with ``tableone.tabulate(tablefmt="latex")``; the
    first four and last two lines of that rendering are discarded so only the
    data rows remain, ready to be placed inside a hand-written ``tabular``.

    Parameters
    ----------
    tableone : object
        Anything exposing ``tabulate(tablefmt="latex")`` that returns the
        rendered table as a string (e.g. a ``TableOne`` instance).
    texfile : str
        Output path. ``.tex`` is appended only when the path does not already
        end with it.

    Returns
    -------
    None
        The fragment is written to disk as a side effect.
    """
    import re

    tex_table = tableone.tabulate(tablefmt="latex")
    # Drop the 4 leading lines (expected: \begin{tabular}, \hline, header row,
    # \hline) and the 2 trailing lines (expected: \hline, \end{tabular}).
    tex_table_fragment = "\n".join(tex_table.split("\n")[4:-2])
    # Remove the last \\ in the tex fragment to prevent the
    # "Misplaced \noalign" LaTeX error when \bottomrule follows the fragment.
    # Guarded so a short/empty fragment is not truncated blindly.
    if tex_table_fragment.endswith("\\\\"):
        tex_table_fragment = tex_table_fragment[:-2]

    # Collapse empty cells: replace every "& <whitespace> &" with a single "&".
    tex_table_fragment = re.sub(r'&\s+&', ' & ', tex_table_fragment)

    # Save. The previous check compared `texfile.split(".")[-1]` to ".tex",
    # which can never match (the split piece has no dot), so ".tex" was always
    # appended -- even to paths that already ended in ".tex".
    if not texfile.endswith(".tex"):
        texfile += ".tex"
    with open(texfile, "w") as tf:
        tf.write(tex_table_fragment)
    return None

In [8]:
# (fold cell) Tabulate outcomes
# Median-based version of the outcomes-by-party table. Every outcome is passed
# as `nonnormal` (Kruskal-Wallis test, per the original note), and the name of
# the test is not shown. Note this rebinds `outcomes` to the four continuous
# outcome columns only.
outcomes = ["Minutes", "% of time", "Visits", "% of visits"]
tab_outcomes = TableOne(
    df_ind,
    columns=outcomes,
    groupby="rep",
    nonnormal=outcomes,
    pval=True,
    htest_name=False,
)
tableone_to_texfrag(tab_outcomes, "../tabs/balance_outcomes_by_party_medians")
tab_outcomes

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by rep,Grouped by rep,Grouped by rep,Grouped by rep,Grouped by rep
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,0.0,1.0,P-Value
n,,,1200,530,356,
"Minutes, median [Q1,Q3]",,65.0,"0.0 [0.0,4.8]","0.0 [0.0,3.1]","0.0 [0.0,3.6]",0.981
"% of time, median [Q1,Q3]",,65.0,"0.0 [0.0,0.1]","0.0 [0.0,0.1]","0.0 [0.0,0.1]",0.842
"Visits, median [Q1,Q3]",,65.0,"0.0 [0.0,8.0]","0.0 [0.0,6.0]","0.0 [0.0,8.0]",0.933
"% of visits, median [Q1,Q3]",,65.0,"0.0 [0.0,0.2]","0.0 [0.0,0.1]","0.0 [0.0,0.2]",0.916
