# Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import scipy.stats as stats
import statsmodels.api as sm

# Importing Data

In [8]:
# Load the baseline and follow-up extracts with identical parser settings.
# low_memory=False makes pandas parse each file in a single pass, so column
# dtypes are inferred from the whole column instead of chunk by chunk.
raw_baseline, raw_followup = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ("bethell_BASE.csv", "bethell_FUP.csv")
)

# Cleaning Data

In [37]:
# Column groups, keyed by the raw variable names used in the source files.
# Every group is a list so it can be used directly as a DataFrame column
# selector and concatenated with the other groups.
depression = ["DEP_CESD10_COM", "DEP_DPSFD_COM"]
rey = ["COG_REYI_SCORE_COM", "COG_REYII_SCORE_COM"]
aft = ["COG_AFT_SCORE_1_COM", "COG_AFT_SCORE_2_COM"]
# Was a set literal ({...}); pandas 2.x rejects sets as column indexers and a
# set cannot be concatenated with the sibling lists.
mat = ["COG_MAT_SCORE_COM"]
fas = ["FAS_A_SCORE_COM", "FAS_F_SCORE_COM", "FAS_S_SCORE_COM"]
stp = ["STP_COLTIME_SS_COM", "STP_DOTTIME_SS_COM", "STP_WORTIME_SS_COM"]

# The groups below are declared but are not part of important_columns_base.
personality = ["PER_DSCR_EXT_MCQ", "PER_DSCR_CON_MCQ", "PER_DSCR_EMOS_MCQ"]
social_support = ["SSA_DPTNG_COM", "SSA_DPAFF_COM", "SSA_DPSOC_COM", "SSA_DPEMO_COM", "SSA_DPALL_COM"]
social_part = ["SPA_PREVAC_HC_COM"]
vision_hearing = ["HRG_HRG_COM", "VIS_SGHT_COM"]

# Columns kept from the baseline file: identifier, sex and age first, followed
# by the depression and cognitive-score groups. Building the list from the
# groups keeps each column name defined in exactly one place; the resulting
# order is the same as the previously hand-written list.
important_columns_base = [
    "entity_id",
    "SEX_ASK_COM",
    "AGE_NMBR_COM",
    *depression,
    *rey,
    *aft,
    *mat,
    *fas,
    *stp,
]

# Short analysis-friendly names for the raw baseline variable names.
base_dict = {
    # identifier, sex, age
    "entity_id": "ID",
    "SEX_ASK_COM": "Sex",
    "AGE_NMBR_COM": "AGE",
    # depression columns
    "DEP_CESD10_COM": "CESD_10",
    "DEP_DPSFD_COM": "DEP10",
    # REY / AFT / MAT score columns
    "COG_REYI_SCORE_COM": "REYI",
    "COG_REYII_SCORE_COM": "REYII",
    "COG_AFT_SCORE_1_COM": "AFT1",
    "COG_AFT_SCORE_2_COM": "AFT2",
    "COG_MAT_SCORE_COM": "MAT",
    # FAS score columns
    "FAS_A_SCORE_COM": "FAS_A",
    "FAS_F_SCORE_COM": "FAS_F",
    "FAS_S_SCORE_COM": "FAS_S",
    # STP time columns
    "STP_COLTIME_SS_COM": "STP_COLT",
    "STP_DOTTIME_SS_COM": "STP_DOT",
    "STP_WORTIME_SS_COM": "STP_WOR",
}

# Keep only the columns of interest from the raw baseline frame ...
subset_base = raw_baseline[important_columns_base]

# ... and give them their short names. rename() returns a new frame, so
# subset_base keeps the raw column names.
# NOTE(review): no values are altered here. The describe() output recorded in
# this notebook shows values such as -88 / -888 that look like missing-data
# codes -- TODO confirm against the codebook and mask them before analysis.
newname_base = subset_base.rename(columns=base_dict)

newname_base

Unnamed: 0,ID,Sex,AGE,CESD_10,DEP10,REYI,REYII,AFT1,AFT2,MAT,FAS_A,FAS_F,FAS_S,STP_COLT,STP_DOT,STP_WOR
0,9168901,M,51,3.000000,0,7.0,5.0,25.0,27.0,39.0,11.0,10.0,15.0,14.0,10.0,13.0
1,5862419,M,51,13.000000,1,8.0,6.0,26.0,27.0,51.0,12.0,24.0,19.0,20.0,11.0,12.0
2,5622740,F,53,8.000000,0,6.0,3.0,23.0,26.0,33.0,9.0,19.0,14.0,22.0,12.0,17.0
3,2607635,M,82,2.000000,0,4.0,2.0,12.0,13.0,7.0,9.0,9.0,15.0,44.0,28.0,30.0
4,2331443,F,61,15.000000,1,6.0,3.0,17.0,18.0,12.0,11.0,13.0,10.0,19.0,11.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30092,4482683,F,51,11.000000,1,7.0,5.0,17.0,19.0,30.0,13.0,14.0,20.0,25.0,13.0,13.0
30093,9993651,F,50,0.000000,0,5.0,4.0,26.0,31.0,33.0,17.0,14.0,9.0,17.0,10.0,12.0
30094,6960314,F,73,0.000000,0,7.0,6.0,12.0,13.0,20.0,3.0,4.0,9.0,32.0,13.0,18.0
30095,3855235,F,49,5.000000,0,7.0,6.0,14.0,14.0,13.0,11.0,13.0,10.0,20.0,11.0,13.0


In [42]:
# Summary statistics for the numeric columns of the renamed baseline frame.
# The quartiles are spelled out explicitly; they are the same ones describe()
# reports by default.
# NOTE(review): the recorded output below has minima of -88 / -888 and maxima
# of 99 (CESD_10) and 9 (DEP10). These look like missing-data codes rather
# than real scores -- TODO confirm against the codebook and mask them before
# trusting the mean/std rows.
newname_base.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,ID,AGE,CESD_10,DEP10,REYI,REYII,AFT1,AFT2,MAT,FAS_A,FAS_F,FAS_S,STP_COLT,STP_DOT,STP_WOR
count,30097.0,30097.0,30097.0,30097.0,29077.0,29046.0,29366.0,29366.0,28611.0,29329.0,29396.0,29315.0,29724.0,29741.0,29731.0
mean,5499097.0,62.957504,5.565726,0.207463,5.851154,4.040143,19.67115,21.412824,26.535808,11.872345,13.263097,14.024868,25.866337,11.574224,15.583364
std,2592981.0,10.250583,8.278814,0.746171,1.906392,2.164847,5.695854,6.467655,8.750134,4.670207,4.943399,5.042045,32.65306,31.05613,26.154163
min,1000087.0,45.0,-88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-888.0,-888.0,-888.0
25%,3264264.0,54.0,2.0,0.0,5.0,3.0,16.0,17.0,21.0,9.0,10.0,11.0,20.0,10.0,13.0
50%,5499457.0,62.0,4.0,0.0,6.0,4.0,19.0,21.0,27.0,12.0,13.0,14.0,25.0,12.0,15.0
75%,7742441.0,71.0,7.0,0.0,7.0,5.0,23.0,26.0,32.0,15.0,17.0,17.0,31.0,14.0,18.0
max,9999983.0,86.0,99.0,9.0,14.0,14.0,47.0,52.0,51.0,35.0,35.0,40.0,609.0,111.0,149.0


# Analysis Preparation

# Descriptive Statistics 

# Inferential Statistics