In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Column groups read from the raw file. Each list holds the column names of
# one thematic feature group; the groups are concatenated when loading.
demographic = [
    'AGE', 'SEX', 'RACEA', 'REGION', 'USBORN',
    'FAMSIZE', 'MARST', 'POVERTY', 'EDUCREC1',
]

insurace = [
    'HIPRIVATEE', 'HIMCAIDE', 'HIMCAREE', 'HINOTCOVE',
]

health = [
    'BMICALC', 'HSTATYR', 'FSRAWSCORE',
]

smoke = [
    'SMOKESTATUS2', 'CIGSDAY',
]

alch = [
    'ALCSTAT1', 'ALCDAYSYR', 'ALCAMT', 'ALC5UPYR',
]

activity = [
    'MOD10DMIN', 'MOD10FWK', 'VIG10DMIN', 'VIG10FWK', 'STRONGFWK',
]

sleep = [
    'HRSLEEP',
]

p_health = [
    'CANCEREV', 'DIABETICEV', 'HEARTATTEV', 'CHEARTDIEV', 'STROKEV',
    'HEARTCONEV', 'HYPERTENEV', 'EMPHYSEMEV', 'LIVERCHRON', 'LIVERCONYR',
    'KIDNEYWKYR', 'ARTHGLUPEV', 'CPOXEV', 'BLIND',
]

m_health = [
    'WRYRET', 'WRYMEDCST', 'AWORTHLESS', 'AEFFORT', 'AFEELINT1MO',
]

tech = [
    'MORTSTAT',
]

# helper columns needed only during preprocessing; dropped once it is done
universe_fixers = [
    'ALC1YR', 'ALCLIFE', 'VISIONPROB',
    'AHOPELESS', 'ANERVOUS', 'ARESTLESS', 'ASAD',
    'INCIMPPOINT1', 'EARNIMPOINT1',
]
# columns kept a little longer and dropped at a later stage
drop_next = [
    'YEAR', 'MORTDODY',
]

In [4]:
# Read only the columns this notebook works with: every feature group, the
# target, the universe-fixing helpers and the columns dropped later on.
columns_to_load = (
    demographic + insurace + health + smoke + alch + activity + sleep
    + p_health + m_health + tech + universe_fixers + drop_next
)
df_original = pd.read_csv('thesis__data_res.csv', usecols=columns_to_load)

In [5]:
# Use only samples tracked for mortality: MORTSTAT codes 1 and 2 are the two
# values the following cells treat as deceased / alive.
# .copy() turns the filtered result into an independent frame, so the column
# assignments made in the next cells do not write into a slice of the raw
# data (this is what triggers pandas' SettingWithCopyWarning).
df_original = df_original[df_original['MORTSTAT'].isin([1, 2])].copy()

In [6]:
# define TIMETOEVENT
#   MORTSTAT == 1 : MORTDODY - YEAR
#   otherwise     : 2019 - YEAR
# NOTE(review): 2019 is presumably the last year of mortality follow-up -
# confirm against the data documentation.
# Computed column-wise with np.where instead of a row-wise DataFrame.apply,
# which builds one Series per row and is very slow on a frame of this size.
df_original['TIMETOEVENT'] = np.where(
    df_original['MORTSTAT'] == 1,
    df_original['MORTDODY'] - df_original['YEAR'],
    2019 - df_original['YEAR'],
)

In [7]:
# binary target: code 1 stays 1 (dead), code 2 becomes 0 (alive)
df_original['MORTSTAT'] = df_original['MORTSTAT'].replace({2: 0})

In [8]:
# work on an independent copy so df_original stays untouched by preprocessing
df = df_original.copy(deep=True)

In [9]:
# feature preprocessing

In [10]:
# AGE

# top-code: every age above 85 is set to 85
df['AGE'] = df['AGE'].mask(df['AGE'] > 85, 85)

In [11]:
# SEX

In [12]:
# RACEA

# remove unknown and NIU (codes 900 and above); .copy() gives an independent
# frame so the recoding below does not assign into a filtered slice
df = df[df['RACEA'] < 900].copy()

# group the detailed codes by their hundreds digit
df['RACEA'] = df['RACEA'] // 100

# resulting categories:
# white = 1
# black = 2
# other = 3  (groups 3, 5 and 6 merged)
# asian = 4
# a single dict replace folds groups 5 and 6 into 3; group 3 is already 3
df['RACEA'] = df['RACEA'].replace({5: 3, 6: 3})

print('number of samples', len(df))

number of samples 2091202


In [13]:
# REGION

In [14]:
# USBORN

# drop unknown and NIU: only codes up to 20 are kept
df = df.loc[df['USBORN'].le(20)]

# collapse codes 10 through 12 into the single code 10
df['USBORN'] = df['USBORN'].mask(df['USBORN'].between(10, 12), 10)

print('number of samples', df.shape[0])

number of samples 1260000


In [15]:
# FAMSIZE

# drop unknown and NIU: only values up to 90 are kept
df = df.loc[df['FAMSIZE'].le(90)]

print('number of samples', df.shape[0])

number of samples 1260000


In [16]:
# MARST

# collapse codes 10 through 13 into the single code 10
df['MARST'] = df['MARST'].mask((df['MARST'] >= 10) & (df['MARST'] <= 13), 10)

# drop unknown and NIU: only the five grouped codes remain
df = df.loc[df['MARST'].isin((10, 20, 30, 40, 50))]

print('number of samples', df.shape[0])

number of samples 1255683


In [17]:
# EDUCREC1

# drop unknown: keep codes 1 through 16 (both ends included)
df = df[df['EDUCREC1'].between(1, 16)].copy()

print('number of samples', df.shape[0])

number of samples 1240097


In [18]:
# HIPRIVATEE

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[df['HIPRIVATEE'].isin([1, 2])]

print('number of samples', df.shape[0])

number of samples 1221136


In [19]:
# HIMCAIDE

# fold code 3 into code 2, then drop unknown and NIU (keep codes 1 and 2)
df['HIMCAIDE'] = df['HIMCAIDE'].replace({3: 2})
df = df.loc[df['HIMCAIDE'].isin([1, 2])]

print('number of samples', df.shape[0])

number of samples 1221136


In [20]:
# HIMCAREE

# fold code 3 into code 2, then drop unknown and NIU (keep codes 1 and 2)
df['HIMCAREE'] = df['HIMCAREE'].replace({3: 2})
df = df.loc[df['HIMCAREE'].isin([1, 2])]

print('number of samples', df.shape[0])

number of samples 1221086


In [21]:
# HINOTCOVE

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[df['HINOTCOVE'].isin([1, 2])]

print('number of samples', df.shape[0])

number of samples 1221086


In [22]:
# BMICALC

# Disabled: the fixes below rely on a WEIGHT column, which is not among the
# columns loaded by this notebook. They are kept for reference only.

# fix samples with no BMI because weight is bottom-coded (not the same as 'too low')
#condition_15 = (
#    (df['WEIGHT'] == 126) & (df['SEX'] == 1) |  
#    (df['WEIGHT'] == 99)  & (df['SEX'] == 2) & (df['YEAR'] <= 2005) |  
#    (df['WEIGHT'] == 100) & (df['SEX'] == 2) & (df['YEAR'] >= 2006)
#)

# fix samples with no BMI because weight is top-coded (not the same as 'too high')
#condition_50 = (
#    (df['WEIGHT'] == 285) & (df['SEX'] == 1) & (df['YEAR'] <= 2005) |  
#    (df['WEIGHT'] == 299) & (df['SEX'] == 1) & (df['YEAR'] >= 2006) |  
#    (df['WEIGHT'] == 259) & (df['SEX'] == 2) & (df['YEAR'] <= 2005) |  
#    (df['WEIGHT'] == 274) & (df['SEX'] == 2) & (df['YEAR'] >= 2006)   
#)

# assign values based on conditions
#df.loc[condition_15, 'BMICALC'] = 15
#df.loc[condition_50, 'BMICALC'] = 50


# drop unknown and NIU: keep values strictly between 0 and 900
df = df.loc[df['BMICALC'].gt(0) & df['BMICALC'].lt(900)]

print('number of samples', df.shape[0])

number of samples 573418


In [23]:
# HSTATYR

# drop unknown and NIU: only codes 1, 2 and 3 are kept
df = df.loc[df['HSTATYR'].isin((1, 2, 3))]

# make it ordinal by swapping codes 2 and 3:
#   old 3 (about the same) -> 2
#   old 2 (worse)          -> 3
# the dict form replaces both codes in one pass, so no temporary code is needed
df['HSTATYR'] = df['HSTATYR'].replace({3: 2, 2: 3})

print('number of samples', df.shape[0])

number of samples 572892


In [24]:
# FSRAWSCORE

# drop unknown samples: only scores up to 10 are kept
df = df.loc[df['FSRAWSCORE'].le(10)]

print('number of samples', df.shape[0])

number of samples 222580


In [25]:
# SMOKESTATUS2

# codes kept (unknown and NIU are dropped):
# 11 = current everyday
# 12 = current someday
# 20 = former
# 30 = never
df = df.loc[df['SMOKESTATUS2'].isin((11, 12, 20, 30))]

print('number of samples', df.shape[0])

number of samples 222338


In [26]:
# CIGSDAY

# NIU (code 96) counts as 0 cigarettes
df['CIGSDAY'] = df['CIGSDAY'].replace({96: 0})

# drop unknown: only values up to 90 are kept
df = df.loc[df['CIGSDAY'].le(90)]

print('number of samples', df.shape[0])

number of samples 222109


In [27]:
# ALCSTAT1

# drop unknown and NIU: only codes 1, 2 and 3 are kept
df = df.loc[df['ALCSTAT1'].isin((1, 2, 3))]

print('number of samples', df.shape[0])

number of samples 220609


In [28]:
# ALCDAYSYR

# fix universe: keep rows where ALC1YR or ALCLIFE holds code 1 or 2
df = df.loc[df['ALC1YR'].isin([1, 2]) | df['ALCLIFE'].isin([1, 2])]

# NIU (code 996) counts as 0 days
df['ALCDAYSYR'] = df['ALCDAYSYR'].replace({996: 0})

# drop unknown: only values up to 365 are kept
df = df.loc[df['ALCDAYSYR'].le(365)]

print('number of samples', df.shape[0])

number of samples 220609


In [29]:
# ALCAMT

# universe already fixed in the ALCDAYSYR step

# drop unknown: only values up to 95 are kept
df = df.loc[df['ALCAMT'].le(95)]

print('number of samples', df.shape[0])

number of samples 219995


In [30]:
# ALC5UPYR

# universe already fixed in the ALCDAYSYR step

# NIU (code 996) counts as 0 days
df['ALC5UPYR'] = df['ALC5UPYR'].replace({996: 0})

# drop unknown: only values up to 365 are kept
df = df.loc[df['ALC5UPYR'].le(365)]

print('number of samples', df.shape[0])

number of samples 218399


In [31]:
# MOD10FWK

# codes 94, 95 and 96 (never, unable, less than once a week) count as 0
df['MOD10FWK'] = df['MOD10FWK'].mask(df['MOD10FWK'].isin([94, 95, 96]), 0)
# drop samples without data: only values up to 93 are kept
df = df.loc[df['MOD10FWK'].le(93)]

print('number of samples', df.shape[0])

number of samples 215687


In [32]:
# MOD10DMIN

# drop people who refused to answer: only values up to 996 are kept
df = df.loc[df['MOD10DMIN'].le(996)]

print('number of samples', df.shape[0])

number of samples 214941


In [33]:
# VIG10FWK

# codes 94, 95 and 96 (never, unable, less than once a week) count as 0
df['VIG10FWK'] = df['VIG10FWK'].mask(df['VIG10FWK'].isin([94, 95, 96]), 0)
# drop samples without data: only values up to 93 are kept
df = df.loc[df['VIG10FWK'].le(93)]

print('number of samples', df.shape[0])

number of samples 214186


In [34]:
# VIG10DMIN

# drop people who refused to answer: only values up to 996 are kept
df = df.loc[df['VIG10DMIN'].le(996)]

print('number of samples', df.shape[0])

number of samples 213946


In [35]:
# STRONGFWK

# codes 94, 95 and 96 (never, unable, less than once a week) count as 0
df['STRONGFWK'] = df['STRONGFWK'].mask(df['STRONGFWK'].isin([94, 95, 96]), 0)
# drop samples without data: only values up to 93 are kept
df = df.loc[df['STRONGFWK'].le(93)]

print('number of samples', df.shape[0])

number of samples 213642


In [36]:
# HRSLEEP

# drop people who refused to answer: keep values from 1 to 24 inclusive
df = df.loc[df['HRSLEEP'].between(1, 24)]

print('number of samples', df.shape[0])

number of samples 210305


In [37]:
# CANCEREV

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['CANCEREV'] == 1) | (df['CANCEREV'] == 2)]

print('number of samples', df.shape[0])

number of samples 210208


In [38]:
# DIABETICEV

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['DIABETICEV'] == 1) | (df['DIABETICEV'] == 2)]

print('number of samples', df.shape[0])

number of samples 206052


In [39]:
# HEARTATTEV

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['HEARTATTEV'] == 1) | (df['HEARTATTEV'] == 2)]

print('number of samples', df.shape[0])

number of samples 205967


In [40]:
# CHEARTDIEV

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['CHEARTDIEV'] == 1) | (df['CHEARTDIEV'] == 2)]

print('number of samples', df.shape[0])

number of samples 205763


In [41]:
# STROKEV

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['STROKEV'] == 1) | (df['STROKEV'] == 2)]

print('number of samples', df.shape[0])

number of samples 205698


In [42]:
# HEARTCONEV

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['HEARTCONEV'] == 1) | (df['HEARTCONEV'] == 2)]

print('number of samples', df.shape[0])

number of samples 205643


In [43]:
# HYPERTENEV

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['HYPERTENEV'] == 1) | (df['HYPERTENEV'] == 2)]

print('number of samples', df.shape[0])

number of samples 205547


In [44]:
# EMPHYSEMEV

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['EMPHYSEMEV'] == 1) | (df['EMPHYSEMEV'] == 2)]

print('number of samples', df.shape[0])

number of samples 205505


In [45]:
# LIVERCHRON

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['LIVERCHRON'] == 1) | (df['LIVERCHRON'] == 2)]

print('number of samples', df.shape[0])

number of samples 204931


In [46]:
# LIVERCONYR

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['LIVERCONYR'] == 1) | (df['LIVERCONYR'] == 2)]

print('number of samples', df.shape[0])

number of samples 204883


In [47]:
# KIDNEYWKYR

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['KIDNEYWKYR'] == 1) | (df['KIDNEYWKYR'] == 2)]

print('number of samples', df.shape[0])

number of samples 204834


In [48]:
# ARTHGLUPEV

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['ARTHGLUPEV'] == 1) | (df['ARTHGLUPEV'] == 2)]

print('number of samples', df.shape[0])

number of samples 204713


In [49]:
# CPOXEV

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['CPOXEV'] == 1) | (df['CPOXEV'] == 2)]

print('number of samples', df.shape[0])

number of samples 196953


In [50]:
# BLIND

# fix universe: wherever VISIONPROB is 1, BLIND is set to 1
df['BLIND'] = df['BLIND'].mask(df['VISIONPROB'] == 1, 1)

# drop unknown and NIU: only codes 1 and 2 are kept
df = df.loc[(df['BLIND'] == 1) | (df['BLIND'] == 2)]

print('number of samples', df.shape[0])

number of samples 196919


In [51]:
# WRYRET

# drop unknown and NIU: only codes 1 to 4 are kept
df = df.loc[df['WRYRET'].isin(range(1, 5))]

print('number of samples', df.shape[0])

number of samples 144091


In [52]:
# WRYMEDCST

# drop unknown and NIU: only codes 1 to 4 are kept
df = df.loc[df['WRYMEDCST'].isin(range(1, 5))]

print('number of samples', df.shape[0])

number of samples 144010


In [53]:
# AWORTHLESS

# drop unknown and NIU: only codes 0 to 4 are kept
df = df.loc[df['AWORTHLESS'].isin(range(0, 5))]

print('number of samples', df.shape[0])

number of samples 143256


In [54]:
# AEFFORT

# drop unknown and NIU: only codes 0 to 4 are kept
df = df.loc[df['AEFFORT'].isin(range(0, 5))]

print('number of samples', df.shape[0])

number of samples 143164


In [55]:
# AFEELINT1MO

# fix universe: each of these columns must hold a value of at most 4
for universe_col in ('AHOPELESS', 'ANERVOUS', 'ARESTLESS', 'ASAD', 'AWORTHLESS', 'AEFFORT'):
    df = df[df[universe_col] <= 4]

# drop unknown and NIU: only values up to 5 are kept
df = df.loc[df['AFEELINT1MO'].le(5)]

# make AFEELINT1MO ordinal: code 0 is moved to 5
df['AFEELINT1MO'] = df['AFEELINT1MO'].replace({0: 5})

print('number of samples', df.shape[0])

number of samples 143002


In [56]:
# POVERTY

# remove samples with undefinable POVERTY value (code 98); .copy() gives an
# independent frame so the assignments below do not write into a slice
df = df[df['POVERTY'] != 98].copy()

# group the detailed codes into six classes in one vectorised pass
# (10-14 -> 1, 20-25 -> 2, 31-32 -> 3, 33-34 -> 4, 35-36 -> 5, 37 -> 6).
# Codes 38 and 99 become NaN so they can be imputed below. Any other code is
# left unchanged, exactly as with the previous chain of per-row lambdas.
poverty_groups = {
    **dict.fromkeys(range(10, 15), 1),
    **dict.fromkeys(range(20, 26), 2),
    **dict.fromkeys((31, 32), 3),
    **dict.fromkeys((33, 34), 4),
    **dict.fromkeys((35, 36), 5),
    37: 6,
    38: np.nan,
    99: np.nan,
}
df['POVERTY'] = df['POVERTY'].replace(poverty_groups)

# predict POVERTY for the rows that had POVERTY = 38 or 99
model = RandomForestClassifier(n_estimators=100, criterion='log_loss', random_state=42)

# define features and target
impute_features = ['INCIMPPOINT1', 'EARNIMPOINT1', 'FAMSIZE', 'YEAR']
poverty_missing = df['POVERTY'].isna()

X_train = df.loc[~poverty_missing, impute_features]
y_train = df.loc[~poverty_missing, 'POVERTY'].astype(int)
X_test = df.loc[poverty_missing, impute_features]

# only fit and predict when there is something to impute: predict() raises on
# an empty feature matrix
if len(X_test) > 0:
    model.fit(X_train, y_train)
    # The classifier returns class labels, so no rounding is needed. X_test
    # keeps the row order of df[poverty_missing], so the predictions line up
    # with the masked rows position by position.
    df.loc[poverty_missing, 'POVERTY'] = model.predict(X_test)

print('number of samples', len(df))

number of samples 142152


In [57]:
# subset for the 3-year mortality state prediction: survey years up to 2016
df_2016 = df.loc[df['YEAR'].le(2016)]

In [58]:
# the helper columns listed in universe_fixers are no longer needed;
# drop them from both frames
df = df.drop(universe_fixers, axis='columns')
df_2016 = df_2016.drop(universe_fixers, axis='columns')

In [59]:
# write the subset to CSV without the index column
df_2016.to_csv(path_or_buf="df2016.csv", index=False)

In [60]:
# remove the columns listed in drop_next (YEAR and MORTDODY)
df_2016 = df_2016.drop(drop_next, axis='columns')

In [65]:
# write the reduced subset for the Cox model to CSV without the index column
df_2016.to_csv(path_or_buf="df2016cox.csv", index=False)

In [61]:
# D_{imputed}

In [62]:
# size of df and the share of rows with MORTSTAT 1 (non-censored) and 0 (censored)
n_total = len(df)
n_events = int((df['MORTSTAT'] == 1).sum())
n_censored = int((df['MORTSTAT'] == 0).sum())
print("number of samples:", n_total)
print(f"non-censored: {n_events / n_total * 100:.2f}%")
print(f"censored: {n_censored / n_total * 100:.2f}%")

number of samples: 142152
non-censored: 4.64%
censored: 95.36%


In [63]:
# D_{final}

In [64]:
# size of df_2016 and the share of rows with MORTSTAT 1 (non-censored) and 0 (censored)
n_total_2016 = len(df_2016)
n_events_2016 = int((df_2016['MORTSTAT'] == 1).sum())
n_censored_2016 = int((df_2016['MORTSTAT'] == 0).sum())
print("number of samples:", n_total_2016)
print(f"non-censored: {n_events_2016 / n_total_2016 * 100:.2f}%")
print(f"censored: {n_censored_2016 / n_total_2016 * 100:.2f}%")

number of samples: 103400
non-censored: 5.56%
censored: 94.44%
