# Mortality Group Weekly Report - 10/31/2020
Rustom Ichhaporia

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
# from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.metrics import auc
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
sns.set()

In [2]:
df_raw = pd.read_csv('data/11.csv').drop(['smok100', 'agesmk', 'smokstat', 'smokhome', 'curruse', 'everuse'], axis=1)
df_raw

Unnamed: 0,record,age,race,sex,ms,hisp,adjinc,educ,pob,wt,...,vt,histatus,hitype,povpct,stater,rcow,tenure,citizen,health,indalg
0,88426,70,1.0,2,5.0,3.0,11.0,4.0,909,151,...,0.0,,,18,16,4.0,1.0,,,1.0
1,88427,79,1.0,2,2.0,3.0,11.0,4.0,909,132,...,0.0,,,18,16,3.0,1.0,,,
2,88428,34,1.0,1,1.0,3.0,8.0,4.0,909,155,...,0.0,,,10,16,1.0,2.0,,,1.0
3,88429,32,1.0,2,1.0,3.0,8.0,1.0,909,155,...,0.0,,,10,16,1.0,2.0,,,1.0
4,88430,2,1.0,2,,3.0,8.0,,909,145,...,,,,10,16,,2.0,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1835067,666,19,1.0,1,5.0,2.0,4.0,8.0,909,60,...,0.0,1.0,4.0,6,16,2.0,2.0,,1.0,
1835068,667,33,1.0,2,1.0,2.0,11.0,6.0,909,56,...,0.0,1.0,4.0,10,16,,2.0,1.0,1.0,
1835069,668,16,1.0,2,5.0,2.0,11.0,6.0,909,60,...,0.0,1.0,4.0,10,16,,2.0,,1.0,
1835070,669,7,1.0,2,,2.0,11.0,,909,51,...,,1.0,4.0,10,16,,2.0,1.0,1.0,


Because this data takes a long time to compile and verify from multiple smaller studies, the creators included both a definite and an algorithmic indicator of death to speed up the release of the data. These features, `'inddea'` and `'indalg'`, respectively, have been combined through intersection into one target variable, `'indmort'` in the way that the reference manual recommends. The expanded list of features is below. 

In [3]:
numerical = ['age', 'hhnum']
uneven_numerical = ['adjinc', 'health', 'follow']
categorical = ['race', 'sex', 'ms', 'hisp', 'educ', 'pob', 'hhid', 'reltrf', 'occ', 'majocc', 'ind', 'esr', 'urban', 'smsast', 'inddea', 'cause113', 'dayod', 'hosp', 'hospd', 'ssnyn', 'vt', 'histatus', 'hitype', 'povpct', 'stater', 'rcow', 'tenure', 'citizen', 'indalg']
smoking = ['smok100', 'agesmk', 'smokstat', 'smokhome', 'curruse', 'everuse']
misc = ['record', 'wt']


<pre>List of all the features and their full names is pasted below. For the full description of the features, refer to the read_pubfile5.dat file. 
@1 record     $   7.     /*Record Number (page no. 6)                              */
    @8 age            2.     /*Age at Time of Interview (page no. 10)                  */
    @10 race      $   1.     /*Race  (page no.12)                                      */
    @11 sex       $   1.     /*Sex   (page no.10)                                      */
    @12 ms        $   1.     /*Marital Status (page no.13)                             */
    @13 hisp      $   1.     /*Hispanic Origin (page no. 12)                           */
    @14 adjinc    $   2.     /*Inflation Adjusted Income (page no.20)                  */
    @16 educ      $   2.     /*Highest Grade Completed (page no.14)                    */
    @18 pob       $   3.     /*Region of Birth (page no. 11)                           */
    @21 wt            4.     /*Adjusted Weight (page no. 6 )                           */
    @25 hhid      $   7.     /*Household ID No. (page no. 6)                           */
    @32 hhnum         2.     /*Number of People in HH (page no. 14)                    */
    @34 reltrf    $   1.     /*Relationship to Reference Person (page no.13)           */
    @35 occ       $   4.     /*4 Digit Occupation Code (page no. 18)                   */
    @39 majocc    $   2.     /*Major Occupation Code (page no. 18 )                    */
    @41 ind       $   4.     /*4 Digit Industry Code (page no. 17)                     */
    @45 majind    $   2.     /*Major Industry Code (page no. 18)                       */
    @47 esr       $   1.     /*Employment Status Recode (page no. 17)                  */
    @48 urban     $   1.     /*Urban/Rural Status (page no. 8)                         */
    @49 smsast    $   1.     /*SMSAST Status (page no.9)                               */
    @50 inddea    $   1.     /*Death Indicator (page no. 23)                           */
    @51 cause113  $   3.     /*Cause of Death (page no. 23)                            */
    @54 follow        4.     /*Length of Follow-up (page no. 24)                       */
    @58 dayod     $   1.     /*Day of Week of Death (page no. 24)                      */
    @59 hosp      $   1.     /*Hospital Type (page no.25)                              */
    @60 hospd     $   1.     /*Hospital Death Indicator (page no. 26)                  */
    @61 ssnyn     $   1.     /*Presence of SSN (page no. 7)                            */
    @62 vt        $   1.     /*Veteran Status (page no. 16)                            */
    @63 histatus  $   1.     /*Health Insurance Status (page no. 22)                   */
    @64 hitype    $   1.     /*Health Insurance Type (page no. 22)                     */
    @65 povpct    $   2.     /*Income as Percent of Poverty Level (page no. 21)        */
    @67 stater    $   2.     /*State Recode (page no. 8)                               */
    @69 rcow      $   2.     /*Recoded Class of Worker (page no.19)                    */
    @71 tenure    $   1.     /*Housing Tenure (page no. 14)                            */
    @72 citizen   $   1.     /*Citizenship (page no. 16)                               */
    @73 health    $   2.     /*Health (page no. 16)                                    */
    @75 indalg        1.     /*Indicator of Algorithmic Death (page no. 27)            */
    @76 smok100   $   1.     /*Smoked More than 100 Cigarettes (page no. 28)           */
    @77 agesmk    $   2.     /*Age Started Smoking (page no. 28)                       */
    @79 smokstat  $   1.     /*Cigarette Smoking Status (page no.28)                   */
    @80 smokhome  $   1.     /*Rules for Smoking Cigarettes in the Home (page no. 29 ) */
    @81 curruse   $   5.     /*Currently Use Smokeless Tobacco (page no. 30)           */
    @86 everuse   $   5.     /*Ever Use Smokeless Tobacco (page no. 31)                */</pre>

In [4]:
# indmort is the recommended combination feature of both confirmed deaths and computer-predicted deaths based on the data collection agency
df_raw['indmort'] = df_raw['inddea'][(df_raw['inddea'] == 1) & (df_raw['indalg'] == 1)]
df_raw['indmort'] = df_raw['indmort'].fillna(0)
df_raw['indmort'].sum()

94107.0

In [5]:
# Selection of fewer variables for EDA purposes
# Remove cause113 because it is not predictive
used_features = ['age', 'hhnum', 'adjinc', 'health', 'occ', 'ind', 'esr', 'ms', 'indmort']
df = df_raw[used_features]

In [6]:
df.isna().sum() / df.shape[0]

age        0.000000
hhnum      0.000000
adjinc     0.024124
health     0.790674
occ        0.466099
ind        0.466219
esr        0.191220
ms         0.196846
indmort    0.000000
dtype: float64

In [7]:
mort_corr = df.corr()['indmort'].sort_values()
mort_corr

hhnum     -0.169388
adjinc    -0.098752
ms        -0.073020
ind       -0.010118
occ        0.004227
esr        0.195555
health     0.282516
age        0.336753
indmort    1.000000
Name: indmort, dtype: float64

Correlations can be useful for numerical features, but most of these features are categorical, so I decided to begin one-hot encoding and imputation of missing values. Most of the health rating entries are still missing. 

In [8]:
df = df.astype({'occ':'category', 'ind': 'category', 'esr': 'category', 'ms': 'category'})
df.dtypes

age           int64
hhnum         int64
adjinc      float64
health      float64
occ        category
ind        category
esr        category
ms         category
indmort     float64
dtype: object

In [9]:
# df = pd.get_dummies(df, dummy_na=True)

I still am not really sure what the best way for me to impute the values is as I have not really been able to discover this from looking at the reports of other groups. However, after doing some of my own research, I found that 

In [10]:
df

Unnamed: 0,age,hhnum,adjinc,health,occ,ind,esr,ms,indmort
0,70,2,11.0,,2630.0,5470.0,1.0,5.0,0.0
1,79,2,11.0,,4700.0,5470.0,1.0,2.0,0.0
2,34,3,8.0,,8960.0,2980.0,1.0,1.0,0.0
3,32,3,8.0,,8960.0,5470.0,1.0,1.0,0.0
4,2,3,8.0,,,,,,0.0
...,...,...,...,...,...,...,...,...,...
1835067,19,2,4.0,1.0,4760.0,4770.0,1.0,5.0,0.0
1835068,33,6,11.0,1.0,,,5.0,1.0,0.0
1835069,16,6,11.0,1.0,,,5.0,5.0,0.0
1835070,7,6,11.0,1.0,,,,,0.0


In [11]:
df_raw['citizen'].value_counts(dropna=False)

NaN    1370211
1.0     406059
5.0      34765
4.0      16009
3.0       4167
2.0       3861
Name: citizen, dtype: int64

In [12]:
df_raw['hosp'].value_counts(dropna=False)

NaN    1677278
1.0      91819
4.0      34968
2.0      25142
5.0       5308
6.0        387
3.0        170
Name: hosp, dtype: int64

In [13]:
(df_raw.isna().sum() / df_raw.shape[0]).sort_values()

record      0.000000
stater      0.000000
povpct      0.000000
ssnyn       0.000000
follow      0.000000
cause113    0.000000
inddea      0.000000
hhnum       0.000000
hhid        0.000000
indmort     0.000000
pob         0.000000
age         0.000000
sex         0.000000
wt          0.000000
race        0.001575
reltrf      0.002547
urban       0.007107
smsast      0.007115
tenure      0.013867
adjinc      0.024124
hisp        0.028610
educ        0.191202
esr         0.191220
ms          0.196846
vt          0.215823
histatus    0.314846
hitype      0.314846
rcow        0.465219
majocc      0.466099
occ         0.466099
majind      0.466219
ind         0.466219
citizen     0.746680
health      0.790674
indalg      0.809948
dayod       0.912401
hosp        0.914012
hospd       0.920889
dtype: float64

In [14]:
# indmort is the recommended combination feature of both confirmed deaths and computer-predicted deaths based on the data collection agency
df_raw['indmort'] = df_raw['inddea'][(df_raw['inddea'] == 1) & (df_raw['indalg'] == 1)]
df_raw['indmort'] = df_raw['indmort'].fillna(0)
df_raw['indmort'].sum()

94107.0

In [15]:
numerical = ['age', 'hhnum', 'povpct']
uneven_numerical = ['adjinc', 'health', 'follow']
categorical = ['race', 'sex', 'ms', 'hisp', 'educ', 'pob', 'hhid', 'reltrf', 'occ', 'majocc', 'ind', 'esr', 'urban', 'smsast', 'inddea', 'cause113', 'dayod', 'hosp', 'hospd', 'ssnyn', 'vt', 'histatus', 'hitype', 'povpct', 'stater', 'rcow', 'tenure', 'citizen', 'indalg']
smoking = ['smok100', 'agesmk', 'smokstat', 'smokhome', 'curruse', 'everuse']
misc = ['record', 'wt']

used_features = ['age', 'sex', 'stater', 'povpct', 'pob', 'race', 'urban', 'ms', 'adjinc', 'educ', 'indmort']#, 'educ', 'stater']#, 'wt']
df = df_raw[used_features]
df.dtypes

age          int64
sex          int64
stater       int64
povpct       int64
pob          int64
race       float64
urban      float64
ms         float64
adjinc     float64
educ       float64
indmort    float64
dtype: object

In [16]:
df = df.astype({'sex':'category', 'stater': 'category', 'pob': 'category', 'race': 'category', 'urban': 'category', 'ms': 'category'})
df = df.astype({'educ':'category', 'stater':'category'})
df.dtypes

age           int64
sex        category
stater     category
povpct        int64
pob        category
race       category
urban      category
ms         category
adjinc      float64
educ       category
indmort     float64
dtype: object

In [17]:
df['indmort'].value_counts(normalize=True)

0.0    0.948718
1.0    0.051282
Name: indmort, dtype: float64

In [18]:
# df_alive = df[df.indmort == 0.0]
# df_dead = df[df.indmort == 1.0]
# df = pd.concat([resample(df_alive, n_samples=df_dead.shape[0]*5), df_dead])
# df

In [19]:
df.shape

(1835072, 11)

In [20]:
(df.isna().sum() / df.shape[0]).sort_values()

age        0.000000
sex        0.000000
stater     0.000000
povpct     0.000000
pob        0.000000
indmort    0.000000
race       0.001575
urban      0.007107
adjinc     0.024124
educ       0.191202
ms         0.196846
dtype: float64

In [21]:
df_dropped = df.dropna(axis=0)
df_dropped = pd.get_dummies(df_dropped)

In [22]:
df_dropped.shape

(1443262, 146)

In [23]:
y = df_dropped['indmort']
X = df_dropped.drop(columns=['indmort'])
print('Dead / alive ratio before SMOTE:', y.sum() / y.shape[0])

Dead / alive ratio before SMOTE: 0.06430156132427792


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [25]:
# Only use oversampling for the train dataset, use regular distribution for testing
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)
print('Dead / alive ratio after SMOTE:', y_train.sum() / y_train.shape[0])

Dead / alive ratio after SMOTE: 0.5


In [26]:
# X_wt = X_train['wt']
# X_train = X_train.drop(columns=['wt'])
# X_test = X_test.drop(columns=['wt'])

In [27]:
model = LGBMClassifier(is_unbalance=True, max_depth=20)
model.fit(X_train, y_train)

LGBMClassifier(is_unbalance=True, max_depth=20)

In [28]:
y_pred = model.predict(X_test)

In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.92      0.94    337434
         1.0       0.32      0.57      0.41     23382

    accuracy                           0.89    360816
   macro avg       0.64      0.74      0.67    360816
weighted avg       0.93      0.89      0.91    360816



In [31]:
roc_auc_score(y_test, y_pred)
# metrics.auc(fpr, tpr)

0.7416796734394876

The model is still performing very poorly. Some other attempted parameters I attempted included sample_weight, adding extra features to model, removing SMOTE, using scale_pos_weight instead of SMOTE, etc. I am not sure how to improve the performance from here. Changing the amount of minority oversampling does not seem to help model accuracy, as the f1-score of the death class is still never higher than 0.45. I considered consolidating the job types, but it did not seem useful. I am unsure what features I should change to increase accuracy. I apologize for the lack of improvement in the model and slow progress. 

In [23]:
# average_precision_score(y_test, y_pred) # sample_weight=X_test.wt

In [None]:
# parameters = {
#     "loss":["deviance"],
#     "learning_rate": [0.01],# , 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
#     # "min_samples_split": np.linspace(0.1, 0.5, 12),
#     # "min_samples_leaf": np.linspace(0.1, 0.5, 12),
#     "max_depth":[5],#[3,5,8],
#     # "max_features":["log2","sqrt"],
#     # "criterion": ["roc"],# ["friedman_mse",  "mae"],
#     "criterion": ["friedman_mse"],
#     # "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
#     "n_estimators":[100]
# }

# model = GridSearchCV(GradientBoostingClassifier(), parameters, cv=10, n_jobs=-1)

# model.fit(X_train, y_train)