In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve

from imblearn.over_sampling import SMOTE

sns.set()


Data successfully loaded.


In [8]:
'''Preprocessing'''

df_raw = pd.read_csv('data/11.csv')
print('Data successfully loaded.')

df_raw = df_raw.drop(columns=['smok100', 'agesmk', 'smokstat', 'smokhome', 'curruse', 'everuse'])

df_raw['indmort'] = df_raw['inddea'][(df_raw['inddea'] == 1) & (df_raw['indalg'] == 1)]
df_raw['indmort'] = df_raw['indmort'].fillna(0)

used_numerical = ['age', 'hhnum']
used_ordinal = ['povpct', 'adjinc']
used_categorical = ['stater', 'pob', 'sex', 'race', 'urban', 'smsast']
used_special = ['wt', 'indmort']

used_features = used_numerical + used_ordinal + used_categorical + used_special

df_raw = df_raw[used_features]

df_raw[used_categorical] = df_raw[used_categorical].astype('category')

df_raw = df_raw.dropna(axis=0)


Data successfully loaded.


In [3]:
df_raw

Unnamed: 0,age,hhnum,povpct,adjinc,stater,pob,sex,race,urban,smsast,wt,indmort
0,70,2,18,11.0,16,909,2,1.0,1.0,1.0,151,0.0
1,79,2,18,11.0,16,909,2,1.0,1.0,1.0,132,0.0
2,34,3,10,8.0,16,909,1,1.0,1.0,1.0,155,0.0
3,32,3,10,8.0,16,909,2,1.0,1.0,1.0,155,0.0
4,2,3,10,8.0,16,909,2,1.0,1.0,1.0,145,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1835067,19,2,6,4.0,16,909,1,1.0,1.0,1.0,60,0.0
1835068,33,6,10,11.0,16,909,2,1.0,1.0,1.0,56,0.0
1835069,16,6,10,11.0,16,909,2,1.0,1.0,1.0,60,0.0
1835070,7,6,10,11.0,16,909,2,1.0,1.0,1.0,51,0.0


In [13]:
df_raw.drop(columns=['wt']).corr()

Unnamed: 0,age,hhnum,povpct,adjinc,indmort
age,1.0,-0.459896,0.118385,-0.069988,0.340071
hhnum,-0.459896,1.0,-0.187937,0.170595,-0.170868
povpct,0.118385,-0.187937,1.0,0.895185,-0.025596
adjinc,-0.069988,0.170595,0.895185,1.0,-0.09876
indmort,0.340071,-0.170868,-0.025596,-0.09876,1.0


The correlation of the continuous variables in the data indicates that they all have some level of realistic correlation, and none of the explanatory variables are highly correlated enough to be removed. 

In [49]:
df_sample = df_raw.sample(n=100000)

In [101]:
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 

features = SelectKBest(chi2, k=7)
features.fit_transform(df_sample.drop(columns=['indmort', 'wt']), df_sample['indmort'])

array([[ 70.,   2.,  18., ...,  16., 909.,   1.],
       [ 79.,   2.,  18., ...,  16., 909.,   1.],
       [ 34.,   3.,  10., ...,  16., 909.,   1.],
       ...,
       [ 16.,   6.,  10., ...,  16., 909.,   1.],
       [  7.,   6.,  10., ...,  16., 909.,   1.],
       [  6.,   6.,  10., ...,  16., 909.,   1.]])

In [102]:
df_raw.iloc[:, features.get_support(indices=True)]

Unnamed: 0,age,hhnum,povpct,adjinc,stater,pob,race
0,70,2,18,11.0,16,909,1.0
1,79,2,18,11.0,16,909,1.0
2,34,3,10,8.0,16,909,1.0
3,32,3,10,8.0,16,909,1.0
4,2,3,10,8.0,16,909,1.0
...,...,...,...,...,...,...,...
1835067,19,2,6,4.0,16,909,1.0
1835068,33,6,10,11.0,16,909,1.0
1835069,16,6,10,11.0,16,909,1.0
1835070,7,6,10,11.0,16,909,1.0


In [80]:
(pd.crosstab(df_raw.pob, df_raw.indmort)[0.0] / pd.crosstab(df_raw.pob, df_raw.indmort)[1.0]).mean()

23.054359598510917

In [99]:
chi2_val = chi2(df_raw.drop(columns=['wt']), df_raw['indmort'])[0]

In [100]:
chi2_val.

array([2.74283115e+06, 4.56135281e+04, 4.19215392e+03, 2.95360254e+04,
       8.42053161e+03, 3.27541520e+05, 8.10753134e+01, 2.52861225e+02,
       3.40581206e+00, 1.85760425e-02, 1.69427600e+06])

In [None]:
df = pd.get_dummies(df_raw)

In [None]:
X = df.drop(columns=['indmort'])
y = df['indmort']

'''Sampling'''

X_train, X_test, y_train, y_test = train_test_split(X, y)

print('Proportion of data from minority class before SMOTE:', y_train.sum() / y_train.shape[0])
X_train, y_train = SMOTE().fit_resample(X_train, y_train)
print('Proportion of data from minority class after SMOTE:', y_train.sum() / y_train.shape[0])

'''Modeling'''

model = LogisticRegressionCV(scoring='roc_auc', random_state=0, n_jobs=-1, verbose=1).fit(X_train.drop(columns=['wt']), y_train, sample_weight=X_train['wt'])

print(classification_report(model.predict(X_test.drop(columns=['wt'])), y_test))

pred_probs = model.predict_proba(X_test.drop(columns=['wt']))[:, 1]

print(classification_report(np.round(pred_probs + 0.25), y_test, sample_weight=X_test['wt']))