In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
import time
from imblearn.over_sampling import SMOTE
import warnings

from sklearn.preprocessing import LabelEncoder,RobustScaler,StandardScaler
from sklearn.model_selection import GridSearchCV,StratifiedKFold,cross_val_score,RandomizedSearchCV

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,roc_curve,plot_confusion_matrix,classification_report,confusion_matrix
warnings.filterwarnings('ignore')
pd.pandas.set_option('display.max_columns',None)

In [3]:
df_train=pd.read_csv('train_2v.csv')
df_train['bmi'].fillna(value=np.median(df_train['bmi'].dropna()),inplace=True)

def under18(cols):
    c1=cols[0]
    c2=cols[1]
    if c1<18 and c2:
        return 'never smoked'
    else:
        return cols[1]
    
df_train['smoking_status']=df_train[['age','smoking_status']].apply(under18,axis=1)
df_train.smoking_status.fillna("unknown",inplace=True)

bins=[0,18,40,60,99]
# labels=['age_18','age_18_40','age_40_60','age_60_99']
labels=['children','young_adult','adult','elderly']
df_train['age']=pd.cut(df_train.age,bins=bins,labels=labels)

In [4]:
df_train.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,children,0,0,No,children,Rural,95.12,18.0,never smoked,0
1,30468,Male,adult,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,children,0,0,No,Private,Urban,110.89,17.6,never smoked,0
3,56543,Female,elderly,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,children,0,0,No,Never_worked,Rural,161.28,19.1,never smoked,0


In [5]:
df=df_train.copy()

### 1.approach->creating dummy variables
### 2.approach->Label encode and leave them



In [7]:
dum=[ 'gender', 'age', 'ever_married','work_type', 'Residence_type','smoking_status']
to_dum=pd.get_dummies(df[dum],drop_first=True)
to_dum.head()

df_dum=pd.concat([df,to_dum],axis=1).drop(dum+['id'],axis=1)
df_dum.head()

Unnamed: 0,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,age_young_adult,age_adult,age_elderly,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_never smoked,smoking_status_smokes,smoking_status_unknown
0,0,0,95.12,18.0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
1,1,0,87.96,39.2,0,1,0,0,1,0,1,0,1,0,0,1,1,0,0
2,0,0,110.89,17.6,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0
3,0,0,69.04,35.9,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0
4,0,0,161.28,19.1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0


In [8]:
# le_gen=LabelEncoder()
# le_wt=LabelEncoder()
# le_res=LabelEncoder()
# le_smo=LabelEncoder()
# le_ev=LabelEncoder()
enc=LabelEncoder()
df_en=df.copy()

# df_en['gender']=le_gen.fit_transform(df.gender)
# df_en['work_type']=le_wt.fit_transform(df.work_type)
# df_en['Residence_type']=le_res.fit_transform(df.Residence_type)
# df_en['smoking_status']=le_smo.fit_transform(df.smoking_status)
# df_en['ever_married']=le_smo.fit_transform(df.ever_married)

for col in df.columns:
    if isinstance(df_en[col][0], str):
        df_en[col]=enc.fit_transform(df_en[col])
        


In [9]:
df_num=df[['bmi','avg_glucose_level']]
sc=StandardScaler()
std_sc=sc.fit_transform(df_num)
std_sc=pd.DataFrame(std_sc,columns=['bmi','avg_glucose_level'])

In [10]:
df_dum_sc=pd.concat([df_dum.drop(['bmi','avg_glucose_level'],axis=1),std_sc],axis=1)
# df_dum_sc.drop(['age','bmi','avg_glucose_level'],axis=1,inplace=True)

df_en_sc=pd.concat([df_en.drop(['bmi','avg_glucose_level','id'],axis=1),std_sc],axis=1)
# df_en_sc.drop(['age','bmi','avg_glucose_level'],axis=1,inplace=True)

In [11]:
x_dum=df_dum_sc.drop('stroke',axis=1)
y_dum=df_dum.stroke

x_en=df_en_sc.drop('stroke',axis=1)
y_en=df_en.stroke

In [12]:
x_dum_tr,x_dum_te,y_dum_tr,y_dum_te=train_test_split(x_dum,y_dum,test_size=0.2,random_state=100,stratify=y_dum)
x_en_tr,x_en_te,y_en_tr,y_en_te=train_test_split(x_en,y_en,test_size=0.2,random_state=100,stratify=y_en)

In [13]:
y_dum_te.value_counts(normalize=True)
y_dum_te.value_counts(normalize=True)

y_en_te.value_counts(normalize=True)
y_en_te.value_counts(normalize=True)

0    0.981912
1    0.018088
Name: stroke, dtype: float64

In [14]:
log=LogisticRegression(class_weight='balanced')
model_log=log.fit(x_dum_tr,y_dum_tr)
print('Cut_off at 0.5','\n','*'*75)

print(classification_report(y_dum_te,model_log.predict(x_dum_te),target_names=['No Stroke','Stroke']))

Cut_off at 0.5 
 ***************************************************************************
              precision    recall  f1-score   support

   No Stroke       0.99      0.73      0.84      8523
      Stroke       0.05      0.77      0.09       157

    accuracy                           0.73      8680
   macro avg       0.52      0.75      0.47      8680
weighted avg       0.98      0.73      0.83      8680



In [15]:
log=LogisticRegression(class_weight='balanced')
model_log=log.fit(x_dum_tr,y_dum_tr)
pred=model_log.predict_proba(x_dum_te)[:,1]

f1=[]
cut_off=np.linspace(0.1,0.9,15)
for j in cut_off:
    pred_num=[1 if i>j else 0 for i in pred]
    f1.append(f1_score(y_dum_te,pred_num))
thresh_log=pd.DataFrame({'cut_off':cut_off,
                    'f1_score':f1})   


In [16]:
thresh_log

Unnamed: 0,cut_off,f1_score
0,0.1,0.054922
1,0.157143,0.062079
2,0.214286,0.062551
3,0.271429,0.062982
4,0.328571,0.066108
5,0.385714,0.077843
6,0.442857,0.090846
7,0.5,0.094939
8,0.557143,0.099958
9,0.614286,0.106294


In [17]:
pred_cutoff=[1 if i>0.72 else 0 for i in pred]
print('Cut_off at 0.671 (from above thresh)','\n','*'*75)
print(classification_report(y_dum_te,pred_cutoff))

Cut_off at 0.671 (from above thresh) 
 ***************************************************************************
              precision    recall  f1-score   support

           0       0.99      0.88      0.93      8523
           1       0.07      0.52      0.13       157

    accuracy                           0.87      8680
   macro avg       0.53      0.70      0.53      8680
weighted avg       0.97      0.87      0.92      8680



In [18]:
starcv=StratifiedKFold(n_splits=5,random_state=100)
log_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

log_grid = GridSearchCV(LogisticRegression(class_weight='balanced'), log_params,scoring='roc_auc',cv=starcv)
log_grid.fit(x_dum_tr, y_dum_tr)
log_hyper=log_grid.best_estimator_

In [19]:
cross_val_score(log_hyper,x_dum_tr,y_dum_tr,scoring='roc_auc').mean()

0.8318434209925382

In [20]:
print('Cut_off at 0.5','\n','*'*75)

print(classification_report(y_dum_te,log_hyper.predict(x_dum_te),target_names=['No Stroke','Stroke']))

Cut_off at 0.5 
 ***************************************************************************
              precision    recall  f1-score   support

   No Stroke       0.99      0.73      0.84      8523
      Stroke       0.05      0.77      0.09       157

    accuracy                           0.73      8680
   macro avg       0.52      0.75      0.47      8680
weighted avg       0.98      0.73      0.83      8680

