In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score, recall_score
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
%matplotlib inline
import seaborn as sns
from time import time
from scipy import stats
from scipy.stats import randint as sp_randint
from imblearn.pipeline import make_pipeline, Pipeline
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('train_ajEneEa.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 12 columns):
id                   43400 non-null int64
gender               43400 non-null object
age                  43400 non-null float64
hypertension         43400 non-null int64
heart_disease        43400 non-null int64
ever_married         43400 non-null object
work_type            43400 non-null object
Residence_type       43400 non-null object
avg_glucose_level    43400 non-null float64
bmi                  41938 non-null float64
smoking_status       30108 non-null object
stroke               43400 non-null int64
dtypes: float64(3), int64(4), object(5)
memory usage: 4.0+ MB


In [3]:
def clean_data(df):
    
    df.smoking_status.fillna('unknown', inplace = True)
    df.gender = df.gender.map(lambda x: 0 if x == 'Female' else 1)
    df.bmi.fillna(df.bmi.median(), inplace=True)
    df.ever_married= df.ever_married.map(lambda x: 0 if x == 'No' else 1)
    df = pd.get_dummies(df, columns= ['smoking_status','work_type'])
    
    df.Residence_type = df.Residence_type.map(lambda x: 1 if x =='Urban' else 0)
    
    return df


In [4]:
df = clean_data(df)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 19 columns):
id                                43400 non-null int64
gender                            43400 non-null int64
age                               43400 non-null float64
hypertension                      43400 non-null int64
heart_disease                     43400 non-null int64
ever_married                      43400 non-null int64
Residence_type                    43400 non-null int64
avg_glucose_level                 43400 non-null float64
bmi                               43400 non-null float64
stroke                            43400 non-null int64
smoking_status_formerly smoked    43400 non-null uint8
smoking_status_never smoked       43400 non-null uint8
smoking_status_smokes             43400 non-null uint8
smoking_status_unknown            43400 non-null uint8
work_type_Govt_job                43400 non-null uint8
work_type_Never_worked            43400 non-null uint8
work_

In [10]:
df.heart_disease.value_counts()

0    41338
1     2062
Name: heart_disease, dtype: int64

In [8]:
df.smoking_status_unknown.value_counts()

0    30108
1    13292
Name: smoking_status_unknown, dtype: int64

In [6]:
target = df.pop('stroke')

In [7]:
X_train, X_val, y_train, y_val = train_test_split(df.drop(['id'], axis=1), target,
                                                  test_size = .1,
                                                  random_state = 42)


## Smote to oversample minority class, and random forest classifier

In [31]:
#regular smote, and rf this will give me a score of .759
sm = SMOTE(random_state=12, ratio = 'minority', n_jobs=-1)#try different algorithms (borderline1, borderline 2)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)


rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf.fit(X_train_res, y_train_res)


probs = rf.predict_proba(X_val)
predictions = rf.predict(X_val)

roc_auc_score(y_val, probs[:,1]), roc_auc_score(y_val, predictions)

In [41]:
test = pd.read_csv('test_v2akXPA.csv')
test = clean_data(test)

test_probas = rf.predict_proba(test.drop(['id'], axis=1))

In [46]:
submission = pd.DataFrame({'id':test.id,
              'stroke':test_probas[:,1]})
submission.to_csv('probas_submission.csv', index=False, index_label=None)


## SMOTE with svm algorithm to create minority samples, and then random forest classifier

In [None]:
2

In [9]:
sm = SMOTE(ratio='minority', kind='svm', n_jobs=-1)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

In [11]:
## with svm smote algorithm
X_train_borderline, y_train_borderline = sm.fit_sample(X_train, y_train)#variable names need to me changed

rf_borderline = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf_borderline.fit(X_train_borderline, y_train_borderline)

probs = rf_borderline.predict_proba(X_val)
roc_auc_score(y_val, probs[:,1])

#roc_auc_score(y_val, probs[:,0])

0.7478183081043537

In [14]:
test = pd.read_csv('test_v2akXPA.csv')
test = clean_data(test)

test.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,smoking_status_unknown,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,36306,1,80.0,0,0,1,1,83.84,21.1,1,0,0,0,0,0,1,0,0
1,61829,0,74.0,0,1,1,0,179.5,26.0,1,0,0,0,0,0,0,1,0
2,14152,0,14.0,0,0,0,0,95.16,21.2,0,0,0,1,0,0,0,0,1
3,12997,1,28.0,0,0,0,1,94.76,23.4,0,0,0,1,0,0,1,0,0
4,40801,0,63.0,0,0,1,0,83.57,27.6,0,1,0,0,1,0,0,0,0


In [15]:
test_probas = rf_borderline.predict_proba(test.drop(['id'], axis=1))

rf_svm_predictions = pd.DataFrame({'id':test.id,
                         'stroke':test_probas[:,1]})

rf_svm_predictions.to_csv('smote_svm_submission.csv', index=False, index_label=None)

## Lets RandomSearchCv the RandomForest Classifier

In [31]:
clf_rf = RandomForestClassifier(n_jobs=-1)
smote_enn = SMOTEENN(smote = sm)#we created an sm object with svm algorithm
pipeline = Pipeline([('smote_enn', smote_enn),
                     ('clf_rf', clf_rf)])

# specify parameters and distributions to sample from
param_dist = {"clf_rf__n_estimators": sp_randint(10,1000),
              "clf_rf__max_depth": [3, None],
              "clf_rf__max_features": sp_randint(1, 11),
              "clf_rf__min_samples_split": sp_randint(2, 11),
              "clf_rf__min_samples_leaf": sp_randint(1, 11),
              "clf_rf__bootstrap": [True, False],
              "clf_rf__criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(pipeline, param_distributions = param_dist,
                                   n_iter = n_iter_search,
                                   scoring = 'roc_auc' )

start = time()
random_search.fit(df.drop(['id'], axis = 1), target)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
#report(random_search.cv_results_)

RandomizedSearchCV took 1474.22 seconds for 20 candidates parameter settings.


NameError: name 'report' is not defined

In [36]:
#random_search.cv_results_

In [40]:
random_search.best_params_

{'clf_rf__bootstrap': True,
 'clf_rf__criterion': 'entropy',
 'clf_rf__max_depth': 3,
 'clf_rf__max_features': 9,
 'clf_rf__min_samples_leaf': 4,
 'clf_rf__min_samples_split': 3,
 'clf_rf__n_estimators': 829}

In [34]:
test_probas = random_search.predict_proba(test.drop(['id'], axis=1))

rs_predictions = pd.DataFrame({'id':test.id,
                         'stroke':test_probas[:,1]})

rs_predictions.to_csv('randomsearchcv_submission.csv', index=False, index_label=None)

In [35]:
rs_predictions.head()

Unnamed: 0,id,stroke
0,36306,0.681182
1,61829,0.774926
2,14152,0.004214
3,12997,0.005095
4,40801,0.301565


In [33]:
import pickle
with open('randomsearchcv.pickle', 'wb') as handle:
    pickle.dump(random_search, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Lets use XGBoost

In [42]:
from xgboost import XGBClassifier

In [63]:
X_val.values

array([[ 0., 54.,  0., ...,  1.,  0.,  0.],
       [ 0., 19.,  0., ...,  1.,  0.,  0.],
       [ 0., 27.,  0., ...,  1.,  0.,  0.],
       ...,
       [ 1.,  5.,  0., ...,  0.,  0.,  1.],
       [ 1., 34.,  0., ...,  1.,  0.,  0.],
       [ 0., 48.,  0., ...,  1.,  0.,  0.]])

In [64]:
#BEST SCORE SO FAR
xgb_clf = XGBClassifier(n_jobs = -1, )
xgb_clf.fit(X_train_borderline, y_train_borderline)
probas = xgb_clf.predict_proba(X_val.values)
roc_auc_score(y_val.values, probas[:,1])

0.8271470307639504

In [66]:
test_values = test.drop(['id'], axis=1).values
test_probas = xgb_clf.predict_proba(test_values)

xgb_predictions = pd.DataFrame({'id':test.id,
                         'stroke':test_probas[:,1]})

xgb_predictions.to_csv('xgboost_submission.csv', index=False, index_label=None)

In [10]:
np.linspace(.5,.9,10)

array([0.5       , 0.54444444, 0.58888889, 0.63333333, 0.67777778,
       0.72222222, 0.76666667, 0.81111111, 0.85555556, 0.9       ])

In [11]:
#lets put this in a pipeline.

# pipeline = Pipeline([('smote_enn', smote_enn),
#                      ('xgb_clf', xgb_clf)])

# specify parameters and distributions to sample from
# sm = SMOTE(ratio='minority', kind='svm', n_jobs=-1)
# X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

xgb_clf = XGBClassifier(n_jobs = -1)

param_dist = {"n_estimators": sp_randint(100,1000),
              "max_depth": sp_randint(3, 10),
              "learning_rate": stats.uniform(0.01, 0.6),
              "colsample_bytree": np.linspace(.5,.9,10),
              "min_child_weight": sp_randint(1,6) 
             }

n_iter_search = 10
random_search = RandomizedSearchCV(xgb_clf, param_distributions = param_dist,
                                   n_iter = n_iter_search,
                                   scoring = 'roc_auc',
                                   n_jobs=1)

start = time()
random_search.fit(X_train_res, y_train_res)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

RandomizedSearchCV took 527.52 seconds for 10 candidates parameter settings.


In [12]:
random_search.best_params_

{'colsample_bytree': 0.6777777777777778,
 'learning_rate': 0.28090313429809166,
 'max_depth': 9,
 'min_child_weight': 2,
 'n_estimators': 271}

In [15]:
test_values = test.drop(['id'], axis=1).values
test_probas = random_search.predict_proba(test_values)

xgb_predictions = pd.DataFrame({'id':test.id,
                         'stroke':test_probas[:,1]})

xgb_predictions.to_csv('xgbmedian_submission.csv', index=False, index_label=None)

In [None]:
xgb_orig = XGBClassifier()

In [16]:
submission = pd.DataFrame({'id':test.id,
              'stroke':test_probs})

In [21]:
submission.to_csv('ravi_submission.csv', index_label=None, index=False)

In [22]:
test.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,smoking_status_unknown,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
18596,67353,1,20.0,0,0,0,0,74.43,18.4,1,0,0,0,0,0,1,0,0
18597,362,1,61.0,0,0,1,0,211.55,31.6,0,0,1,0,1,0,0,0,0
18598,29839,0,79.0,0,0,1,0,125.74,29.4,0,1,0,0,0,0,1,0,0
18599,6438,1,55.0,0,0,1,0,69.46,33.8,0,1,0,0,1,0,0,0,0
18600,16770,0,38.0,0,0,0,0,91.23,24.4,0,0,0,1,0,0,1,0,0


In [23]:
submission.tail()

Unnamed: 0,id,stroke
18596,67353,0
18597,362,0
18598,29839,0
18599,6438,0
18600,16770,0


In [24]:
probas = rf_borderline.predict_proba(test)

In [41]:
hack_sub = pd.DataFrame({'id':test.id,
                         'stroke':probas[:,1]})
hack_sub.head()

Unnamed: 0,id,stroke
0,36306,0.04
1,61829,0.08
2,14152,0.0
3,12997,0.0
4,40801,0.0


In [42]:
hack_sub.stroke = hack_sub.stroke.map(lambda x: 1 if x >.3 else 0)
hack_sub.stroke.value_counts()

0    18495
1      106
Name: stroke, dtype: int64

In [43]:
hack_sub.to_csv('greaterthan3.csv', index_label=None, index=False)

In [None]:
# try downsampling instead
# try xgboost