## Devise a stroke prediction model that predicts a patient's chance of stroke with the highest accuracy!

In [1]:
import pickle
from datetime import datetime

import pandas as pd
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Load the stroke dataset from the CSV file (path is relative to the working
# directory) into the DataFrame used by every later cell.
df1 = pd.read_csv(filepath_or_buffer='healthcare-dataset-stroke-data.csv')

In [3]:
# Display the freshly loaded DataFrame to inspect its columns and spot missing values.
df1

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
# Text-valued columns that must be converted to integer codes before modelling.
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

# Fit a separate LabelEncoder per column and keep each one. Re-fitting a single
# shared encoder would overwrite its learned classes on every column, leaving
# no way to inverse_transform the codes or to encode new patient records the
# same way at prediction time.
encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df1[column] = le.fit_transform(df1[column])
    encoders[column] = le

df1

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,51676,0,61.0,0,0,1,3,0,202.21,,2,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,0,80.0,1,0,1,2,1,83.75,,2,0
5106,44873,0,81.0,0,0,1,3,1,125.20,40.0,2,0
5107,19723,0,35.0,0,0,1,3,0,82.99,30.6,2,0
5108,37544,1,51.0,0,0,1,2,0,166.29,25.6,1,0


In [None]:
# Impute missing BMI readings with the column median instead of 0: a BMI of 0
# is not a possible measurement and would sit far outside the observed range,
# distorting the fitted models. Series.median() ignores NaN by default.
df1['bmi'] = df1['bmi'].fillna(df1['bmi'].median())
df1

In [None]:
# Feature matrix: every column of df1 except the 'id' and 'stroke' columns.
x = df1.drop(columns=['id', 'stroke'])
x

In [None]:
# Target: the 'stroke' column, selected with a list so the result stays a
# one-column DataFrame rather than a Series.
y = df1.loc[:, ['stroke']]
y

In [None]:
# Search space for the model comparison. Each entry maps a display name to an
# unfitted estimator ('model') and the candidate hyper-parameter values
# ('params') to sample from.
#
# Parameter values use the scikit-learn >= 1.0 spellings:
#   * LinearRegression no longer accepts 'normalize', so it is not searched.
#   * The regression criteria 'mse' / 'mae' are 'squared_error' / 'absolute_error'.
#   * max_features='auto' has been removed. For the classifiers it meant 'sqrt'
#     (already in the list); for the regressors it meant "all features", which
#     is spelled None.
#
# NOTE(review): the regressors are scored with R^2 by default while the
# classifiers are scored with accuracy, so their 'Best Score' values are not
# directly comparable - confirm whether the regressors belong in this study.
model_params = {
    'LinearRegression': {
        'model': LinearRegression(),
        'params': {
            'fit_intercept': [False, True],
            'copy_X': [False, True],
        },
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_features': ['sqrt', 'log2'],
        },
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10, 20, 50, 100, 150, 200],
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2'],
            'class_weight': ['balanced', 'balanced_subsample'],
        },
    },
    'DecisionTreeRegressor': {
        'model': DecisionTreeRegressor(),
        'params': {
            'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
            'splitter': ['best', 'random'],
            'max_features': [None, 'sqrt', 'log2'],
        },
    },
    'RandomForestRegressor': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [10, 20, 50, 100, 150, 200],
            'criterion': ['squared_error', 'absolute_error'],
            'max_features': [None, 'sqrt', 'log2'],
        },
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': [1e-09, 1e-10, 1e-11, 1e-12],
        },
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [1, 2, 3, 4, 5, 10],
            # Real booleans: the strings 'false' / 'true' are both truthy, so
            # they never actually switched the learned class prior off.
            'fit_prior': [False, True],
        },
    },
}

In [None]:
def timer(start_time=None):
    """Start or stop a simple wall-clock timer.

    Called with no argument (or None), return the current time so it can be
    passed back later. Called with that earlier value, print the time elapsed
    since then as hours, minutes and seconds, and return None.
    """
    if start_time is None:
        return datetime.now()
    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, temp_sec = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(temp_sec, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))


# Run a randomized hyper-parameter search for every candidate model and
# collect the best cross-validated result of each.
scores_rscv = []
for model_name, mp in model_params.items():
    rscv_clf = RandomizedSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        n_iter=15,
        n_jobs=-1,
        verbose=3,
        return_train_score=False,
        # Fixed seed so the sampled parameter combinations are the same on
        # every run of the notebook.
        random_state=42,
    )

    start_time = timer()
    # ravel() flattens the one-column target frame to the 1-D array fit() expects.
    rscv_clf.fit(x, y.values.ravel())
    timer(start_time)

    scores_rscv.append({
        'Model Name': model_name,
        'Best Score': rscv_clf.best_score_,
        'Best Parameter': rscv_clf.best_params_,
        'Best Estimator': rscv_clf.best_estimator_
    })

# None disables column-width truncation so the full parameter dicts are shown
# (the old -1 sentinel is rejected by current pandas).
pd.set_option('display.max_colwidth', None)
result_rscv = pd.DataFrame(scores_rscv, columns=['Model Name', 'Best Score', 'Best Parameter', 'Best Estimator'])
result_rscv

##### And the winner isssssss..... Random Forest Classifier (Expected :P)

In [None]:
# Build the chosen Random Forest configuration (presumably the best parameters
# reported by the search - verify against the results table) and train it on
# the full dataset. Without this fit() call the object saved to disk would be
# an untrained estimator that cannot make predictions.
classifier = RandomForestClassifier(
    n_estimators=200,
    max_features="log2",
    criterion="gini",
    class_weight="balanced",
)
classifier.fit(x, y.values.ravel())

In [None]:
# Saving model to disk.
# The with-block guarantees the file handle is flushed and closed even if
# pickling raises. Only unpickle model.pkl from a trusted source: loading a
# pickle can execute arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)