In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.pipeline import Pipeline
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory into a DataFrame.
stroke_data = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

# Exploratory Analysis

In [None]:
# Summary statistics for the numeric columns of the raw data.
stroke_data.describe()

The minimum age is 0.08, which should be checked further during preprocessing.

In [None]:
# Use the `id` column as the row index.
# The guard keeps this cell idempotent: once `id` has become the index it is no longer
# a column, and calling set_index('id') again would raise a KeyError.
if 'id' in stroke_data.columns:
    stroke_data = stroke_data.set_index('id')

In [None]:
# Bi-variate analysis: pairwise plots of the numeric columns, coloured by stroke outcome.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer accept the old names, so use whichever name is available here.
dark_style = 'seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark'
plt.style.use(dark_style)
sns.pairplot(stroke_data, hue='stroke', palette='Dark2');
plt.tight_layout()

There is evidence of an interrelationship among age, avg_glucose_level and bmi. At a given bmi, the higher the age, the more cases of stroke there are. At higher ages, there are more cases of stroke among people with a higher average glucose level.

In [None]:
# Count the rows in each stroke class and lay the counts out as a two-column frame.
stroke_counts = stroke_data['stroke'].value_counts()
Stroke_plot = stroke_counts.reset_index()
Stroke_plot.columns = ['stroke', 'count']

# Pie chart of the class split.
px.pie(
    Stroke_plot,
    names='stroke',
    values='count',
    title='Stroke',
    template='plotly',
)

## Correlation

In [None]:
# Annotated heatmap of the Pearson correlation between the three numeric features.
plt.figure(figsize=(10, 6))
sns.set_context(context='notebook', font_scale=1.2)
numeric_features = ['age', 'avg_glucose_level', 'bmi']
pearson_corr = stroke_data[numeric_features].corr(method='pearson')
sns.heatmap(pearson_corr, annot=True, cmap='Blues');
plt.tight_layout()

# Preprocessing

In [None]:
# Display every row that has a missing value in at least one column.
has_missing = stroke_data.isna().any(axis='columns')
stroke_data.loc[has_missing]

In [None]:
# Discard every row that contains a missing value.
stroke_data = stroke_data.dropna()

In [None]:
# Inspect the records with an age below one year.
# All of them belong to children, so the fractional ages look correct.
under_one_year = stroke_data['age'] < 1
stroke_data.loc[under_one_year]

In [None]:
# Separate the target column from the features.
# The target is selected by name rather than by position so the split does not
# silently pick the wrong column if the column order ever changes.
y = stroke_data['stroke']
x = stroke_data.drop(columns='stroke')

In [None]:
# One-hot encode the object-dtype (categorical) columns so that every feature is numeric.
cat_data = x.select_dtypes(include='object').copy()
columns = cat_data.columns
x = pd.get_dummies(data=x, columns=columns)

In [None]:
# Balance the classes in two stages: SMOTE oversamples the minority class first,
# then random undersampling reduces the majority class.
# Both samplers are seeded so that "Restart & Run All" reproduces the same resampled data.
over = SMOTE(sampling_strategy=0.1, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# NOTE(review): the resampling is applied to the full dataset, before the train/test
# split done later in the notebook, so synthetic samples also end up in the evaluation
# data and the reported scores are likely optimistic.
x_ros, y_ros = pipeline.fit_resample(x, y)
# The resultant classes are now in 2:1 ratio, from earlier 95:5 ratio

In [None]:
# Number of rows per class after resampling.
y_ros.value_counts()

# Data split and normalization

We will be training 3 models on the data set obtained after balancing the classes

In [None]:
# Split the resampled data into train and test sets.
# `stratify` keeps the class ratio identical in both parts, which matters because the
# classes are still imbalanced (2:1) after resampling.
# NOTE(review): test_size=0.6 holds out 60% of the rows for testing and trains on only
# 40% - confirm this is intended.
x_ros_train, x_ros_test, y_ros_train, y_ros_test = train_test_split(
    x_ros, y_ros, test_size=0.6, random_state=42, stratify=y_ros
)

In [None]:
# Standardize the features.
# The scaler is fitted on the training data only and the same fitted mean/std is then
# applied to the test data. Refitting on the test set would scale the two sets
# differently and let test-set statistics influence the evaluation.
scaler = StandardScaler()
x_ros_train_scaled = scaler.fit_transform(x_ros_train)
x_ros_test_scaled = scaler.transform(x_ros_test)

# Fitting the models and comparing the results

In [None]:
from sklearn import metrics

# Logistic regression trained on the standardized training features.
LR = LogisticRegression()
LR.fit(x_ros_train_scaled, y_ros_train)

# Predicted class labels for both splits.
predict_train_LR = LR.predict(x_ros_train_scaled)
predict_test_LR = LR.predict(x_ros_test_scaled)

# Accuracy on each split.
LR_train_score = LR.score(x_ros_train_scaled, y_ros_train)
LR_test_score = LR.score(x_ros_test_scaled, y_ros_test)

# F1 and recall on the test split.
LR_recall = metrics.recall_score(y_ros_test, predict_test_LR)
LR_f1_score = metrics.f1_score(y_ros_test, predict_test_LR)

# Report the headline numbers followed by the per-class breakdown.
print('Accuracy on Train set', LR_train_score)
print('Accuracy on Test set', LR_test_score)
print('F1-score on Test set:', LR_f1_score)
print('\n')
LR_report = metrics.classification_report(y_ros_test, predict_test_LR)
print(LR_report)

In [None]:
# Support vector classifier (default settings) trained on the standardized features.
SVM = svm.SVC()
SVM.fit(x_ros_train_scaled, y_ros_train)

# Predicted class labels for both splits.
predict_train_SVM = SVM.predict(x_ros_train_scaled)
predict_test_SVM = SVM.predict(x_ros_test_scaled)

# Accuracy on each split.
SVM_train_score = SVM.score(x_ros_train_scaled, y_ros_train)
SVM_test_score = SVM.score(x_ros_test_scaled, y_ros_test)

# F1 and recall on the test split.
SVM_recall = metrics.recall_score(y_ros_test, predict_test_SVM)
SVM_f1_score = metrics.f1_score(y_ros_test, predict_test_SVM)

# Report the headline numbers followed by the per-class breakdown.
print('Accuracy on Train set', SVM_train_score)
print('Accuracy on Test set', SVM_test_score)
print('F1-score on Test set:', SVM_f1_score)
print('\n')
SVM_report = metrics.classification_report(y_ros_test, predict_test_SVM)
print(SVM_report)

In [None]:
# Random forest with default hyper-parameters, trained on the standardized features.
# The estimator is seeded (matching the random_state=42 used for the data split) so the
# scores below are reproducible across runs.
RF = RandomForestClassifier(random_state=42).fit(x_ros_train_scaled, y_ros_train)

# Predicted class labels for both splits.
predict_train_RF = RF.predict(x_ros_train_scaled)
predict_test_RF = RF.predict(x_ros_test_scaled)

# Accuracy on each split.
RF_train_score = RF.score(x_ros_train_scaled, y_ros_train)
RF_test_score = RF.score(x_ros_test_scaled, y_ros_test)

# F1 and recall on the test split.
RF_f1_score = metrics.f1_score(y_ros_test, predict_test_RF)
RF_recall = metrics.recall_score(y_ros_test, predict_test_RF)


# Show the hyper-parameters in effect, then the scores and the per-class breakdown.
print(RF.get_params())
print('Accuracy on Train set', RF_train_score)
print('Accuracy on Test set', RF_test_score)
print('F1-score on Test set:', RF_f1_score)
print('\n')
print(metrics.classification_report(y_ros_test, predict_test_RF))

# Model tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter search space for the random forest.
parameters = {
    'n_estimators': [8, 32, 64, 100, 200],
    'max_depth': [10, 12],
    'max_features': [5, 8, 10],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
}

# Seed the estimator as well as the search: with an unseeded forest the
# cross-validation scores, and therefore the selected parameters, change between runs.
rf = RandomForestClassifier(random_state=42)

# Randomized search: 20 sampled parameter combinations, each scored with 3-fold CV.
rf_model_tune = RandomizedSearchCV(
    rf,
    param_distributions=parameters,
    cv=3,
    n_iter=20,
    verbose=2,
    random_state=42,
)

# The search is run on the unscaled training features.
rf_model_tune.fit(x_ros_train, y_ros_train)

In [None]:
# Parameter combination that achieved the best cross-validation score in the search.
rf_model_tune.best_params_

In [None]:

# Build the tuned forest from the parameters selected by the randomized search instead
# of hard-coding them, so this cell cannot drift out of sync with the search result.
# (The hard-coded version also set bootstrap=False, which was never part of the search
# space.) The estimator is seeded so the reported scores are reproducible.
RF_model = RandomForestClassifier(**rf_model_tune.best_params_, random_state=42)

# Fit on the unscaled training features, the same data the search was run on.
RF_model.fit(x_ros_train, y_ros_train)

# Predicted class labels for both splits.
predict_train_RF = RF_model.predict(x_ros_train)
predict_test_RF = RF_model.predict(x_ros_test)

# Accuracy on each split.
RF_train_score = RF_model.score(x_ros_train, y_ros_train)
RF_test_score = RF_model.score(x_ros_test, y_ros_test)

# F1 and recall on the test split. These overwrite the RF_* values from the untuned forest.
RF_f1_score = metrics.f1_score(y_ros_test, predict_test_RF)
RF_recall = metrics.recall_score(y_ros_test, predict_test_RF)
print('Accuracy on Train set', RF_train_score)
print('Accuracy on Test set', RF_test_score)
print('F1-score on Test set:', RF_f1_score)
print(metrics.classification_report(y_ros_test, predict_test_RF))


In [None]:
# Feature importances from a random forest *classifier*: the target is a class label,
# so a regressor is the wrong estimator for it. A separate, seeded model with its own
# name is used so the classifier stored in `RF` is not overwritten.
rf_importance_model = RandomForestClassifier(random_state=42)
rf_importance_model.fit(x_ros_train_scaled, y_ros_train)
importance = rf_importance_model.feature_importances_

# The scaled training data carries no column labels, so names come from x_ros.columns.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(x_ros.columns, importance)
ax.set(title='Random forest feature importance', xlabel='Importance', ylabel='Feature');

# Models comparison

In [None]:
# One row per model with its test F1, recall, and train/test accuracy.
# The RF_* values are whatever the most recently executed random-forest cell assigned.
comparison_rows = [
    ('LogisticRegression', LR_f1_score, LR_recall, LR_train_score, LR_test_score),
    ('Support Vector Machine', SVM_f1_score, SVM_recall, SVM_train_score, SVM_test_score),
    ('RandomForestClassifier', RF_f1_score, RF_recall, RF_train_score, RF_test_score),
]
comparison_columns = ['Models', 'f1_score', 'recall', 'Accuracy on train set', 'Accuracy on test set']

# Rank the models by recall, best first.
model_compare = pd.DataFrame(comparison_rows, columns=comparison_columns)
model_compare = model_compare.sort_values(by='recall', ascending=False)


In [None]:
# Render the comparison table with a green background gradient applied to the values.
model_compare.style.background_gradient(cmap='Greens')