In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# Enumerate every file that Kaggle mounted under the input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas FutureWarnings and scikit-learn
# ConvergenceWarnings raised by later cells — consider narrowing the filter.
warnings.simplefilter("ignore")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Relative path to the stroke-prediction CSV inside the Kaggle input directory.
filepath = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"

In [None]:
# Load the raw CSV into a DataFrame; the modelling frame `data` is derived from it later.
original_data = pd.read_csv(filepath)

In [None]:
# Display the loaded frame as the cell output.
original_data

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) as reported by DataFrame.describe().
original_data.describe()

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed. Without `random_state` the row order
# changes on every run, so the later train/test split (and every reported
# score) would not be reproducible under Restart & Run All.
original_data = shuffle(original_data, random_state=42)

In [None]:
# Peek at the first ten rows of the shuffled frame.
original_data.head(10)

In [None]:
# Count missing values per column.
original_data.isnull().sum()

In [None]:
# Impute missing BMI values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the `original_data['bmi']` selection: under pandas
# Copy-on-Write that chained in-place call only updates a temporary object
# and leaves the NaNs in the frame untouched.
bmi_mean = original_data['bmi'].mean()
original_data['bmi'] = original_data['bmi'].fillna(bmi_mean)
original_data.head()

In [None]:
# Build the modelling frame: discard the record identifier and the two
# features that are excluded from the analysis.
dropped_columns = ['id', 'ever_married', 'Residence_type']
data = original_data.drop(columns=dropped_columns)
data.head()


Gender encoding: Male = 1,
Female = 0

In [None]:
# Encode gender numerically (Male -> 1, Female -> 0).
# The replacement is scoped to the `gender` column so that an identical
# string appearing in any other column can never be rewritten by accident.
# Values other than 'Male'/'Female' are left as they are.
gender_codes = {'Male': 1, 'Female': 0}
data = data.assign(gender=data['gender'].replace(gender_codes))
data.head()

In [None]:
# Tally how many rows carry each gender value after the encoding step.
data.gender.value_counts()

Let us fold the rare 'Other' gender category into Male (i.e. encode it as 1).

In [None]:
# Fold the 'Other' gender category into the Male code (1).
# The replacement is scoped to the `gender` column, and the column is cast to
# an integer dtype so it does not remain an `object` column after the last
# string has been replaced.
# NOTE(review): the cast assumes `gender` has no missing values and that
# 'Male'/'Female' were already encoded as 1/0 — confirm.
data = data.assign(gender=data['gender'].replace('Other', 1).astype(int))
data.gender.value_counts()

In [None]:
# Swap each work_type label for the integer code pandas assigns to its category.
data["work_type"] = data["work_type"].astype("category").cat.codes
data.head()

In [None]:
# Swap each smoking_status label for the integer code pandas assigns to its category.
data["smoking_status"] = data["smoking_status"].astype("category").cat.codes
data.head()

In [None]:
# Target: the stroke flag. Features: every remaining column of `data`.
y = data['stroke']
X = data.drop(columns='stroke')

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector.
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test set from the encoded data.
# * `shuffle` must be the boolean True: the string 'True' is rejected by
#   scikit-learn's parameter validation in recent releases, and any non-empty
#   string (even 'False') would be truthy in older ones.
# * `stratify=y` keeps the rare positive class at the same proportion in both
#   splits, which matters because the target is heavily imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=7, shuffle=True, stratify=y
)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.ensemble import RandomForestClassifier

# fit() hands back the estimator itself, so construction and training chain
# into one expression; the last line shows the fitted model as cell output.
random_forest_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
random_forest_model

In [None]:
# Predict labels for the held-out test split.
# NOTE(review): despite its name, `predict_train_data` holds *test-set* predictions.
predict_train_data = random_forest_model.predict(X_test)

In [None]:
# Test-set accuracy of the random forest, reported as a percentage.
acc = accuracy_score(y_true=y_test, y_pred=predict_train_data)
accuracy = acc * 100
print(f"{accuracy}%")

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline on the raw (unscaled) features.
# `max_iter` is raised from the default of 100: on unscaled inputs the solver
# commonly stops before converging, and the resulting ConvergenceWarning is
# invisible here because warnings are globally suppressed.
logistic_reg_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_reg_model.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the logistic-regression model, reported as a percentage.
# NOTE(review): `predict_train_data` actually stores predictions for X_test.
predict_train_data = logistic_reg_model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=predict_train_data)
accuracy = acc * 100
print(f"{accuracy}%")

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 9-nearest-neighbours classifier; fit() returns the estimator, so the last
# line shows the fitted model as cell output.
# NOTE(review): the features are not scaled before this distance-based model.
knn_model = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train)
knn_model

In [None]:
# Test-set accuracy of the KNN model, reported as a percentage.
# NOTE(review): `predict_train_data` actually stores predictions for X_test.
predict_train_data = knn_model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=predict_train_data)
accuracy = acc * 100
print(f"{accuracy}%")

In [None]:
from sklearn import svm

# Linear-kernel support vector classifier; fit() returns the estimator, so
# the last line shows the fitted model as cell output.
svm_model = svm.SVC(kernel='linear').fit(X_train, y_train)
svm_model

In [None]:
# Test-set accuracy of the SVM, reported as a percentage.
# NOTE(review): `predict_train_data` actually stores predictions for X_test.
predict_train_data = svm_model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=predict_train_data)
accuracy = acc * 100
print(f"{accuracy}%")

In [None]:
# Count how many rows fall into each target class to expose the imbalance.
no = y.value_counts()
print(no)

Only 249 records in the dataset are positive for stroke (4.8%).

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

In [None]:
# Confusion matrix for the random forest on the test split.
y_pred = random_forest_model.predict(X_test)
# A bare expression in the middle of a cell is evaluated and thrown away, so
# the raw matrix is printed explicitly; the labelled cross-tabulation (with
# row/column totals) remains the cell's displayed output.
print(confusion_matrix(y_test, y_pred))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:
# Per-class precision / recall / F1 for the random forest on the test split.
# NOTE(review): the `_log` suffix is misleading — these are random-forest predictions.
y_pred_log = random_forest_model.predict(X_test)
cr = classification_report(y_true=y_test, y_pred=y_pred_log)
print(cr)

**This happens when very few records (<5%) are diagnosed positive for stroke, i.e. the target variable is imbalanced.**

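We can use SMOTE (Synthetic Minority Oversampling Technique) to oversample the minority class of the target variable.
Rather than duplicating existing examples, it synthesizes new minority-class examples by interpolating between a minority sample and its nearest minority-class neighbours.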

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the minority class so both target classes are equally frequent.
# A fixed `random_state` makes the synthetic samples (and therefore every
# score computed from them) reproducible across runs.
smote = SMOTE(random_state=42)

x_oversample, y_oversample = smote.fit_resample(X, y)

# Class counts after resampling.
print(y_oversample.value_counts())

Now let's train our models again.

In [None]:
# Split the ORIGINAL data first, then oversample only the training portion.
# Splitting an already-oversampled dataset leaks information: SMOTE builds
# synthetic rows from neighbouring minority samples, so near-copies of
# training rows end up in the test set and inflate every metric. Here the
# test set keeps the real class distribution.
# `shuffle` is passed as the boolean True (the string 'True' is not a valid
# value), and `stratify=y` keeps the rare positive class in both splits.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, shuffle=True, stratify=y, random_state=1
)
X_train, y_train = SMOTE(random_state=1).fit_resample(X_train_raw, y_train_raw)

In [None]:
# Logistic regression trained on the current (SMOTE-based) training split.
# `max_iter` is raised from the default of 100: on unscaled inputs the solver
# commonly stops before converging, and the resulting ConvergenceWarning is
# invisible here because warnings are globally suppressed.
smote_logistic_reg_model = LogisticRegression(random_state=0, max_iter=1000)
smote_logistic_reg_model.fit(X_train, y_train)
# NOTE(review): `predict_train_data` actually stores predictions for X_test.
predict_train_data = smote_logistic_reg_model.predict(X_test)
accuracy_score(y_test, predict_train_data)


In [None]:
# 3-nearest-neighbours classifier trained on the current training split;
# prints its accuracy on the test split as a fraction.
smote_knn_model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
# NOTE(review): `predict_train_data` actually stores predictions for X_test.
predict_train_data = smote_knn_model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=predict_train_data)
print(acc)

In [None]:
# Confusion matrix for the SMOTE-trained KNN model on the test split.
y_pred = smote_knn_model.predict(X_test)
# A bare expression in the middle of a cell is evaluated and thrown away, so
# the raw matrix is printed explicitly; the labelled cross-tabulation (with
# row/column totals) remains the cell's displayed output.
print(confusion_matrix(y_test, y_pred))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:
# Per-class precision / recall / F1 for the SMOTE-trained KNN model.
# NOTE(review): the `_log` suffix is misleading — these are KNN predictions.
y_pred_log = smote_knn_model.predict(X_test)
cr = classification_report(y_true=y_test, y_pred=y_pred_log)
print(cr)

In [None]:
# Random forest trained on the current training split; prints its accuracy
# on the test split as a percentage (without a '%' suffix).
smote_random_forest_model = RandomForestClassifier(random_state=3).fit(X_train, y_train)
# NOTE(review): `predict_train_data` actually stores predictions for X_test.
predict_train_data = smote_random_forest_model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=predict_train_data)
accuracy = acc * 100
print(accuracy)

In [None]:
# Confusion matrix for the SMOTE-trained random forest on the test split.
y_pred = smote_random_forest_model.predict(X_test)
# A bare expression in the middle of a cell is evaluated and thrown away, so
# the raw matrix is printed explicitly; the labelled cross-tabulation (with
# row/column totals) remains the cell's displayed output.
print(confusion_matrix(y_test, y_pred))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:
# Per-class precision / recall / F1 for the SMOTE-trained random forest.
# NOTE(review): the `_log` suffix is misleading — these are random-forest predictions.
y_pred_log = smote_random_forest_model.predict(X_test)
cr = classification_report(y_true=y_test, y_pred=y_pred_log)
print(cr)

**As we can see, the accuracy of the models has dropped after applying SMOTE, but this trade-off is going to give us a much better recall score.**

Let's compare KNN and RandomForest.

In [None]:
# Class-probability estimates on the test split:
# pred_prob1 -> KNN, pred_prob2 -> random forest.
pred_prob1 = smote_knn_model.predict_proba(X_test)
pred_prob2 = smote_random_forest_model.predict_proba(X_test)

# ROC curves computed from the second probability column (index 1), with
# label 1 treated as the positive class.
fpr1, tpr1, thresh1 = roc_curve(y_test, pred_prob1[:, 1], pos_label=1)
fpr2, tpr2, thresh2 = roc_curve(y_test, pred_prob2[:, 1], pos_label=1)

# Baseline curve: one constant score of 0 for every test sample.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

TPR is the y-axis and FPR is the x-axis.
TPR : True Positive Rate
FPR : False Positive Rate

In [None]:
# Area under the ROC curve for each model, scored on the second probability
# column: auc_score1 -> KNN, auc_score2 -> random forest.
auc_score1, auc_score2 = (
    roc_auc_score(y_test, probabilities[:, 1])
    for probabilities in (pred_prob1, pred_prob2)
)

print(auc_score1, auc_score2)

In [None]:
plt.plot(fpr1, tpr1,color='orange', label='KNN')
plt.plot(fpr2, tpr2,color='green', label='RandomForest')