In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the stroke-prediction CSV into the working frame and preview it.
DATA_PATH = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Column dtypes and non-null counts, to spot columns with missing values.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Frequency of each smoking_status category.
df['smoking_status'].value_counts()

# Replacing Unknown with never smoked in smoking_status column

In [None]:
# Fold the 'Unknown' category into 'never smoked'.
df['smoking_status'] = df['smoking_status'].replace({'Unknown': 'never smoked'})

In [None]:
# Frequency of each work_type category.
df['work_type'].value_counts()

In [None]:
# List the column names available in the frame.
df.columns

In [None]:
# Frequency of each gender category.
df['gender'].value_counts()

# Replacing Other with Male in gender column

In [None]:
# Fold the 'Other' category into 'Male'.
df['gender'] = df['gender'].replace({'Other': 'Male'})

## EDA

In [None]:
# Glucose level vs. BMI, with points coloured by the stroke label.
fig, ax = plt.subplots(figsize=(18, 12))
sns.scatterplot(data=df, x='avg_glucose_level', y='bmi', hue='stroke', ax=ax)

In [None]:
# Glucose level vs. BMI with every point coloured by age; the colorbar is the
# key for the colour scale.
ax = df.plot(kind="scatter", x="avg_glucose_level", y="bmi", alpha=0.7, figsize=(18,12),
             c="age", cmap=plt.get_cmap("jet"), colorbar=True)
# Label both axes explicitly so the figure is readable on its own.
ax.set_xlabel('avg_glucose_level')
ax.set_ylabel('bmi')
# plt.legend() was removed: no artist in this plot carries a label, so the call
# only produced a "no artists with labels" warning and an empty legend.
plt.show()

In [None]:
# plt.subplots(figsize=(18,12))
# Apply the seaborn theme and set the default matplotlib figure size; this is a
# global setting and stays in effect for the cells that follow.
sns.set(rc={'figure.figsize':(18,12)})

# Bivariate KDE of glucose level vs. BMI, coloured by stroke, one panel per gender.
# NOTE(review): displot is a figure-level function, so its size is presumably
# controlled by height/aspect rather than the rc figure.figsize above — confirm.
sns.displot(df, x="avg_glucose_level", y="bmi", hue="stroke", kind="kde",col='gender')

In [None]:
# Reshape the numeric features to long form (one row per feature/value pair,
# keyed by the stroke label) so a single violin plot can compare them.
violin_cols = ['age', 'avg_glucose_level', 'bmi', 'stroke']
df_plt = df[violin_cols]
data = df_plt.melt(id_vars="stroke", var_name="features", value_name='value')

plt.figure(figsize=(12, 12))
sns.violinplot(data=data, x='features', y='value', hue='stroke', split=True, inner="quart")
plt.xticks(rotation=90)

In [None]:
# BMI distribution split by stroke, one column of panels per smoking status,
# for three different categorical x-axes. The shared options live in one dict.
violin_kwargs = dict(data=df, kind='violin', hue='stroke',
                     col='smoking_status', split=True, inner="quart")
sns.catplot(x="hypertension", y="bmi", **violin_kwargs)
sns.catplot(x="heart_disease", y="bmi", **violin_kwargs)
sns.catplot(x="work_type", y="bmi", **violin_kwargs)

In [None]:
# Distinct smoking_status categories remaining after the replacement above.
df['smoking_status'].unique()

In [None]:
# Average glucose level split by stroke, one column of panels per smoking
# status, for three different categorical x-axes.
violin_kwargs = dict(data=df, kind='violin', hue='stroke',
                     col='smoking_status', split=True, inner="quart")
sns.catplot(x="hypertension", y="avg_glucose_level", **violin_kwargs)
sns.catplot(x="heart_disease", y="avg_glucose_level", **violin_kwargs)
sns.catplot(x="work_type", y="avg_glucose_level", **violin_kwargs)

In [None]:
# Violin plots split by stroke, one column of panels per smoking status.
# NOTE(review): the two work_type plots repeat x/y combinations that were
# already drawn in earlier cells; they are kept here unchanged.
violin_kwargs = dict(data=df, kind='violin', hue='stroke',
                     col='smoking_status', split=True, inner="quart")
sns.catplot(x="Residence_type", y="bmi", **violin_kwargs)
sns.catplot(x="work_type", y="avg_glucose_level", **violin_kwargs)
sns.catplot(x="gender", y="avg_glucose_level", **violin_kwargs)
sns.catplot(x="work_type", y="bmi", **violin_kwargs)

In [None]:
# Impute missing BMI values with the column mean (NaNs are skipped by mean()).
# The result is assigned back to the column: calling fillna(inplace=True) on
# `df.bmi` operates on an intermediate Series, which pandas warns about and,
# with Copy-on-Write enabled, never writes back into `df`.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
df.info()

In [None]:
# Remove the `id` column (presumably just a row identifier — it is not used as
# a feature anywhere below). errors='ignore' keeps the cell re-runnable:
# the in-place drop raised KeyError on a second execution.
df = df.drop(columns='id', errors='ignore')
df.head()

## Getting dummies for categorical columns

In [None]:
# One-hot encode the multi-category string columns; other columns pass through.
categorical_cols = ['Residence_type', 'work_type', 'smoking_status', 'gender']
df_one = pd.get_dummies(df, columns=categorical_cols)
df_one.head()

In [None]:
# Encode ever_married as 1 (Yes) / 0 (No).
# An explicit map + astype(int) is used instead of replace({...}) so the column
# ends up with an integer dtype without relying on replace() silently
# downcasting an object column (deprecated in recent pandas).
# The 1/0 keys make the cell safe to re-run on already-encoded values; any
# unexpected value becomes NaN and makes astype(int) fail loudly.
df_one['ever_married'] = (
    df_one['ever_married']
    .map({'Yes': 1, 'No': 0, 1: 1, 0: 0})
    .astype(int)
)
# Preview only the first rows rather than dumping the whole frame.
df_one.head()

In [None]:
# Histograms of the three numeric features before scaling, side by side.
f, axs = plt.subplots(1, 3, figsize=(15, 8))
for panel, column in zip(axs, ['age', 'bmi', 'avg_glucose_level']):
    panel.hist(df_one[column])
    panel.set_xlabel(column)

In [None]:
# Preview the first rows of the encoded frame.
df_one.head()

In [None]:
# Keep an independent copy of the encoded, still-unscaled frame; `df_one`
# itself is modified by later cells.
dummy = df_one.copy()

## Feature scaling for numerical data

In [None]:
from sklearn import preprocessing

# Min-max scale the three numeric features.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down, so test-set statistics leak into the
# scaling. Consider fitting on the training rows only.
df_trans = df_one[['avg_glucose_level','bmi','age']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(df_trans)
# Carry over the original column names and row index so the later
# pd.concat(..., axis=1) with `df_one` aligns rows by label rather than by
# the accident of both frames having a default RangeIndex.
df_normalized = pd.DataFrame(x_scaled, columns=df_trans.columns, index=df_trans.index)
df_normalized.head()

In [None]:
# Histograms of the three numeric features after min-max scaling.
f, axs = plt.subplots(1, 3, figsize=(15, 8))
for panel, column in zip(axs, ['age', 'bmi', 'avg_glucose_level']):
    panel.hist(df_normalized[column])
    panel.set_xlabel(column)

In [None]:
# Summary statistics of the scaled features.
df_normalized.describe()

In [None]:
# Remove the target column from the unscaled copy, leaving features only.
dummy.drop(columns=['stroke'], inplace=True)
dummy.head()


In [None]:
# Drop the raw numeric columns from the encoded frame.
unscaled_cols = ['avg_glucose_level', 'bmi', 'age']
df_one.drop(columns=unscaled_cols, inplace=True)
df_one.head()

In [None]:
# Assemble the modelling table from the encoded columns and the scaled
# numeric columns, then separate the target from the features.
X = pd.concat([df_one,df_normalized],axis=1)
y = X[['stroke']]
# DataFrame.drop returns a new frame; the result must be assigned back.
# Previously it was discarded, so `stroke` stayed in X and the model was
# trained with the label as one of its features (target leakage).
X = X.drop('stroke',axis=1)
assert 'stroke' not in X.columns, "target column leaked into the feature matrix"
X.head()

In [None]:
# Display the target column.
y

In [None]:
# Class balance of the target over the whole dataset.
y.value_counts()

## Training the model with RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split

# split data train 70 % and test 30 %
# stratify=y keeps the stroke / no-stroke ratio the same in both splits.
# The positive class is presumably a small minority (see y.value_counts()),
# so an unstratified split can leave the test set with too few positives
# for meaningful precision/recall figures.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Class balance within the held-out test split.
y_test.value_counts()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random-forest classifier and predict labels for the test split.
# random_state pins the forest's randomness so results are reproducible
# across "Restart & Run All".
rf = RandomForestClassifier(random_state=42)  # alternative tried: n_estimators=40, max_depth=7
# y_train is a single-column frame; flatten it to 1-D, which is the shape
# fit() expects (a column vector triggers a DataConversionWarning).
rf.fit(x_train, np.ravel(y_train))
y_pred = rf.predict(x_test)


## Confusion Matrix

In [None]:
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score

# Confusion matrix (rows: true label, columns: predicted label) and overall
# accuracy of the test-set predictions.
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print('Accuracy is: ', ac)
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Report both metrics explicitly: a notebook cell only displays its last
# expression, so the recall value used to be computed and then thrown away.
# The existing test-set predictions (`y_pred`) are reused instead of calling
# rf.predict(x_test) again for every metric.
print('Recall:   ', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
# f1_score(y_test, y_pred)

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# "No skill" baseline: a constant score of 0 for every test sample.
ns_probs = [0 for _ in range(len(y_test))]
# Forest's predicted probability for the class at index 1 of rf.classes_
# (the stroke class when labels are 0/1). The `lr_` prefix is kept only so
# the variable names stay unchanged; the scores come from the random forest.
lr_probs = rf.predict_proba(x_test)[:, 1]

ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)

print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('RFC: ROC AUC=%.3f' % (lr_auc))

# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the baseline and the model
# (legend label typo "Classifer" corrected)
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='Dummy Classifier')
plt.plot(lr_fpr, lr_tpr, marker='.', label='RF')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

# Score the held-out test split, not the training split: the forest was
# fitted on x_train, so its scores there are over-optimistic and say little
# about how precision and recall trade off on unseen data.
y_scores = rf.predict_proba(x_test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores)

# print(precisions, recalls)

def plot_prc(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last element of ``precisions`` and ``recalls`` is dropped before
    plotting, so both are expected to be one element longer than
    ``thresholds``. Draws on the current matplotlib axes and returns None.
    """
    curves = (
        (precisions, 'b--', 'Precision'),
        (recalls, 'g-', 'Recall'),
    )
    for values, line_style, curve_label in curves:
        plt.plot(thresholds, values[:-1], line_style, label=curve_label)
    plt.xlabel('Thresholds')
    plt.legend(loc='center left')
    plt.ylim([0,1])

# Draw the precision / recall vs. threshold curves computed above.
plot_prc(precisions, recalls, thresholds)

## You can also test the accuracy using the dataframe that was not feature-scaled (`dummy`); the accuracy falls

In [None]:
# Preview the unscaled feature frame (the target column was dropped from it earlier).
dummy.head()