In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.figure_factory as ff

In [None]:
# Load the stroke dataset CSV into `df` (the original line had a duplicated `df = df =` target).
df = pd.read_csv(r'../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

## About the data

- id: unique identifier 
- gender: "Male", "Female" or "Other"
- age: age of the patient
- hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
- heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
- ever_married: "No" or "Yes"
- work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
- Residence_type: "Rural" or "Urban"
- avg_glucose_level: average glucose level in blood
- bmi: body mass index
- smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
- stroke: 1 if the patient had a stroke or 0 if not
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient



In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns
df.describe()

- the id column is irrelevant
- the average age is 43
- the average glucose level is 106
- the minimum age is questionable

### checking for null values

In [None]:
# Number of missing values in each column
df.isna().sum()

### Creating a heatmap

In [None]:
# Heatmap of the boolean null mask: one row per record, one column per feature.
null_mask = df.isnull()
plt.figure(figsize=(15, 8))
sns.heatmap(null_mask, yticklabels=False, cmap='Blues')

### id column is not required

In [None]:
# Drop the identifier column. `errors='ignore'` keeps the cell idempotent: the
# original in-place drop raised KeyError if this cell was executed a second time.
df = df.drop(columns='id', errors='ignore')

### filling NaN values in bmi by its mean

In [None]:
# Replace missing bmi values with the mean of the observed bmi values.
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)

In [None]:
# Re-draw the null-mask heatmap after the bmi fill.
null_mask_after_fill = df.isnull()
plt.figure(figsize=(15, 8))
sns.heatmap(null_mask_after_fill, yticklabels=False, cmap='viridis')

### One more way to check for missing values is to use the missingno library

In [None]:
import missingno as msno

# missingno nullity matrix of the whole frame
msno.matrix(df)

###  The bar chart gives you an idea about how many missing values are there in each column

In [None]:
# missingno bar chart of per-column completeness, sorted in descending order
msno.bar(df, sort = 'descending')

### understanding the data distribution using pairplot

In [None]:
import seaborn as sns  # already imported in the first cell; repeated here
# Pairwise relationships between the columns of df, coloured by the `stroke` column
sns.pairplot(data=df, hue= 'stroke' )

### Categorical plotting

In [None]:
# One count plot per categorical column, laid out on a 4x2 grid.
# A loop replaces eight copy-pasted calls; `ax.flatten()` walks the grid row by
# row, so each column lands on the same subplot as before.
count_columns = ['gender', 'hypertension', 'heart_disease', 'ever_married',
                 'work_type', 'Residence_type', 'smoking_status', 'stroke']
fig, ax = plt.subplots(4,2, figsize = (15,15))
plt.suptitle("Count plots for categorical values")
for panel, column in zip(ax.flatten(), count_columns):
    sns.countplot(ax=panel, x=column, data=df)


### Categorical plotting by stroke outcome

In [None]:
# Record counts per work type, split by stroke outcome
sns.catplot(
    data=df,
    y="work_type",
    hue="stroke",
    kind="count",
    palette="Blues",
)

In [None]:
# Record counts per residence type, split by stroke outcome
sns.catplot(
    data=df,
    y="Residence_type",
    hue="stroke",
    kind="count",
    palette="Blues",
)

In [None]:
# Record counts per smoking status, split by stroke outcome
sns.catplot(
    data=df,
    y="smoking_status",
    hue="stroke",
    kind="count",
    palette="Blues",
)

In [None]:
# Record counts per heart-disease flag, split by stroke outcome
sns.catplot(
    data=df,
    y="heart_disease",
    hue="stroke",
    kind="count",
    palette="Blues",
)

In [None]:
# Histogram (with KDE overlay) of each numeric column on a 2x2 grid.
# A loop replaces four copy-pasted calls. The commented-out title was removed
# because it described count plots; a title matching this figure is set instead.
hist_columns = ['age', 'avg_glucose_level', 'bmi', 'stroke']
fig, ax = plt.subplots(2,2, figsize = (15,15))
fig.suptitle("Distributions of numeric columns")
for panel, column in zip(ax.flatten(), hist_columns):
    sns.histplot(ax=panel, x=column, kde=True, data=df)


### dropping other from gender

In [None]:
# Remove, in place, every row whose gender is 'Other', then show the remaining categories.
other_gender_rows = df.index[df['gender'] == 'Other']
df.drop(index=other_gender_rows, inplace=True)
df['gender'].unique()

### checking for outliers

In [None]:
# Box plot of avg_glucose_level to eyeball outliers
plt.figure(figsize=(10, 4))
sns.set_theme(style='whitegrid')
glucose_levels = df['avg_glucose_level']
sns.boxplot(x=glucose_levels, palette='Pastel2')


In [None]:
# Box plot of bmi to eyeball outliers
plt.figure(figsize=(10, 4))
sns.set_theme(style='whitegrid')
bmi_values = df['bmi']
sns.boxplot(x=bmi_values, palette='Pastel1')

### encoding our categorical variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode ONLY the non-numeric columns.
# The original `df.apply(le.fit_transform)` ran the encoder over every column, so
# the continuous age / avg_glucose_level / bmi values were replaced by the index
# of each value among the sorted unique values, discarding their magnitudes.
le = LabelEncoder()
categorical_cols = df.select_dtypes(exclude='number').columns
df_encoded = df.copy()  # leave the raw frame untouched
for col in categorical_cols:
    df_encoded[col] = le.fit_transform(df_encoded[col])
df_encoded.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# All predictor columns (kept for reference; not used by this cell).
features = ['gender','age','hypertension','heart_disease','ever_married', 'work_type', 'Residence_type', 'avg_glucose_level','bmi', 'smoking_status']
# Columns that are standardised to zero mean / unit variance.
ft_to_scale = ['age', 'work_type', 'avg_glucose_level', 'bmi', 'smoking_status']

# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# scaling statistics. Consider fitting on the training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[ft_to_scale])
df_encoded[ft_to_scale] = scaled_values

In [None]:
# Preview the encoded frame after scaling
df_encoded.head()

In [None]:
# NOTE: dead code, never executed. Per-column label encoding kept for reference only.
# df_encoded['gender'] = le.fit_transform(df_encoded['gender'])
# df_encoded['ever_married'] = le.fit_transform(df_encoded['ever_married'])
# df_encoded['work_type'] = le.fit_transform(df_encoded['work_type'])
# df_encoded['Residence_type'] = le.fit_transform(df_encoded['Residence_type'])
# df_encoded['smoking_status'] = le.fit_transform(df_encoded['smoking_status'])

In [None]:
# Preview the encoded frame (unchanged since the previous preview: the cell in between is commented out)
df_encoded.head()

In [None]:
# Preview the un-encoded frame for comparison
df.head()

### Correlation

In [None]:
plt.figure(figsize = (20,10))
# `df` still contains string columns, and pandas >= 2.0 raises ValueError when
# DataFrame.corr() meets them, so the numeric columns are selected explicitly.
corr = df.select_dtypes(include='number').corr()
# Both axes are labelled with the column names; the original passed the whole
# `corr` DataFrame as `xticklabels`.
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap= 'Blues')

In [None]:
# Summary statistics of the encoded (and partly scaled) frame
df_encoded.describe()

### Splitting and resampling the data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target vector is the `stroke` column; the feature matrix is everything else.
y = df_encoded['stroke']
X = df_encoded.drop(columns='stroke')

In [None]:
# 70/30 split. `stratify=y` keeps the stroke rate the same in both partitions;
# with a rare positive class an unstratified split can leave the test set with
# too few positives for the reported metrics to be meaningful.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

### Reason for resampling

- the number of people not having a stroke is much larger than the number of people having a stroke
- if we don't resample the data, our model will tend to predict that no case has a stroke
- to overcome this we either undersample or oversample

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the training split only; the test split is left untouched.
# NOTE(review): SMOTE interpolates between rows, so label-encoded categorical
# columns can take non-integer values in the synthetic samples. Confirm this is acceptable.
sm = SMOTE(random_state=2)
resampled = sm.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

In [None]:
# Class counts in the training target before resampling
y_train.value_counts()

In [None]:
# Class counts in the training target after resampling
y_train_res.value_counts()

## Implementing various algorithms

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the resampled training data and evaluated on the
# untouched test split. `random_state` is fixed so the scores are reproducible
# across runs; the split and the resampler are already seeded, the model was not.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train_res,y_train_res)
rfc_pred = rfc.predict(X_test)
print("Train score: {}" .format(rfc.score(X_train_res,y_train_res)))
print("Accuracy score: {}" .format(accuracy_score(y_test,rfc_pred)))
print(classification_report(y_test, rfc_pred))
sns.heatmap(confusion_matrix(y_test,rfc_pred), annot = True, cmap='Blues',fmt = 'd')

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression: fit on the resampled training data, evaluate on the test split.
lr = LogisticRegression()
lr.fit(X_train_res, y_train_res)
lr_pred = lr.predict(X_test)

lr_train_score = lr.score(X_train_res, y_train_res)
lr_test_accuracy = accuracy_score(y_test, lr_pred)
print("Train score: {}".format(lr_train_score))
print("Accuracy score: {}".format(lr_test_accuracy))
print(classification_report(y_test, lr_pred))

lr_cm = confusion_matrix(y_test, lr_pred)
sns.heatmap(lr_cm, annot=True, cmap='Blues', fmt='d')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours: fit on the resampled training data, evaluate on the test split.
knn = KNeighborsClassifier()
knn.fit(X_train_res, y_train_res)
knn_pred = knn.predict(X_test)

knn_train_score = knn.score(X_train_res, y_train_res)
knn_test_accuracy = accuracy_score(y_test, knn_pred)
print("Train score: {}".format(knn_train_score))
print("Accuracy score: {}".format(knn_test_accuracy))
print(classification_report(y_test, knn_pred))

knn_cm = confusion_matrix(y_test, knn_pred)
sns.heatmap(knn_cm, annot=True, cmap='Blues', fmt='d')

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes: fit on the resampled training data, evaluate on the test split.
gnb = GaussianNB()
gnb.fit(X_train_res, y_train_res)
gnb_pred = gnb.predict(X_test)

gnb_train_score = gnb.score(X_train_res, y_train_res)
gnb_test_accuracy = accuracy_score(y_test, gnb_pred)
print("Train score: {}".format(gnb_train_score))
print("Accuracy score: {}".format(gnb_test_accuracy))
print(classification_report(y_test, gnb_pred))

gnb_cm = confusion_matrix(y_test, gnb_pred)
sns.heatmap(gnb_cm, annot=True, cmap='Blues', fmt='d')

In [None]:
from sklearn.svm import SVC

# Support-vector classifier: fit on the resampled training data, evaluate on the test split.
svc = SVC()
svc.fit(X_train_res, y_train_res)
svc_pred = svc.predict(X_test)

svc_train_score = svc.score(X_train_res, y_train_res)
svc_test_accuracy = accuracy_score(y_test, svc_pred)
print("Train score: {}".format(svc_train_score))
print("Accuracy score: {}".format(svc_test_accuracy))
print(classification_report(y_test, svc_pred))

svc_cm = confusion_matrix(y_test, svc_pred)
sns.heatmap(svc_cm, annot=True, cmap='Blues', fmt='d')

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree trained on the resampled training data and evaluated on the
# untouched test split. `random_state` is fixed so the scores are reproducible
# across runs; the split and the resampler are already seeded, the model was not.
dtc = DecisionTreeClassifier(random_state=0)

dtc.fit(X_train_res,y_train_res)
dtc_pred = dtc.predict(X_test)
print("Train score: {}" .format(dtc.score(X_train_res,y_train_res)))
print("Accuracy score: {}" .format(accuracy_score(y_test,dtc_pred)))
print(classification_report(y_test, dtc_pred))
sns.heatmap(confusion_matrix(y_test,dtc_pred), annot = True, cmap='Blues',fmt = 'd')

In [None]:
from matplotlib import pyplot as plt
from sklearn import metrics

In [None]:
# Confusion matrix of every fitted classifier on the test split, on a 3x2 grid.
fig, axes = plt.subplots(nrows = 3, ncols = 2, figsize = (15,15))

clf = [rfc,lr,gnb,knn,svc,dtc]

# `metrics.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed
# in 1.2. Use `ConfusionMatrixDisplay.from_estimator` when it is available and
# fall back to the legacy helper on older installations.
use_display_api = hasattr(metrics.ConfusionMatrixDisplay, "from_estimator")

for estimator, ax in zip(clf, axes.flatten()):
    if use_display_api:
        metrics.ConfusionMatrixDisplay.from_estimator(estimator, X_test, y_test, ax=ax, cmap='Blues')
    else:
        metrics.plot_confusion_matrix(estimator, X_test, y_test, ax=ax, cmap='Blues')
    # Title each panel with the estimator's class name
    ax.title.set_text(type(estimator).__name__)
plt.tight_layout()
plt.show()