In [None]:
#import libraries
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sb

#supress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the stroke-prediction CSV into a DataFrame
dataset = pd.read_csv(
    '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)

# Report (rows, columns), then preview the first ten records
print(dataset.shape)
dataset.head(n=10)

In [None]:
# 'id' is only a row identifier, not a predictor. DataFrame.drop returns a new
# frame, so the result has to be assigned back for the column to really go away.
# errors='ignore' keeps the cell safe to re-run once 'id' is already gone.
dataset = dataset.drop(columns='id', errors='ignore')
dataset

# Missing values

In [None]:
# Number of missing values in each column
print(dataset.isnull().sum())

In [None]:
# bmi has 201 missing values
# Fill them with the column mean and assign the result back to the frame.
# Calling fillna(..., inplace=True) on dataset['bmi'] operates on an
# intermediate Series, which is not guaranteed to update `dataset`.
dataset['bmi'] = dataset['bmi'].fillna(dataset['bmi'].mean())
dataset.head(10)

In [None]:
# Re-count missing values per column after imputation
print(dataset.isnull().sum())

**Our data is free of missing values now**

# Exploratory data analysis

In [None]:
# Work on an independent (deep) copy during exploration so `dataset` is left untouched
df = dataset.copy(deep=True)

In [None]:
# Number of stroke cases per gender
stroke = df[df['stroke'] == 1]        # keep only the rows where a stroke occurred
base_color = sb.color_palette()[0]    # first colour of the current palette

plt.figure(figsize=(6, 3))
sb.countplot(x='gender', data=stroke, color=base_color)
plt.title('Stroke based on gender', fontsize=20)
plt.show()

**Females are slightly more prone to stroke**

In [None]:
# Number of stroke cases by hypertension flag
stroke = df[df['stroke'] == 1]        # keep only the rows where a stroke occurred
base_color = sb.color_palette()[0]    # first colour of the current palette

plt.figure(figsize=(6, 3))
sb.countplot(x='hypertension', data=stroke, color=base_color)
plt.title('Stroke based on hypertension', fontsize=20)
plt.show()

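**Most stroke patients in this dataset do not have hypertension. Because the plot shows raw counts among stroke cases only, it does not by itself show that hypertension lowers stroke risk; that would require comparing the stroke rate within each hypertension group**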

In [None]:
# Number of stroke cases by heart-disease flag
stroke = df[df['stroke'] == 1]        # keep only the rows where a stroke occurred
base_color = sb.color_palette()[0]    # first colour of the current palette

plt.figure(figsize=(6, 3))
sb.countplot(x='heart_disease', data=stroke, color=base_color)
plt.title('Stroke based on heart disease', fontsize=20)
plt.show()

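**Most stroke patients in this dataset do not have heart disease. Because the plot shows raw counts among stroke cases only, it does not by itself show that heart disease lowers stroke risk; that would require comparing the stroke rate within each heart-disease group**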

In [None]:
# Number of stroke cases by marital status
stroke = df[df['stroke'] == 1]        # keep only the rows where a stroke occurred
base_color = sb.color_palette()[0]    # first colour of the current palette

plt.figure(figsize=(6, 3))
sb.countplot(x='ever_married', data=stroke, color=base_color)
plt.title('Stroke based on marriage', fontsize=20)
plt.show()

**Married people have higher chance of stroke**

In [None]:
# Number of stroke cases by work type
stroke = df[df['stroke'] == 1]        # keep only the rows where a stroke occurred
base_color = sb.color_palette()[0]    # first colour of the current palette

plt.figure(figsize=(6, 3))
sb.countplot(x='work_type', data=stroke, color=base_color)
plt.title('Stroke based on work type', fontsize=20)
plt.show()

**People in private sector are prone to stroke**

In [None]:
# Number of stroke cases by residence type
stroke = df[df['stroke'] == 1]        # keep only the rows where a stroke occurred
base_color = sb.color_palette()[0]    # first colour of the current palette

plt.figure(figsize=(6, 3))
sb.countplot(x='Residence_type', data=stroke, color=base_color)
plt.title('Stroke based on residence type', fontsize=20)
plt.show()

**There is not much difference between the counts of stroke based on residence type**

In [None]:
# Number of stroke cases by smoking status (wider figure for the longer category labels)
stroke = df[df['stroke'] == 1]        # keep only the rows where a stroke occurred
base_color = sb.color_palette()[0]    # first colour of the current palette

plt.figure(figsize=(10, 5))
sb.countplot(x='smoking_status', data=stroke, color=base_color)
plt.title('Stroke based on smoking_status', fontsize=20)
plt.show()

**Surprisingly, people who never smoked have a higher stroke count, contrary to the common belief that smokers are more prone to disease**

In [None]:
# Age distribution for each stroke outcome (0 / 1)
base_color = sb.color_palette()[0]
sb.boxplot(x='stroke', y='age', data=df, color=base_color)

**People above the age of 60 are prone to stroke, with the highest being around 70**

In [None]:
# Average glucose level distribution for each stroke outcome (0 / 1)
base_color = sb.color_palette()[0]
sb.boxplot(x='stroke', y='avg_glucose_level', data=df, color=base_color)

**People with average glucose level around 80-200 are prone to stroke, with the highest near 100**

In [None]:
# BMI distribution for each stroke outcome (0 / 1)
base_color = sb.color_palette()[0]
sb.boxplot(x='stroke', y='bmi', data=df, color=base_color)

**bmi does not seem to have an impact on stroke**

In [None]:
# Plot a pair plot of the numeric columns.
# sb.pairplot is a figure-level function: it builds its own figure and grid of
# axes, so a preceding plt.figure(...) would only leave an empty extra figure
# behind and its figsize would not apply to the pair plot.
sb.pairplot(df)
plt.show()

In [None]:
# Preview the first ten rows of `dataset`
dataset.head(10)

# Encoding

In [None]:
# We have to encode gender, marital status, work type, residence type, smoking status.
# The multi-category columns listed in one_hot_var are one-hot encoded here.
one_hot_var = ['work_type','Residence_type','smoking_status']

# A single get_dummies call over the listed columns replaces the manual
# "encode -> join -> drop original" loop. Each listed column is replaced by
# dummy columns named '<column>_<category>', appended after the remaining
# columns in the order given in one_hot_var. drop_first=True omits the first
# category of every variable to avoid a redundant (perfectly collinear) column.
dataset = pd.get_dummies(dataset, columns=one_hot_var, prefix_sep='_', drop_first=True)

In [None]:
# Encode gender and marital status as 0/1 integers:
#   gender       -> 1 for 'Female', 0 for every other value
#   ever_married -> 1 for 'Yes',    0 for every other value
dataset['gender'] = dataset['gender'].eq('Female').astype('int64')
dataset['ever_married'] = dataset['ever_married'].eq('Yes').astype('int64')

In [None]:
# Preview the first ten rows of `dataset` after the encoding steps
dataset.head(10)

**Split feature and target**

In [None]:
# Features: every column except the target
X = dataset.drop(columns='stroke')
# Target: the stroke column
y = dataset.loc[:, 'stroke']

**Split training and testing data**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Keep a separate (deep) copy of the training features for the correlation analysis
df_corr = X_train.copy(deep=True)

In [None]:
# Annotated heatmap of the pairwise correlations between training features
plt.figure(figsize=(18, 12))
sb.heatmap(df_corr.corr(), annot=True, cbar=True)
plt.show()

**No feature has a great correlation**

In [None]:
# Append the training target as a 'stroke' column (aligned on the row index)
df_corr = df_corr.assign(stroke=y_train)

In [None]:
# Bar chart of every feature's correlation with the target, highest first
plt.figure(figsize=(18, 8))
# Position 0 after the descending sort is skipped; it is expected to be
# 'stroke' correlated with itself.
df_corr.corr()['stroke'].sort_values(ascending=False).iloc[1:].plot.bar()
plt.title('Features collinearity with the target', fontsize=16)
plt.ylabel('Collinearity', fontsize=15)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Preview the first ten rows of the training features
X_train.head(10)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Feature matrix and target vector as NumPy arrays.
# Columns are selected by name rather than by position: the one-hot encoding
# step appends its dummy columns to the end of `dataset`, so 'stroke' is no
# longer the last column and iloc[:, -1] would pick a dummy column as target
# while leaving 'stroke' inside X.
X = dataset.drop('stroke', axis=1).values
y = dataset['stroke'].values

In [None]:
# Print the training features as returned by the scaler's fit_transform
print(X_train)

In [None]:
# Print the test features as returned by the scaler's transform
print(X_test)

In [None]:
# Print the training target values
print(y_train)

In [None]:
# Print the test target values
print(y_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split (fit returns the estimator itself)
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)

# Hard class predictions for the test split
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true vs. predicted labels on the test split
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Overall accuracy, rounded to three decimals
accuracy = round(accuracy_score(y_test, y_pred), 3)
print('Accuracy  score : ' + str(accuracy))

In [None]:
# Annotated confusion-matrix heatmap with integer cell labels
sb.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    linewidths=5,
    annot_kws={'fontsize': 15},
    xticklabels=['Predicted no stroke', 'Predicted stroke'],
    yticklabels=['No stroke', 'Stroke'],
)

**Our model predicted one category of the target variable well but was not able to predict the other one well**

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class classification metrics on the test split
report = classification_report(y_test, y_pred)
print('Classification Report\n', report)

In [None]:
# Import auc, roc_curve
from sklearn.metrics import auc, roc_curve

# A ROC curve is traced by sweeping the decision threshold over a continuous
# score. Hard 0/1 labels from predict() give only one operating point, which
# flattens the curve and understates the AUC. Use the predicted probability of
# the positive class (second column of predict_proba) instead.
y_score = classifier.predict_proba(X_test)[:, 1]

# Get false positive rate, true positive rate and threshold
fpr, tpr, threshold = roc_curve(y_test, y_score)

# Compute Area under curve
area = auc(fpr, tpr)

# plot the roc curve
# Initialize Figure
plt.figure(figsize=(18,5))

plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % area)

# plot straight line (chance-level reference diagonal)
plt.plot([0, 1], [0, 1],color='g', marker='_')

# set plot title, xlabel, ylabel, legend
plt.title('ROC Curve', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=14)
plt.xlabel('False Positive Rate', fontsize=14)
plt.legend(loc = 'lower right', fontsize = 12)

plt.show()

**The model's accuracy of 95% looks good, but an AUC of 0.51 is barely better than random guessing (0.5), so the model is not separating the two classes well**