# Stroke Prediction

* Attributes Description

Variable | Description
--------------|-----------
id | Unique Identifier
gender | Male, Female or Other
age | Age of the patient
hypertension | 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
heart_disease | 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
ever_married | No or Yes
work_type | children, Govt_job, Never_worked, Private or Self-employed
Residence_type | Rural or Urban
avg_glucose_level | Average Glucose level in blood
bmi | Body Mass Index
smoking_status | formerly smoked, never smoked, smokes or Unknown
stroke | 1 if the patient had a stroke or 0 if not



### Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

### Importing the dataset

In [None]:
# Location of the stroke CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'

# Read the raw data and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# 'id' is only a row identifier and carries no predictive signal, so drop it.
# Use the `columns=` keyword: passing the axis positionally (`drop(['id'], 1)`)
# is no longer accepted by pandas 2.x.
df = df.drop(columns=['id'])

In [None]:
# Summary statistics of the frame (shown as the cell output)
df.describe()

In [None]:
# Column dtypes and non-null counts for every attribute
df.info()

In [None]:
# Number of distinct values per column; NaN is counted as its own value.
df.nunique(dropna=False)

### Data Preprocessing

In [None]:
# Count the missing entries in each column.
df.isna().sum()

* The 'bmi' column has 201 null values.

In [None]:
# Check the distribution of 'bmi'.
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a
# density scale is the supported replacement for the same picture.
sns.histplot(df['bmi'], kde=True, stat='density')

- 'bmi' is approximately normally distributed, so we fill its null values with the mean.

In [None]:
# Impute the missing 'bmi' entries with the column mean.
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)

In [None]:
# Preview the first rows after the 'bmi' imputation
df.head()

In [None]:
# Collect the names of all object-dtype columns (the categorical attributes).
cat_col = [name for name, dtype in df.dtypes.items() if dtype == 'object']
cat_col

In [None]:
# For every categorical column: its name, the frequency of each level, then a blank line.
for col in cat_col:
    level_counts = df[col].value_counts()
    print(col, level_counts, "", sep="\n")

#### Gender

In [None]:
# Row counts for every (gender, stroke) combination.
by_gender_and_stroke = df.groupby(["gender", "stroke"])
gender_stroke = by_gender_and_stroke[["stroke"]].count()
gender_stroke

- We see a similar stroke rate for Male and Female. The 'Other' gender has only 1 sample, so we remove it.

In [None]:
# Index labels of the rows whose gender is 'Other', then drop exactly those rows.
is_other_gender = df['gender'] == 'Other'
df_delete = df.index[is_other_gender]
df = df.drop(index=df_delete)

## Exploratory Data Analysis

In [None]:
# Box plot of 'bmi' to eyeball outliers.
# Pass the series as `x=` explicitly: a bare positional argument is read as
# `x` by older seaborn releases but as `data` by newer ones, which changes
# the orientation of the plot.
sns.boxplot(x=df['bmi'])

### Dealing with Outliers

In [None]:
# Trim the upper tail of 'bmi': keep only rows strictly below the 99th percentile.
q = df['bmi'].quantile(0.99)
below_cutoff = df['bmi'] < q
df = df[below_cutoff]

* Numerical variable
 - Continuous: age, avg_glucose_level, bmi
 

* Categorical : gender, stroke, smoking_status,heart_disease, ever_married, hypertension, work_type, Residence_type

In [None]:
# Boolean masks over the columns: object dtype -> categorical, float64 -> numerical.
categorical = (df.dtypes == "object")
numerical = (df.dtypes == "float64")

# Column names selected by each mask.
categorical_list = [name for name, flag in categorical.items() if flag]
numerical_list = [name for name, flag in numerical.items() if flag]

print("Categorical variables:")
print(categorical_list)

print('-----------------------------------\n')

print("Numerical variables:")
print(numerical_list)

print('-----------------------------------\n')

In [None]:
#----------------DATA COPY FOR VISUALIZATION--------------------

# Work on a copy so the human-readable labels below never reach `df`.
StrokeAnalysis = df.copy()

StrokeAnalysis['hypertension'] = StrokeAnalysis['hypertension'].apply(lambda x : 'Hypertension' if x == 1 else 'No Hypertension')
StrokeAnalysis['heart_disease'] = StrokeAnalysis['heart_disease'].apply(lambda x : 'Heart Disease' if x == 1 else 'No Heart Disease')
StrokeAnalysis['stroke'] = StrokeAnalysis['stroke'].apply(lambda x : 'Suffered Stroke' if x == 1 else 'Never Suffered Stroke')
StrokeAnalysis['ever_married'] = StrokeAnalysis['ever_married'].apply(lambda x : 'Married' if x == 'Yes' else 'Unmarried')


# NO HYPERTENSION VS HYPERTENSION & NO HEART DISEASE VS HEART DISEASE

plt.figure(figsize=(10,6))
placement = 1

for i in ['hypertension','heart_disease']:
    # Compute the counts once and read labels/values straight from the result.
    # The previous `value_counts()[j]` used an integer key on a string-labelled
    # Series, relying on pandas' deprecated positional fallback, and recomputed
    # the counts on every iteration.
    counts = StrokeAnalysis[i].value_counts()
    label = counts.index.tolist()
    value = counts.tolist()

    plt.subplot(1,2,placement)
    explode = (0.1, 0.2)  # one offset per slice; both columns have exactly two levels
    plt.pie(value, labels = label, autopct='%1.2f%%', shadow=True, explode=explode)
    plt.title("{} VS {}".format(label[0],label[1]))
    placement += 1

plt.tight_layout(pad=0.4)



* The proportion of patients with hypertension or heart disease is very low.

In [None]:
# MARRIED VS UNMARRIED & URBAN VS RURAL

plt.figure(figsize=(10,6))
placement = 1

for i in ['ever_married','Residence_type']:
    # Compute the counts once and read labels/values straight from the result.
    # The previous `value_counts()[j]` used an integer key on a string-labelled
    # Series, relying on pandas' deprecated positional fallback, and recomputed
    # the counts on every iteration.
    counts = StrokeAnalysis[i].value_counts()
    label = counts.index.tolist()
    value = counts.tolist()

    plt.subplot(1,2,placement)
    explode = (0.1, 0.2)  # one offset per slice; both columns have exactly two levels
    plt.pie(value, labels = label,autopct='%1.2f%%', shadow=True, explode=explode)
    plt.title("{} VS {}".format(label[0],label[1]))
    placement += 1

plt.tight_layout(pad=0.4)



* We see that married patients far outnumber unmarried ones. The proportions of rural and urban patients are almost equal.

In [None]:
# WORK TYPE

print("Pie plot of Work Type")
print('------------------------------------------------------------\n')

# Frequency of each work type; the index supplies the slice labels.
work_type_counts = StrokeAnalysis['work_type'].value_counts()
label = work_type_counts.index
value = work_type_counts.values

plt.subplot(1,1,1)
plt.pie(value, labels=label, autopct='%1.2f%%', shadow=True, explode=None)
plt.title("Work Types")

plt.tight_layout(pad=0.4)





In [None]:
# Visualization of the distribution of Numerical columns

print("Visualization of the distribution of Numerical columns")
print('------------------------------------------------------------\n')

# One 50-bin histogram per numerical column, each on its own figure.
for n in numerical_list:
    fig, ax = plt.subplots(figsize=(9, 3))
    ax.hist(df[n], bins=50)
    ax.set_xlabel(n)
    ax.set_ylabel("Frequency")
    ax.set_title("{} Distribution with Histogram".format(n))
    plt.show()


In [None]:
# Visualization of stroke vs other columns

print("Visualization of stroke vs other columns")
print('-----------------------------------------------------\n')

total_rows = len(df)

for i in categorical_list:
    ax = sns.countplot(data=df, x=i, hue="stroke")
    plt.title("Effect of {} on Stroke".format(i))

    # Label every bar with its share of all rows, centred just above the bar.
    for p in ax.patches:
        bar_height = p.get_height()
        bar_centre = p.get_x() + p.get_width() / 2
        share = round(bar_height/total_rows*100, 2)
        ax.annotate(
            f'{share} %',
            xy=(bar_centre, bar_height),
            ha='center',
            va='center',
            size=13,
            xytext=(0, 8),
            textcoords='offset points',
        )
    plt.show()


In [None]:
# Heatmap Correlation

print("Heatmap Correlation")
print('------------------------------------------------------------------\n')

# `df` still holds the object-dtype (string) columns at this point, and
# pandas >= 2.0 raises on them instead of silently skipping them, so restrict
# the correlation to numeric columns explicitly.
correlations = df.corr(numeric_only=True)

plt.figure(figsize = (8,6))
sns.heatmap(correlations, annot = True, cmap="coolwarm")
plt.show()


### Input Split

In [None]:
# Features are every column except the last; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

### Encoding Categorical Data

* Independent Variable
  - OneHotEncoder

* Dependent Variable
  - LabelEncoder

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Positions (within X) of the columns to one-hot encode; every other column
# is passed through unchanged and ends up after the encoded block.
ONE_HOT_COLUMNS = [0, 4, 5, 6, 9]

# sparse_threshold=0 forces a dense result. Without it ColumnTransformer may
# return a scipy sparse matrix when the stacked output is sparse enough, and
# np.array() on a sparse matrix yields a useless 0-d object array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ONE_HOT_COLUMNS)],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [None]:
# (rows, columns) of the encoded feature matrix
X.shape

### Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Parallel lists filled by the model cells below: each model's name and its test accuracy.
accuracy, model = [], []

## Training the Model on the Training set

### Kernel SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import cross_val_score

# Fit an RBF-kernel support vector classifier on the training split.
classifier = SVC(kernel='rbf', gamma='scale', random_state=0)
classifier.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = classifier.predict(X_test)

# Confusion matrix on the test split.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Test accuracy, also recorded for the final comparison chart.
acc2 = accuracy_score(y_test, y_pred)
accuracy.append(acc2)
model.append('Kernel SVM')
print("Kernel SVM's Accuracy :", acc2)

# 10-fold cross-validation on the training split.
score = cross_val_score(classifier, X_train, y_train, cv=10)
cv_mean = score.mean()
cv_std = score.std()
print("K-Fold Validation Mean Accuracy: {:.2f} %".format(cv_mean*100))
print("Standard Deviation: {:.2f} %".format(cv_std*100))

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred))

# F1 score on the test split.
f1 = f1_score(y_test, y_pred)
print('F1: {:.2f}'.format(f1))

### K - Nearest Neighbors

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import cross_val_score

# Feature subset used by KNN: every column from position 16 onward.
# NOTE(review): which features start at position 16 depends on how many
# one-hot columns the encoder produced. If the 'Other' gender row was dropped
# before encoding, position 15 may be `age` and this slice would skip it.
# Confirm against X.shape and the encoder's categories.
X_train_knn_raw = X_train[:, 16:]
X_test_knn_raw = X_test[:, 16:]

# Feature Scaling (fit on the training rows only, then applied to the test rows)
sc = StandardScaler()
X_train_knn = sc.fit_transform(X_train_knn_raw)
X_test_knn = sc.transform(X_test_knn_raw)

classifier = KNeighborsClassifier(n_neighbors=10)
classifier.fit(X_train_knn, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test_knn)

# Making the Confusion Matrix and Calculating the Accuracy
cm = confusion_matrix(y_test, y_pred)
print(cm)

acc3 = accuracy_score(y_test, y_pred)
accuracy.append(acc3)
model.append('K-Nearest Neighbors')
print("K-Nearest Neighbours's Accuracy :", acc3)

# Cross validation score
# Cross-validate the same setup that was tested above: scaled features from
# the same column subset. Previously CV ran on the unscaled, full X_train, so
# its score described a different model than the one evaluated on the test set.
# The scaler sits inside the pipeline so it is re-fit on each training fold.
knn_cv = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=10))
score = cross_val_score(knn_cv, X_train_knn_raw, y_train, cv=10)
print("K-Fold Validation Mean Accuracy: {:.2f} %".format(score.mean()*100))
print("Standard Deviation: {:.2f} %".format(score.std()*100))

# Classification Report
print(classification_report(y_test, y_pred))

# F1- score
f1 = f1_score(y_test, y_pred)
print('F1: {:.2f}'.format(f1))

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import cross_val_score

# Fit a Gaussian Naive Bayes classifier on the training split.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = classifier.predict(X_test)

# Confusion matrix on the test split.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Test accuracy, also recorded for the final comparison chart.
acc4 = accuracy_score(y_test, y_pred)
accuracy.append(acc4)
model.append('Naive Bayes')
print("Naive Bayes's Accuracy :", acc4)

# 10-fold cross-validation on the training split.
score = cross_val_score(classifier, X_train, y_train, cv=10)
cv_mean = score.mean()
cv_std = score.std()
print("K-Fold Validation Mean Accuracy: {:.2f} %".format(cv_mean*100))
print("Standard Deviation: {:.2f} %".format(cv_std*100))

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred))

# F1 score on the test split.
f1 = f1_score(y_test, y_pred)
print('F1: {:.2f}'.format(f1))

### Decision Tree Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import cross_val_score

# Fit an entropy-criterion decision tree on the training split.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = classifier.predict(X_test)

# Confusion matrix on the test split.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Test accuracy, also recorded for the final comparison chart.
acc5 = accuracy_score(y_test, y_pred)
accuracy.append(acc5)
model.append('Decision Tree Classification')
print("Decision Tree Classification's Accuracy :", acc5)

# 10-fold cross-validation on the training split.
score = cross_val_score(classifier, X_train, y_train, cv=10)
cv_mean = score.mean()
cv_std = score.std()
print("K-Fold Validation Mean Accuracy: {:.2f} %".format(cv_mean*100))
print("Standard Deviation: {:.2f} %".format(cv_std*100))

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred))

# F1 score on the test split.
f1 = f1_score(y_test, y_pred)
print('F1: {:.2f}'.format(f1))

### Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import cross_val_score

# Fit a 100-tree, gini-criterion random forest on the training split.
classifier = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=0)
classifier.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = classifier.predict(X_test)

# Confusion matrix on the test split.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Test accuracy, also recorded for the final comparison chart.
acc6 = accuracy_score(y_test, y_pred)
accuracy.append(acc6)
model.append('Random Forest Classification')
print("Random Forest Classification's Accuracy:", acc6)

# 10-fold cross-validation on the training split.
score = cross_val_score(classifier, X_train, y_train, cv=10)
cv_mean = score.mean()
cv_std = score.std()
print("K-Fold Validation Mean Accuracy: {:.2f} %".format(cv_mean*100))
print("Standard Deviation: {:.2f} %".format(cv_std*100))

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred))

# F1 score on the test split.
f1 = f1_score(y_test, y_pred)
print('F1: {:.2f}'.format(f1))

## Model Accuracy

In [None]:
# Bar chart comparing the test accuracy recorded for each model.
# `model` and `accuracy` are the parallel lists appended to by the model cells,
# so re-running a model cell adds a duplicate bar.
plt.xticks(rotation=90)  # rotate the x tick labels by 90 degrees
sns.barplot(x = model, y = accuracy, palette ='dark')