# Stroke Prediction and analysis

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRXPPX4JDPfzzwDVuommp6FqgA1WgZjtTuh_gKxG6YBt7j68kOXsgcZGBgXlzLG9dvIimg&usqp=CAU)

In [None]:
#Importing required libraries
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from keras.models import Sequential
from keras.layers import Dense
import warnings
warnings.filterwarnings(action="ignore")


In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
df.head()

# Data Profiling

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# Percentage of missing values per column (the mean of the null mask equals nulls / rows).
df.isnull().mean() * 100

**Observation:**
* 3.9% of the values in the bmi column are missing. Since this is less than 5%, we can impute them based on each person's age group.

In [None]:
#Function to impute missing values in bmi column based on the age

# Mean BMI for each age bracket: age <= 15, 15 < age <= 50, and age > 50.
# Series.mean() skips NaN, so rows with a missing bmi do not affect the averages.
bmi_less_than_age_15 = df.loc[df['age'] <= 15, 'bmi'].mean()
bmi_age_15_to_50 = df.loc[(df['age'] > 15) & (df['age'] <= 50), 'bmi'].mean()
bmi_age_greater_50 = df.loc[df['age'] > 50, 'bmi'].mean()

def bmi_imputation(data):
    """Fill missing ``bmi`` values in place with the mean BMI of the row's age bracket.

    The bracket means are read from the module-level names
    ``bmi_less_than_age_15`` (age <= 15), ``bmi_age_15_to_50``
    (15 < age <= 50) and ``bmi_age_greater_50`` (age > 50).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``age`` and ``bmi`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, not a copy.

    Notes
    -----
    Rows whose ``age`` is itself missing match no bracket and keep a NaN bmi.
    """
    # Boolean masks replace the per-row iterrows loop; each assignment only
    # touches rows that are missing a BMI and fall inside the bracket.
    missing = data['bmi'].isna()
    age = data['age']
    data.loc[missing & (age <= 15), 'bmi'] = bmi_less_than_age_15
    data.loc[missing & (age > 15) & (age <= 50), 'bmi'] = bmi_age_15_to_50
    data.loc[missing & (age > 50), 'bmi'] = bmi_age_greater_50
    return data

In [None]:
# Impute BMI. bmi_imputation returns the frame it was given, so df1 and df are the same object.
df1 = bmi_imputation(df)
# Per-column null counts, to check the result of the imputation.
df1.isnull().sum()

In [None]:
# Summary statistics of the columns after imputation.
df1.describe()

#  **Exploratory Data Analysis**

In [None]:
# Count of records per stroke label, split by gender.
sns.countplot(x="stroke",data=df1,hue='gender')

**Observation**
* We could notice that the dataset is imbalanced
* Among those who have experienced a stroke, there are more females than males

Let's focus on the people who are affected by Stroke !

In [None]:
# Smoking status among stroke patients; each bar is labelled with its share of that group.
stroke_rows = df1[df1['stroke'] == 1]
ax = sns.countplot(x='smoking_status', data=stroke_rows)
plt.title("Smoking status of persons affected by Stroke")
total_1 = float(len(stroke_rows))
for bar_patch in ax.patches:
    bar_height = bar_patch.get_height()
    right_edge = bar_patch.get_x() + bar_patch.get_width()
    # Anchor the label at the bar's top-right corner.
    ax.annotate(f'{100 * bar_height / total_1:.1f}%', (right_edge, bar_height), ha='right')
plt.show()

Observation:
* Around 45% of the people affected by stroke either formerly smoked or currently smoke. So, smoking might also be a factor that contributes to strokes

In [None]:
# Age distribution of stroke patients.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density axis is the replacement seaborn recommends.
sns.histplot(df1[df1['stroke']==1]['age'], kde=True, stat="density", kde_kws=dict(cut=3), color='red')
plt.title("Distribution of age of people affected by stroke")

**Observation:**
* Most of the people who had a stroke are older, aged above 60. Age could be a significant risk factor for stroke, so older people should monitor their health with the utmost care to prevent stroke.
* The younger generation might be less susceptible to stroke


In [None]:
# Pie chart of work-type shares among stroke patients.
work_type = df1.loc[df1['stroke'] == 1, 'work_type']
values = work_type.value_counts()
labels = values.index
bar, ax = plt.subplots(figsize=(7, 7))
plt.pie(x=values, labels=labels, autopct="%.2f%%", pctdistance=0.8)
plt.title('Work type of people affected by Stroke', fontsize=20)

**Observation**
* 59.84% of the people affected by stroke work in the private sector, while only around 13% hold a government job. One possible explanation is that people working in private jobs experience higher stress or pressure, although this share may also simply reflect how common private jobs are in the dataset.

In [None]:
# BMI distribution of stroke patients.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density axis is the replacement seaborn recommends.
sns.histplot(df1[df1['stroke']==1]['bmi'], kde=True, stat="density", kde_kws=dict(cut=3), color='purple')
plt.title("Distribution of bmi of people affected by stroke")

Obervation:
* 

**Observation**
* The average BMI of people affected by stroke is about 30. A BMI of 25 or above is considered overweight, and 30 or above is considered obese, so on average the people affected by stroke are overweight to obese. From the graph we can also see that a few have a BMI greater than 45; they are considered morbidly obese

In [None]:
# Box plot of avg_glucose_level for each stroke label.
sns.boxplot(x="stroke",y="avg_glucose_level",data=df1)
plt.title("Average glucose level")

**Observation**
* People who are affected by stroke have a higher average glucose level, above 100 on average. For reference, a fasting blood sugar level of 100-125 mg/dL is considered prediabetic.

In [None]:
# Heart-disease flag among stroke patients; each bar is labelled with its share of that group.
stroke_rows = df1[df1['stroke'] == 1]
ax = sns.countplot(x='heart_disease', data=stroke_rows)
plt.title("Heart disease of persons affected by Stroke")
total_1 = float(len(stroke_rows))
for bar_patch in ax.patches:
    bar_height = bar_patch.get_height()
    right_edge = bar_patch.get_x() + bar_patch.get_width()
    # Anchor the label at the bar's top-right corner.
    ax.annotate(f'{100 * bar_height / total_1:.1f}%', (right_edge, bar_height), ha='right')
plt.show()

**Observation:**
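* 81 percent of the people who are affected by stroke don't have a heart disease, so heart disease might not be a significant reason for stroke. Note that this share alone is not conclusive: it should be compared with the heart-disease rate among people without a stroke.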

In [None]:
# Hypertension flag among stroke patients; each bar is labelled with its share of that group.
stroke_rows = df1[df1['stroke'] == 1]
ax = sns.countplot(x='hypertension', data=stroke_rows)
plt.title("Hyper tension of persons affected by Stroke")
total_1 = float(len(stroke_rows))
for bar_patch in ax.patches:
    bar_height = bar_patch.get_height()
    right_edge = bar_patch.get_x() + bar_patch.get_width()
    # Anchor the label at the bar's top-right corner.
    ax.annotate(f'{100 * bar_height / total_1:.1f}%', (right_edge, bar_height), ha='right')
plt.show()

**Observation**
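* 73% of the people who are affected by stroke do not suffer from hypertension, so hypertension might not be a significant reason for stroke either. As above, this share should be compared with the hypertension rate among people without a stroke before drawing a conclusion.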

In [None]:
# Residence type among stroke patients; each bar is labelled with its share of that group.
ax=sns.countplot(x='Residence_type',data=df1[df1['stroke']==1])
# The title previously read "Hyper tension ..." (copied from the hypertension chart).
plt.title("Residence type of persons affected by Stroke")
total_1 =float(len(df1[df1['stroke']==1]))
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total_1)
    # Anchor the label at the bar's top-right corner.
    x = p.get_x() + p.get_width()
    y = p.get_height()
    ax.annotate(percentage, (x, y), ha='right')
plt.show()

**Observation:**
* Around 54% of the people affected by stroke are from urban areas. A possible reason is that people in urban areas might experience health issues due to pollution and other factors such as eating habits.

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds
# string columns here (they are one-hot encoded in a later cell), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = df1.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

**Observation**
* Age is slightly positively correlated with the other features such as hypertension, heart disease, glucose level, bmi and even stroke. So, as age increases, health-related issues generally increase as well. Here, age could be a good indicator for predicting whether a person will be affected by stroke or not

# **Feature Engineering**

In [None]:
# Drop the id column because of its high cardinality (presumably a per-record identifier - confirm).
# The drop is in place, so it changes df1 itself rather than a copy.
df1.drop(['id'],inplace=True,axis=1)

In [None]:
# One-hot encode the categorical columns, dropping the first level of each to avoid redundant columns.
# dtype is pinned to uint8 (the pre-2.0 pandas default): newer pandas emits bool dummies,
# and a frame mixing float and bool columns converts to an object array instead of a numeric one.
df2=pd.get_dummies(df1,columns=['gender','ever_married','work_type','Residence_type','smoking_status'],drop_first=True,dtype='uint8')
df2.head()

In [None]:
# Standardization of the continuous variables (age, avg_glucose_level, bmi) on a copy of df2.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split below,
# so test-set statistics influence the scaling - consider fitting on the training split only.
scaled_data = df2.copy()
col_names=['age','avg_glucose_level','bmi']
features = scaled_data[col_names]
scaler = StandardScaler().fit(features)
features = scaler.transform(features)
# Write the scaled values back over the original columns.
scaled_data[col_names] = features
scaled_data.head()

In [None]:
# Data is imbalanced: show how many records fall under each stroke label.
scaled_data['stroke'].value_counts()

In [None]:
# Separate the feature matrix (every column except stroke) from the target column.
X = scaled_data.drop(['stroke'],axis=1)
y = scaled_data['stroke']

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): the split is not stratified; with an imbalanced target, passing stratify=y
# would keep the class ratio equal in both splits - confirm whether that is wanted.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

# SMOTE - Handling imbalances in data

In [None]:

# Resample only the training split with SMOTE; the test split is left untouched.
sm = SMOTE(random_state=42)
X_train_smote,y_train_smote = sm.fit_resample(X_train,y_train)

In [None]:
# Class counts of the training labels after resampling.
y_train_smote.value_counts()

# Model Building and GridSearchCV

In [None]:
# Candidate estimators, each paired with the hyper-parameter grid to search for it.
models = {
    'SVM': {
        'model': SVC(gamma='auto', C=5, kernel='rbf'),
        'params': {'C': [1, 5, 10]},
    },
    'xgboost': {
        'model': xgb.XGBClassifier(),
        'params': {'max_depth': [4, 6, 8], 'gamma': [0.5, 1, 2, 5]},
    },
    'Logistic regression': {
        'model': LogisticRegression(),
        'params': {},  # empty grid: no hyper-parameters are varied
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(),
        'params': {'criterion': ['gini', 'entropy'], 'max_depth': [4, 5, 6, 7, 8]},
    },
}


In [None]:
# Run a 2-fold grid search per candidate on the resampled training data and
# collect each one's best cross-validated score and parameters.
scores = []

for model_name, mp in models.items():
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=2,
        return_train_score=False,
    )
    clf.fit(X_train_smote, y_train_smote)
    scores.append(
        dict(model=model_name, best_score=clf.best_score_, best_params=clf.best_params_)
    )

# One row per model, shown as the cell output.
df_model = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df_model

In [None]:
# Fit XGBoost on the resampled training data and predict the held-out test split.
# presumably gamma/max_depth are the best values reported by the grid search - confirm against df_model
model=xgb.XGBClassifier(gamma= 0.5,max_depth=8)
xgb_model = model.fit(X_train_smote, y_train_smote)
pred = xgb_model.predict(X_test)

In [None]:
# classification_report expects (y_true, y_pred); the reversed order swapped
# precision with recall and reported support of the predictions instead of the true labels.
print(classification_report(y_test, pred))

In [None]:
# Fit an RBF-kernel SVM on the resampled training data and predict the held-out test split.
# presumably C=10 is the best value reported by the grid search - confirm against df_model
model=SVC(gamma='auto',C=10,kernel='rbf')
svc_model = model.fit(X_train_smote, y_train_smote)
pred_svc = svc_model.predict(X_test)

In [None]:
# classification_report expects (y_true, y_pred); the reversed order swapped
# precision with recall and reported support of the predictions instead of the true labels.
print(classification_report(y_test, pred_svc))

In [None]:
# Fit a decision tree on the resampled training data and predict the held-out test split.
# presumably criterion/max_depth are the best values reported by the grid search - confirm against df_model
model=DecisionTreeClassifier(criterion= 'entropy', max_depth= 8)
dt_model = model.fit(X_train_smote, y_train_smote)
pred_dt = dt_model.predict(X_test)

In [None]:
# classification_report expects (y_true, y_pred); the reversed order swapped
# precision with recall and reported support of the predictions instead of the true labels.
print(classification_report(y_test, pred_dt))

In [None]:
# Fit logistic regression with default settings on the resampled training data
# and predict the held-out test split.
model=LogisticRegression()
log_model = model.fit(X_train_smote, y_train_smote)
pred_log = log_model.predict(X_test)


In [None]:
# classification_report expects (y_true, y_pred); the reversed order swapped
# precision with recall and reported support of the predictions instead of the true labels.
print(classification_report(y_test, pred_log))

**Observation**
* XGBoost has the highest accuracy, 93%, but its precision and recall for class 1 are very low. In this use case, correctly predicting the positive class (1) is what matters
* SVM, Decision Tree and Logistic Regression have better precision and recall values for class 1 labels

In [None]:
# (rows, features) of the resampled training matrix.
X_train_smote.shape

# Deep learning - ANN Model

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers (14 and 8 units) and a sigmoid output unit.
# The input width is read from the training matrix instead of being hard-coded to 16,
# so the network stays in sync if the encoded feature set changes.
ann_model = Sequential()
ann_model.add(Dense(14,input_dim=X_train_smote.shape[1],activation='relu'))
ann_model.add(Dense(8,activation='relu'))
ann_model.add(Dense(1,activation='sigmoid'))

In [None]:
# Binary cross-entropy loss with the SGD optimizer; accuracy is tracked during training.
ann_model.compile(loss="binary_crossentropy", optimizer='SGD',metrics=['accuracy'])

In [None]:
# Train on the resampled training data for 100 epochs in mini-batches of 10.
ann_model.fit(X_train_smote, y_train_smote, epochs=100, batch_size=10)

In [None]:
# Sequential.predict_classes was removed from Keras (TensorFlow 2.6). Thresholding the
# sigmoid output at 0.5 gives the same 0/1 labels it used to return.
ann_predictions = (ann_model.predict(X_test) > 0.5).astype("int32")

In [None]:
# classification_report expects (y_true, y_pred); the reversed order swapped
# precision with recall and reported support of the predictions instead of the true labels.
print(classification_report(y_test, ann_predictions))

**Observation:**
* The accuracy of the ANN model is 83%. However, it has a reasonably good score for class 1 labels when compared to XGBoost