# Objective
According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about one patient.

In [None]:
import pandas as pd 
import seaborn as sns
import warnings
import numpy as np
import matplotlib.pyplot as plt
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Suppress all warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Read the stroke dataset into a DataFrame and preview its first rows.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print each column's dtype and non-null count.
data.info()

From the above information it is clear that only the bmi column has missing values; every other column has 5110 non-null entries.

The id column does not seem to have any impact on the output prediction, so it can be dropped.

In [None]:
# Remove the identifier column from the frame in place.
data.drop(columns=['id'], inplace=True)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

#### From the table above it is evident that, while most columns are fairly evenly spread, the columns ['hypertension', 'heart_disease', 'stroke'] have a 75th percentile of 0, i.e. the large majority of their values are 0 and only a few are 1.

In [None]:
# Descriptive statistics restricted to the object-dtype (categorical) columns.
data.describe(include='object')

Among the categorical columns, the data appears to be fairly evenly distributed.

## Gender

In [None]:
# Number of rows per gender category.
data['gender'].value_counts()

In [None]:
# Pie chart of the gender distribution.
# Labels are taken from the value_counts() index so each slice is named after
# the category it actually represents, instead of relying on a hand-typed list
# happening to be in the same order as the counts.
gender_counts = data['gender'].value_counts()
plt.pie(
    gender_counts,
    labels=gender_counts.index.tolist(),
    explode=[.1, .3, .2],  # one offset per slice; expects three categories
    startangle=90,
    autopct='%.2f%%',
    radius=10,
    colors=['blue', 'pink', 'red'],
)
plt.axis('equal')
plt.title('Gender', fontdict={'fontsize': 22, 'fontweight': 'bold'})
plt.show()

In [None]:
# Drop, in place, every row whose gender is 'Other'.
other_rows = data.index[data['gender'] == 'Other']
data.drop(index=other_rows, inplace=True)

Since there is only one row with gender 'Other', it is treated as an outlier and dropped.

In [None]:
# Count of records per gender, split by stroke outcome.
# Columns are passed by name with explicit keywords: seaborn's first positional
# parameter is `data`, so a positionally-passed Series is not treated as x.
plt.figure(figsize=(10, 8))
sns.countplot(x='gender', hue='stroke', data=data, palette='Paired_r')
plt.show()

From the countplot above it appears that females are somewhat less prone to stroke, even though the dataset contains more female than male records.

In [None]:
# Replace gender labels with integer codes in the listed order: Male -> 0, Female -> 1.
gender_categorical = pd.Categorical(
    data['gender'],
    categories=['Male', 'Female'],
    ordered=True,
)
data['gender'] = gender_categorical.codes

## Ever Married

In [None]:
# Number of rows per marital-status value.
data['ever_married'].value_counts()

In [None]:
# Pie chart of marital status.
# Display labels are derived from the value_counts() index (Yes/No) so each
# slice is always named after the category it represents, regardless of which
# category happens to be more frequent.
marital_counts = data['ever_married'].value_counts()
marital_names = {'Yes': 'Married', 'No': 'Unmarried'}
marital_labels = [marital_names.get(value, str(value)) for value in marital_counts.index]
plt.pie(
    marital_counts,
    labels=marital_labels,
    colors=['green', 'orchid'],
    autopct='%0.2f%%',
    explode=[.1, .2],  # one offset per slice; expects two categories
    startangle=90,
    radius=10,
)
plt.title('Marital Status', fontdict={'fontsize': 22, 'fontweight': 'bold'})
plt.axis('equal')
plt.show()

In [None]:
# Count of records per marital status, split by stroke outcome.
# Columns are passed by name with explicit keywords: seaborn's first positional
# parameter is `data`, so a positionally-passed Series is not treated as x.
plt.figure(figsize=(10, 8))
sns.countplot(x='ever_married', hue='stroke', data=data, palette='CMRmap_r')
plt.show()

In [None]:
# Replace marital status with integer codes in the listed order: No -> 0, Yes -> 1.
married_categorical = pd.Categorical(
    data['ever_married'],
    categories=['No', 'Yes'],
    ordered=True,
)
data['ever_married'] = married_categorical.codes

## Work Type

In [None]:
# Pie chart of work types.
# Labels come from the value_counts() index so each slice is named after the
# category it actually represents, instead of relying on a hand-typed list
# being in the same order as the counts.
work_counts = data['work_type'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(
    work_counts,
    labels=work_counts.index.tolist(),
    autopct='%.2f%%',
    explode=[.1, .2, .2, 1, .3],  # one offset per slice; expects five categories
    radius=20,
    startangle=90,
)
plt.title('Type of work', fontdict={'fontsize': 22, 'fontweight': 'bold'})
plt.axis('equal')
plt.show()

In [None]:
# Count of records per work type, split by stroke outcome.
# Columns are passed by name with explicit keywords: seaborn's first positional
# parameter is `data`, so a positionally-passed Series is not treated as x.
plt.figure(figsize=(10, 8))
pt = sns.countplot(x='work_type', hue='stroke', data=data)
plt.show()

It is quite evident that children are far less prone to stroke.

In [None]:
# Replace work type labels with integer codes; the categories are inferred
# from the values present in the column.
work_categorical = pd.Categorical(data['work_type'])
data['work_type'] = work_categorical.codes

## Residence type

In [None]:
# Pie chart of residence types.
# Labels come from the value_counts() index so each slice is named after the
# category it actually represents, instead of relying on a hand-typed list
# being in the same order as the counts.
residence_counts = data['Residence_type'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(
    residence_counts,
    labels=residence_counts.index.tolist(),
    autopct='%.2f%%',
    explode=[.1, .2],  # one offset per slice; expects two categories
    radius=10,
    startangle=90,
)
plt.title('Type of recidence', fontdict={'fontsize': 22, 'fontweight': 'bold'})
plt.axis('equal')
plt.show()

In [None]:
# Count of records per residence type, split by stroke outcome.
# Columns are passed by name with explicit keywords: seaborn's first positional
# parameter is `data`, so a positionally-passed Series is not treated as x.
plt.figure(figsize=(10, 8))
sns.countplot(x='Residence_type', hue='stroke', data=data, palette='CMRmap')
plt.show()

In [None]:
# Replace residence type with integer codes in the listed order: Rural -> 0, Urban -> 1.
residence_categorical = pd.Categorical(
    data['Residence_type'],
    categories=['Rural', 'Urban'],
    ordered=True,
)
data['Residence_type'] = residence_categorical.codes

## Body Mass Index

In [None]:
# Percentage of rows whose BMI value is missing.
missing_bmi = data['bmi'].isnull().sum()
total_bmi = len(data['bmi'])
(missing_bmi / total_bmi) * 100

In [None]:
# Impute missing BMI values with the column mean.
# The result is assigned back to the column (rather than calling
# fillna(..., inplace=True) on the attribute) so the frame itself is reliably
# updated, and a mean fill cannot leave NaNs behind the way a backward fill
# does when the last rows of the column are missing.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

Filling empty values in BMI with mean value of the BMI column.

In [None]:
# Distribution plot of the BMI column.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot;
# it is kept here so the cell does not require a newer seaborn release.
bmi_values = data['bmi']
title_font = {'fontsize': 22, 'fontweight': 'bold'}
plt.figure(figsize=(10, 8))
sns.distplot(bmi_values)
plt.title('BMI', fontdict=title_font)
plt.show()

The graph of BMI distribution is quite uniform

## Smoking Status

In [None]:
# Pie chart of smoking status.
# Labels come from the value_counts() index so each slice is named after the
# category it actually represents, instead of relying on a hand-typed list
# being in the same order as the counts.
smoking_counts = data['smoking_status'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(
    smoking_counts,
    labels=smoking_counts.index.tolist(),
    autopct='%.2f%%',
    explode=[.1, .2, 1, .3],  # one offset per slice; expects four categories
    radius=10,
    startangle=90,
)
plt.title('Smoking Status', fontdict={'fontsize': 22, 'fontweight': 'bold'})
plt.axis('equal')
plt.show()

In [None]:
# Count of records per smoking status, split by stroke outcome.
# Columns are passed by name with explicit keywords: seaborn's first positional
# parameter is `data`, so a positionally-passed Series is not treated as x.
plt.figure(figsize=(10, 8))
sns.countplot(x='smoking_status', hue='stroke', data=data, palette='PuBu')
plt.show()

In [None]:
# Replace smoking status with integer codes in the listed order:
# never smoked -> 0, Unknown -> 1, formerly smoked -> 2, smokes -> 3.
smoking_categorical = pd.Categorical(
    data['smoking_status'],
    categories=['never smoked', 'Unknown', 'formerly smoked', 'smokes'],
    ordered=True,
)
data['smoking_status'] = smoking_categorical.codes

## Multi-Variate Analysis 

In [None]:
# Pairwise scatter/distribution grid over every column of the frame.
# pairplot builds its own figure, so no plt.figure() call precedes it: a
# separately created figure would stay empty and only add a blank output.
sns.pairplot(data)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between columns.
correlations = data.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(correlations, annot=True)
plt.show()

We can observe here that ever_married and age have a fairly high correlation (about 68%), but since it is below 90% it is unlikely to cause a collinearity problem among the features.

In [None]:
# Skewness of each column.
data.skew()

## Splitting the data

As we have seen earlier, ['hypertension', 'heart_disease', 'stroke'] are highly skewed, so we use a stratified shuffle split here, stratified on the stroke label, so that the class proportions are preserved in both the training and test sets.


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified shuffle split (20% test) keyed on the stroke label.
split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)

# With n_splits=1 the splitter yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(data, data['stroke']))
train_data = data.iloc[train_index]
test_data = data.iloc[test_index]

In [None]:
# Separate the features from the target.
# The target is selected by column name instead of by position, so the split
# stays correct even if the number or order of columns in the frame changes.
target_column = 'stroke'
train_x = train_data.drop(columns=[target_column])
train_y = train_data[target_column]
test_x = test_data.drop(columns=[target_column])
test_y = test_data[target_column]

## Model Selection

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

## LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a raised iteration cap and a looser tolerance,
# fitted on the training split.
logistic_params = {'max_iter': 1000, 'tol': .01}
model1 = LogisticRegression(**logistic_params)
model1.fit(train_x, train_y)

In [None]:
# Predict on both splits, then print a classification report for each.
train_pred = model1.predict(train_x)
test_pred = model1.predict(test_x)
report_inputs = (
    ('Classification Report of train_data \n', train_y, train_pred),
    ('Classification Report of test_data \n', test_y, test_pred),
)
for heading, actual, predicted in report_inputs:
    print(heading, classification_report(actual, predicted))

## DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree fitted on the training split.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs of the notebook produce the same tree and the same reported scores.
model2 = DecisionTreeClassifier(max_depth=8, random_state=42)
model2.fit(train_x, train_y)

In [None]:
# Predict on both splits, then print a classification report for each.
train_pred = model2.predict(train_x)
test_pred = model2.predict(test_x)
report_inputs = (
    ('Classification Report of train_data \n', train_y, train_pred),
    ('Classification Report of test_data \n', test_y, test_pred),
)
for heading, actual, predicted in report_inputs:
    print(heading, classification_report(actual, predicted))

## RandomForestClassifier 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Depth-limited random forest fitted on the training split.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs of the notebook produce the same forest and the same reported scores.
model3 = RandomForestClassifier(max_depth=8, random_state=42)
model3.fit(train_x, train_y)

In [None]:
# Predict on both splits, then print a classification report for each.
train_pred = model3.predict(train_x)
test_pred = model3.predict(test_x)
report_inputs = (
    ('Classification Report of train_data \n', train_y, train_pred),
    ('Classification Report of test_data \n', test_y, test_pred),
)
for heading, actual, predicted in report_inputs:
    print(heading, classification_report(actual, predicted))

## KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Grid-search the number of neighbours (3 through 8) for a KNN classifier,
# fitted on the training split.
KNN = KNeighborsClassifier()
neighbor_grid = {'n_neighbors': range(3, 9)}
model4 = GridSearchCV(KNN, param_grid=neighbor_grid)
model4.fit(train_x, train_y)

In [None]:
# Parameter setting selected by the grid search.
model4.best_params_

In [None]:
# Predict on both splits, then print a classification report for each.
train_pred = model4.predict(train_x)
test_pred = model4.predict(test_x)
report_inputs = (
    ('Classification Report of train_data \n', train_y, train_pred),
    ('Classification Report of test_data \n', test_y, test_pred),
)
for heading, actual, predicted in report_inputs:
    print(heading, classification_report(actual, predicted))

#### Conclusion: Although the test accuracy of all models is about the same (around 95%), LogisticRegression can be selected as the most suitable model because it gives good precision when predicting class 1, whereas none of the other models responds well when predicting class 1.