# **Introduction**

Stroke is a critical global health problem. It has remained the second leading cause of death worldwide since 2000. In addition, stroke is the third leading cause of disability. Long-term disability severely affects people's productive lives. As such, stroke poses a significant threat to global health.

# **Data Analysis**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Load the stroke dataset CSV into a DataFrame
data_df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
# Preview the first rows of the loaded data
data_df.head()

In [None]:
# (number of rows, number of columns)
data_df.shape

In [None]:
# Column names, dtypes and non-null counts
data_df.info()

In [None]:
# Number of missing values in each column
data_df.isnull().sum()

In [None]:
# Number of records in each class of the target column
data_df['stroke'].value_counts()

In [None]:
# Pie chart of the target distribution; the class counts are computed once
# and reused for both the wedge sizes and their labels.
stroke_counts = data_df.stroke.value_counts()

plt.figure(figsize=(12, 6))
plt.title("Stroke Count")
g = plt.pie(
    stroke_counts,
    explode=(0.025, 0.025),
    labels=stroke_counts.index,
    colors=['blue', 'red'],
    autopct='%1.1f%%',
    startangle=180,
)

plt.show()

In [None]:
# Number of records in each work_type category
data_df['work_type'].value_counts()

# **Data Preprocessing**

In [None]:
# Remove the id column from the frame in place
data_df.drop('id', axis=1, inplace=True)

In [None]:
# Show records where the patient suffered a stroke but the bmi value is missing.
# The comparison is parenthesized because `&` binds tighter than `==`; without
# the parentheses the expression is evaluated as `(isna & stroke) == 1`.
data_df[data_df['bmi'].isna() & (data_df['stroke'] == 1)]

In [None]:
# Replace the missing bmi values with the column mean, rounded to one decimal.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is a chained in-place update, which is deprecated and does
# not modify data_df under pandas Copy-on-Write.
data_df['bmi'] = data_df['bmi'].fillna(np.round(data_df['bmi'].mean(), 1))

**Normalize numerical attributes**

In [None]:
# Min-max scale age into the [0, 1] range and store it as a new column
data_df['age_norm'] = data_df['age'].pipe(lambda s: (s - s.min()) / (s.max() - s.min()))

In [None]:
# Min-max scale bmi into the [0, 1] range and store it as a new column
data_df['bmi_norm'] = data_df['bmi'].pipe(lambda s: (s - s.min()) / (s.max() - s.min()))

In [None]:
# Min-max scale avg_glucose_level into the [0, 1] range and store it as a new column
data_df['avg_glucose_level_norm'] = data_df['avg_glucose_level'].pipe(
    lambda s: (s - s.min()) / (s.max() - s.min())
)

In [None]:
# Integer codes for each categorical column. Values not listed in a mapping
# are left unchanged by replace().
category_codes = {
    # Male=>1, Female=>0, Other=>2
    'gender': {'Male': 1, 'Female': 0, 'Other': 2},
    # Urban=>1, Rural=>0
    'Residence_type': {'Urban': 1, 'Rural': 0},
    # never smoked=>0, formerly smoked=>1, Unknown=>2, smokes=>3
    'smoking_status': {'never smoked': 0, 'formerly smoked': 1, 'Unknown': 2, 'smokes': 3},
    # Yes=>1, No=>0
    'ever_married': {'Yes': 1, 'No': 0},
    # Never_worked=>0, Self-employed=>1, children=>2, Govt_job=>3, Private=>4
    'work_type': {'Never_worked': 0, 'Self-employed': 1, 'children': 2, 'Govt_job': 3, 'Private': 4},
}

# Assign each encoded column back to the frame. Calling replace(inplace=True)
# on a selected column is a chained in-place update, which is deprecated and
# does not modify data_df under pandas Copy-on-Write.
for column, codes in category_codes.items():
    data_df[column] = data_df[column].replace(codes)


In [None]:
# The raw numeric columns are dropped now that their normalized versions exist
data_df.drop(columns=['age', 'avg_glucose_level', 'bmi'], inplace=True)

In [None]:
# Preview the first rows of the preprocessed frame
data_df.head()

# **Exploratory Data Analysis**

In [None]:
# Pairwise correlation of the normalized numeric features
numeric_cols = ['age_norm', 'avg_glucose_level_norm', 'bmi_norm']
corr_matrix = data_df[numeric_cols].corr()

# Draw the annotated heatmap on a fixed [-1, 1] colour scale and title it
heatmap = sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation Heatmap');

In [None]:
def get_stacked_bar_chart(column):
    """Plot a stacked bar chart of record counts per `column` value, split by stroke.

    Reads the notebook-level ``data_df``. Records are grouped by ``column``
    and ``stroke``, the non-null ``age_norm`` entries of each group are
    counted, and the stroke level is pivoted into columns so every bar is
    stacked by stroke class.

    Returns the object produced by ``DataFrame.plot.bar``.
    """
    counts_by_stroke = (
        data_df.groupby([column, 'stroke'])['age_norm']
        .count()
        .unstack()
    )
    return counts_by_stroke.plot.bar(stacked=True, figsize=(6, 6), width=1)

In [None]:
# Stacked bar chart of record counts per smoking_status value, split by stroke
get_stacked_bar_chart('smoking_status')

# **Apply Machine Learning**

**Splitting the dataset into the Training set and Test set**

In [None]:
# Features are every column except the target; the target is the stroke column
y_train = data_df['stroke']
x_train = data_df.drop(columns=['stroke'])

In [None]:
# Hold out 20% of the records for validation with a fixed seed. The split is
# stratified on the target so the training and validation partitions keep the
# same proportion of stroke / no-stroke records.
X_train, X_val, Y_train, Y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=5, stratify=y_train
)

In [None]:
# Shapes of the training and validation features and targets
X_train.shape, Y_train.shape, X_val.shape, Y_val.shape

In [None]:
# Candidate classifiers, otherwise left at their default hyperparameters.
# The randomized tree-based estimators are seeded so repeated runs produce
# the same scores, in line with the seeded train/validation split.
models = [
    LogisticRegression(),
    RandomForestClassifier(random_state=5),
    DecisionTreeClassifier(random_state=5),
    KNeighborsClassifier(),
    GaussianNB(),
    SVC(),
]

In [None]:
# Fit every candidate model on the training split and record its accuracy on
# the validation split, together with the model's class name.
scores = []
labels = []

for model in models:
    model.fit(X_train, Y_train)
    # score() predicts on X_val itself, so no separate predict() call is needed
    scores.append(model.score(X_val, Y_val))
    labels.append(type(model).__name__)

In [None]:
# Tabulate model names against their validation accuracy, best model first
result = dict(Model=labels, Accuracy=scores)
frame = pd.DataFrame(result)
frame.sort_values('Accuracy', ascending=False)