<center><h1 class="list-group-item list-group-item-success">Stroke Prediction</h1></center>
<img src = "https://topnews.in/healthcare/sites/default/files/styles/large/public/Stroke7.jpg?itok=xInaWFYK">
    
### Context
According to the World Health Organization (WHO), stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about one patient.

In [None]:
!pip install pycaret

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from imblearn.over_sampling import SMOTE
from pycaret.classification import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,mean_squared_error,mean_absolute_error,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
import pickle

In [None]:
# Location of the stroke dataset inside the Kaggle input directory
DATA_PATH = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows rather than rendering the whole frame
df.head()

In [None]:
# Number of missing values in each column
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds
# string columns (gender, work_type, ...) at this point, and pandas >= 2.0
# raises on them instead of silently skipping them.
df.select_dtypes(include="number").corr()

In [None]:
# Number of patients per employment category
work_type_hist = px.histogram(
    data_frame=df,
    x="work_type",
    color="work_type",
    template="plotly_dark",
)
work_type_hist

In [None]:
# Number of patients per smoking status
smoking_status_hist = px.histogram(
    data_frame=df,
    x="smoking_status",
    color="smoking_status",
    template="plotly_dark",
)
smoking_status_hist

In [None]:
# Class balance of the target (0 = no stroke, 1 = stroke)
stroke_counts = df["stroke"].value_counts()
stroke_counts

In [None]:
# BMI against age, restricted to the 0-40 age window
fig, ax = plt.subplots()
ax.set_xlabel("Age")
ax.set_ylabel("BMI")
ax.set_xlim(0, 40)
ax.bar(df["age"], df["bmi"])

In [None]:
# BMI against age, restricted to the 40-80 age window
fig, ax = plt.subplots()
ax.set_xlabel("Age")
ax.set_ylabel("BMI")
ax.set_xlim(40, 80)
ax.bar(df["age"], df["bmi"])

In [None]:
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how="any")

In [None]:
# Renumber the rows 0..n-1 after the drop; the old index is not kept
df = df.reset_index(
    drop=True,
)

In [None]:
# Preview the cleaned frame rather than rendering all of it
df.head()

In [None]:
# One-hot encode the categorical columns.
#
# Maps each indicator column to the (source column, category value) it flags.
# Insertion order is the order in which the columns are appended to `df`, and
# therefore the feature order seen by the models.
#
# The indicators are built by direct comparison instead of indexing into
# `pd.get_dummies` output, so a category that happens to be absent from the
# data (e.g. the very rare "Other" gender) yields an all-zero column rather
# than a KeyError, and the dtype is the same on every pandas version.
ONE_HOT_COLUMNS = {
    "gender_Female": ("gender", "Female"),
    "gender_Male": ("gender", "Male"),
    "gender_Other": ("gender", "Other"),
    "work_type_Govt_Job": ("work_type", "Govt_job"),
    "work_type_Never_Worked": ("work_type", "Never_worked"),
    "work_type_Private": ("work_type", "Private"),
    "work_type_Self-Employed": ("work_type", "Self-employed"),
    "work_type_children": ("work_type", "children"),
    "ever_married_Yes": ("ever_married", "Yes"),
    "ever_married_No": ("ever_married", "No"),
    "residence_type_Urban": ("Residence_type", "Urban"),
    "residence_type_Rural": ("Residence_type", "Rural"),
    "smoking_status_Unknown": ("smoking_status", "Unknown"),
    "smoking_status_Formerly_Smoked": ("smoking_status", "formerly smoked"),
    "smoking_status_Never_Smoked": ("smoking_status", "never smoked"),
    "smoking_status_Smokes": ("smoking_status", "smokes"),
}

for indicator_name, (source_column, category) in ONE_HOT_COLUMNS.items():
    df[indicator_name] = (df[source_column] == category).astype("uint8")

In [None]:
# Preview the frame with the new indicator columns
df.head()

In [None]:
# The identifier and the original categorical columns (now one-hot encoded)
# are no longer needed as features.
columns_to_drop = [
    "id",
    "gender",
    "ever_married",
    "work_type",
    "smoking_status",
    "Residence_type",
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Preview the final feature frame
df.head()

In [None]:
# Target vector and feature matrix
Y = df["stroke"]
X = df.drop(columns=["stroke"])

In [None]:
sm = SMOTE(random_state=27,k_neighbors=10)
X, Y = sm.fit_resample(X,Y)

In [None]:
Y.value_counts()

In [None]:
df_final = pd.concat([X,Y],axis = 1)

In [None]:
train,test = train_test_split(df_final, test_size=0.25,random_state= 27)

In [None]:
experiment = setup(
    data = train,
    target = 'stroke',
    normalize = True
)

In [None]:
compare_models()

In [None]:
X = MinMaxScaler().fit_transform(X)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.25, random_state = 42)

In [None]:
# Each label is paired with the array it actually describes
# (features = X_*, labels = Y_*).
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', Y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', Y_test.shape)

In [None]:
# Baseline random forest: 1000 trees, fixed seed
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=1000,
)

In [None]:
# Train the baseline forest on the training split
rf.fit(X=X_train, y=Y_train)

In [None]:
# Baseline predictions for the held-out rows
Y_pred = rf.predict(X=X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred), in that order.
print(accuracy_score(Y_test, Y_pred))
# Per-class precision/recall/F1: accuracy alone does not show how well the
# stroke class is detected.
print(classification_report(Y_test, Y_pred))

In [None]:
# Search space for the random-forest hyperparameter search.

# Number of trees in the forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is not used: for classifiers it was only an alias of 'sqrt' and has
# been removed from recent scikit-learn releases, where it makes the fit fail.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 150, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 8, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Seeded base estimator so the tuned model is reproducible across runs
rf1 = RandomForestClassifier(random_state = 42)
# Random search of parameters, using 2-fold cross validation,
# sampling 25 combinations from the grid, and using all available cores
rf_random = RandomizedSearchCV(estimator = rf1, param_distributions = random_grid, n_iter = 25, cv = 2, verbose=2, random_state=42, n_jobs = -1)

In [None]:
# Run the randomized hyperparameter search on the training split
rf_random.fit(X=X_train, y=Y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score
best_params = rf_random.best_params_
best_params

In [None]:
# Predictions of the refitted best estimator for the held-out rows
predictions = rf_random.predict(X=X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred), in that order.
print(accuracy_score(Y_test, predictions))
# Per-class precision/recall/F1: accuracy alone does not show how well the
# stroke class is detected.
print(classification_report(Y_test, predictions))

In [None]:
# Persist the fitted search object. The context manager guarantees the file
# is flushed and closed even if pickling fails.
# NOTE(review): the model was trained on MinMax-scaled features, so the fitted
# scaler must also be available at inference time for predictions to be valid.
with open("model_stroke.pkl", "wb") as model_file:
    pickle.dump(rf_random, model_file)