# Importing all the packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Importing dataset

In [None]:
# Load the stroke dataset; the first CSV column is used as the row index.
dataset_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(dataset_path, index_col=0)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Keep the label column as its own Series, then count the rows in each class.
target = data.loc[:, 'stroke']
target.value_counts()

In [None]:
# Remove the label from the feature frame so it cannot be used as a model input.
data = data.drop(columns=['stroke'])

In [None]:
# Preview the first label values.
target.head()

In [None]:
# Preview the feature frame now that 'stroke' has been dropped.
data.head()

# Data Cleaning: Handling Missing BMI Values

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# BMI spread for each work type, split by residence type.
plt.figure(figsize=(15, 15))
sns.boxplot(
    data=data,
    x='bmi',
    y='work_type',
    hue='Residence_type',
)

In [None]:
# BMI spread for each work type, split by gender.
plt.figure(figsize=(15, 15))
sns.boxplot(
    data=data,
    x='bmi',
    y='work_type',
    hue='gender',
)

In [None]:
# Number of rows in each work_type category.
data['work_type'].value_counts()

In [None]:
# BMI histogram, coloured by marital status.
plt.figure(figsize=(12, 6))
sns.histplot(
    data=data,
    x='bmi',
    hue='ever_married',
)

In [None]:
# Impute missing BMI values with the most frequent BMI of the same work type.
#
# One grouped pass replaces the per-category copies: every work_type present
# in the data is handled (including 'Never_worked', which was previously
# concatenated back unfilled), the original row order is kept, and a category
# that was not listed by hand can no longer turn into NaN on re-assignment.
def _most_frequent(values):
    """Return the smallest mode of ``values``, or NaN if all values are missing."""
    modes = values.mode()  # sorted ascending, missing values ignored
    return modes.iloc[0] if not modes.empty else np.nan


# transform() broadcasts each group's mode back onto that group's rows, so the
# result is aligned with ``data`` and can be used directly as the fill value.
bmi_mode_by_work = data.groupby('work_type')['bmi'].transform(_most_frequent)
data['bmi'] = data['bmi'].fillna(bmi_mode_by_work)

# Sanity check: BMI values still missing after imputation (0 unless a whole
# work_type group had no BMI at all).
data['bmi'].isna().sum()

In [None]:
# Preview the frame after the 'bmi' column has been reassigned.
data.head()

# Data Preprocessing

In [None]:
# One-hot encode the categorical columns.
# gender and work_type keep one indicator per category; the other three drop
# their first category.
gender = pd.get_dummies(data['gender'], drop_first=False)
work = pd.get_dummies(data['work_type'], drop_first=False)
married = pd.get_dummies(data['ever_married'], drop_first=True)
residence = pd.get_dummies(data['Residence_type'], drop_first=True)
smoke = pd.get_dummies(data['smoking_status'], drop_first=True)

In [None]:
# Attach all indicator columns to the original frame (aligned on the row index).
dummy_frames = [gender, married, work, residence, smoke]
full_data = data.join(dummy_frames)
full_data.head()

In [None]:
# List every column name available after the join.
full_data.columns

In [None]:
# Columns passed to the models: the numeric inputs followed by the indicator
# columns, in the same order as before.
feature_columns = [
    'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
    'Female', 'Male',
    'Yes',
    'Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children',
    'Urban',
    'formerly smoked', 'never smoked', 'smokes',
]
required_data = full_data[feature_columns]

In [None]:
# Preview the model input frame.
required_data.head()

# Model Creation and Predictions

In [None]:
# Hold out 25% of the rows for evaluation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# partition (and therefore the same metrics further down). stratify=target
# keeps the class proportions of the label identical in the train and test
# partitions instead of leaving them to chance.
X_train, X_test, y_train, y_test = train_test_split(
    required_data,
    target,
    test_size=0.25,
    random_state=42,
    stratify=target,
)

In [None]:
# Results table indexed like the test split; model predictions are added as
# extra columns further down.
predictions = pd.DataFrame({'actual': y_test})
predictions.head()

## Random Forest Classifier

In [None]:
# Fit a random forest with default hyper-parameters on the training split.
# random_state fixes the forest's internal randomness so that the scores and
# reports below are the same on every run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Predict the test split once and record the result in the comparison table.
rfc_pre = predictions['rfc'] = rfc.predict(X_test)

In [None]:
# Accuracy on the *training* split (the data the forest was fitted on), not on
# the held-out test split.
rfc.score(X_train,y_train)

In [None]:
# Per-class precision / recall / F1 of the random forest on the test split.
rfc_report = classification_report(y_test, rfc_pre)
print(rfc_report)

In [None]:
# Confusion matrix of the random forest on the test split
# (rows: actual class, columns: predicted class).
rfc_confusion = confusion_matrix(y_test, rfc_pre)
print(rfc_confusion)

## Decision Tree Classifier

In [None]:
# Fit a single decision tree with default hyper-parameters on the training split.
# random_state fixes the tree's internal randomness so that the scores and
# reports below are the same on every run.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

In [None]:
# Predict the test split once and record the result in the comparison table.
dtc_pre = predictions['dtc'] = dtc.predict(X_test)

In [None]:
# Accuracy on the *training* split (the data the tree was fitted on), not on
# the held-out test split.
dtc.score(X_train,y_train)

In [None]:
# Per-class precision / recall / F1 of the decision tree on the test split.
dtc_report = classification_report(y_test, dtc_pre)
print(dtc_report)

In [None]:
# Confusion matrix of the decision tree on the test split
# (rows: actual class, columns: predicted class).
dtc_confusion = confusion_matrix(y_test, dtc_pre)
print(dtc_confusion)

In [None]:
# Side-by-side view of the actual labels and both models' predictions
# for the first ten test rows.
predictions.head(10)