In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the path of every file beneath the Kaggle input directory,
# then print the paths one per line.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the stroke-prediction CSV into a DataFrame.
csv_path = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
dataset = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# Preview the first five rows, transposed so each column reads as a row.
dataset.head(n=5).transpose()

In [None]:
# Count the missing values in each column.
dataset.isna().sum(axis=0)

## Exploring the Data

In [None]:
# Pairwise correlations over the numeric columns only. The frame still holds
# string columns at this point (gender, work_type, smoking_status, ...), and
# pandas >= 2.0 raises on them unless numeric_only=True is passed.
corr_matrix = dataset.corr(numeric_only=True)

# Correlation heatmap restricted to the selected metrics and the target.
metrics = dataset[['age', 'heart_disease', 'hypertension', 'avg_glucose_level', 'bmi', 'stroke']]
meteric_corr = metrics.corr()
fig, ax = plt.subplots()
sns.heatmap(meteric_corr, annot=True, ax=ax)
# Trailing semicolon suppresses the text repr of the returned object.
ax.set_title('Correlation between selected metrics and stroke');


In [None]:
# Number of rows per value of the target column.
stroke_counts = dataset['stroke'].value_counts()
stroke_counts

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the selected metrics; the low alpha and small
# marker size keep overlapping points readable.
scatter_matrix(frame=metrics, alpha=0.05, figsize=(15, 12), s=5)

# Stacked histogram of the target column, coloured by its own value.
stroke_hist_options = dict(
    kind='hist',
    x='stroke',
    hue='stroke',
    multiple='stack',
    bins=25,
    height=4,
    aspect=1.7,
)
sns.displot(data=dataset, **stroke_hist_options)

In [None]:
# Large annotated correlation heatmap of the numeric columns.
# numeric_only=True is required because the frame still contains string
# columns here; without it pandas >= 2.0 raises instead of skipping them.
corrmat = dataset.corr(numeric_only=True)
cmap = sns.diverging_palette(260, -10, s=50, l=75, n=6, as_cmap=True)

# Draw on an explicit axes instead of relying on the implicit current figure.
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(corrmat, cmap=cmap, annot=True, square=True, ax=ax);

In [None]:
# One histogram per numeric column of the frame.
hist_axes = dataset.hist(figsize=(15, 7))
hist_axes

In [None]:
# Bar chart of how many rows fall into each smoking_status category.
smoking_counts = dataset['smoking_status'].value_counts()
smoking_counts.plot(kind='bar')

## Manage Null Values

In [None]:
# Fill missing BMI values with the median of the column.
bmi_median = dataset['bmi'].median()
dataset['bmi'] = dataset['bmi'].fillna(value=bmi_median)

In [None]:
# Treat the 'Unknown' smoking status as a missing value ...
dataset['smoking_status'] = dataset['smoking_status'].replace({'Unknown': np.nan})
# ... then discard, in place, every row that has a missing value in any column.
dataset.dropna(axis='index', how='any', inplace=True)

In [None]:
# Remove the `id` column, which is not used as a feature.
# errors='ignore' keeps the cell re-runnable: a second execution finds the
# column already gone and does nothing instead of raising KeyError.
dataset.drop(columns='id', errors='ignore', inplace=True)

In [None]:
# Split the feature columns into a numeric group and a categorical group.
# NOTE(review): avg_glucose_level and bmi are not selected here even though
# bmi is imputed earlier in the notebook — confirm the omission is intentional.
numeric_columns = ['age', 'hypertension', 'heart_disease']
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

numeric = dataset[numeric_columns]
categorical = dataset[categorical_columns]

In [None]:
# Preview only the first rows instead of displaying the whole frame.
numeric.head()

In [None]:
# First five rows of the categorical columns, transposed for readability.
categorical.head(n=5).transpose()

## Encode the Categorical Variables

In [None]:
# One-hot encode the categorical columns.
# The input is taken from `dataset` rather than from `categorical` itself, so
# re-running this cell does not fail on columns that were already encoded.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
categorical = pd.get_dummies(
    dataset[categorical_columns],
    columns=categorical_columns,
    prefix=categorical_columns,
    # Drop the first level of each variable; it is implied by the others.
    drop_first=True,
)

In [None]:
# Assemble the feature matrix: one-hot columns first, then the numeric columns.
features = pd.concat([categorical, numeric], axis=1)

# Request float explicitly. On pandas versions where get_dummies returns
# boolean columns, mixing them with numeric columns would otherwise produce
# an object-dtype array instead of a numeric one.
X = features.to_numpy(dtype=float)
y = dataset['stroke'].to_numpy()
print(X)
print(y)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the proportion of each class in y the same in the training
# and test splits, so the test set cannot end up with a skewed share of
# positive cases by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)


### Apply Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Ten-tree random forest with a fixed seed, fitted on the training split.
rfclassifier = RandomForestClassifier(random_state=0, n_estimators=10)
rfclassifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the held-out rows with the random forest.
prediction = rfclassifier.predict(X_test)

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
print(cm)

# Cell output: fraction of test rows classified correctly.
accuracy_score(y_true=y_test, y_pred=prediction)

### XGBoost

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier built with no constructor arguments, fitted on the training split.
xgclassifier = XGBClassifier()
xgclassifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the held-out rows with the XGBoost model.
prediction = xgclassifier.predict(X_test)

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
print(cm)

# Cell output: fraction of test rows classified correctly.
accuracy_score(y_true=y_test, y_pred=prediction)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# RBF-kernel SVMs are not scale invariant: the kernel is distance-based, so the
# unscaled `age` column would dominate the 0/1 indicator columns. Standardise
# the features inside a pipeline so the scaler is fitted on the training split
# only and applied automatically at predict time.
svclassifier = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', random_state=0),
)
svclassifier.fit(X_train, y_train)


### Support Vector Machine

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the held-out rows with the support vector classifier.
prediction = svclassifier.predict(X_test)

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
print(cm)

# Cell output: fraction of test rows classified correctly.
accuracy_score(y_true=y_test, y_pred=prediction)