In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, leaf)
    for folder, _subdirs, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Load the Data

In [None]:
# Read the stroke-prediction CSV into a DataFrame and preview the first rows.
stroke_csv = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(stroke_csv)
data.head()

### Check the data types

In [None]:
# Column dtypes and non-null counts, followed by the (rows, columns) shape.
data.info()
frame_shape = data.shape
print('data shape is like', '\n', frame_shape)
# The dataset has only 12 columns. The `id` column is not wanted as a feature:
# it is treated as independent of the `stroke` target.

### Remove unwanted features

In [None]:
# Remove the `id` column, which is not used as a feature.
# Reassigning the result (instead of `inplace=True`) and passing
# `errors='ignore'` keeps this cell idempotent: re-running it after `id` is
# already gone no longer raises a KeyError. `axis` is omitted because it is
# redundant when `columns=` is given.
data = data.drop(columns='id', errors='ignore')
data.head()

### Visualize the dataset

In [None]:
import seaborn as sn
from matplotlib import pyplot as plt

# Pearson correlation is only defined for numeric columns. At this point the
# categorical columns (gender, work_type, ...) are not encoded yet, and on
# pandas >= 2.0 `DataFrame.corr` raises on non-numeric columns instead of
# silently dropping them, so restrict the frame to numeric dtypes explicitly.
numeric_data = data.select_dtypes(include='number')
corr = numeric_data.corr(method='pearson')

# Draw the heatmap on the axes created here rather than on whichever axes
# happen to be current.
f, ax = plt.subplots(figsize=(5, 5))
sn.heatmap(corr, linewidths=0.2, annot=True, ax=ax)
ax.set_title('Pearson correlation of numeric features')

# Pairwise scatter plots with kernel-density estimates on the diagonal.
sn.pairplot(data, diag_kind='kde')
# Observation: avg_glucose_level and bmi both show outliers.

### Inspect the unique values of each feature


In [None]:
# For every column print a dashed separator, the column name and the distinct
# values that occur in that column.
for feature in data.columns:
    distinct_values = data[feature].unique()
    print('-'*25, '\n', feature, '\n', distinct_values)

### One-hot encode the categorical features with pandas `get_dummies`

In [None]:
# Replace each listed categorical column with one indicator column per category.
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
data = pd.get_dummies(data, columns=categorical_columns)

data.head()

### Check the class imbalance

In [None]:
# Number of rows per target class, then a histogram of the same column.
stroke_counts = data['stroke'].value_counts()
print(stroke_counts)
sn.histplot(data['stroke'])
# The dataset is imbalanced towards class 0.

### Treat the missing values

In [None]:
# Count the missing entries in every column, then show summary statistics.
missing_per_column = data.isnull().sum()
print(missing_per_column)
data.describe()

In [None]:
# Fill the missing bmi values with the column mean.
# The mean is computed from the data instead of hard-coding 28.89, so the cell
# stays correct if the input file changes. The result is assigned back to the
# column explicitly: calling `fillna(..., inplace=True)` on `data.bmi` is
# chained assignment, which pandas 2.x warns about and which does not modify
# `data` at all once copy-on-write is enabled.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

### Use SMOTE oversampling to treat the class imbalance

In [None]:
from imblearn.over_sampling import SMOTE

# Separate the predictors from the `stroke` target.
X = data.drop(columns='stroke')
Y = data['stroke']

# SMOTE generates synthetic minority-class rows at random; fixing the seed makes
# the resampled data (and every score computed from it) reproducible.
# NOTE(review): the oversampling is fitted on the full data set, before the
# train/test split in the next cell, so synthetic rows derived from test-set
# neighbours end up in the training data and the test scores are optimistic.
# Splitting first and resampling only the training part would avoid this.
smote = SMOTE(random_state=41)
x_smote, y_smote = smote.fit_resample(X, Y)

# Print the class counts explicitly: a bare expression in the middle of a cell
# is evaluated but never displayed.
print(y_smote.value_counts())
sn.histplot(y_smote)

### Now split the data for training and testing

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 70/30 split of the resampled data with a fixed seed.
split_options = {'test_size': .3, 'random_state': 41, 'shuffle': True}
X_train, X_test, y_train, y_test = train_test_split(x_smote, y_smote, **split_options)

# Train the Model
_________________________________

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score
from xgboost import XGBClassifier

# `plot_confusion_matrix` was removed in scikit-learn 1.2. Fall back to
# `ConfusionMatrixDisplay.from_estimator`, which takes the same
# (estimator, X, y, display_labels=..., cmap=...) arguments and also returns a
# display object exposing `ax_`, so code calling `plot_confusion_matrix` keeps
# working on both old and new versions.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

# Display names, in the same order as the integer keys of `models`.
models_name = ['Forest', 'Boosting', 'Logistic']

# Candidate classifiers keyed by position. The stochastic ones are seeded so
# that repeated runs of the notebook give the same scores.
models = {
    0: RandomForestClassifier(random_state=41),
    1: XGBClassifier(random_state=41),
    2: LogisticRegression(),
}

In [None]:
from sklearn.pipeline import Pipeline

# Per-model results, index-aligned with `models` / `models_name`.
rmse = []   # square root of the mean squared error of the test predictions
acc = []    # accuracy on the test split
model = []  # fitted scaler + classifier pipelines

for i in range(len(models)):
    # Standardise the features, then fit the i-th classifier.
    pipe = Pipeline([('Scaledata', StandardScaler()), ('models', models[i])])
    pipe.fit(X_train, y_train)
    model.append(pipe)

    # Predict once and reuse the result for both metrics. The previous
    # `pipe.score(...)` call is dropped: its result was never used, and for a
    # classifier it returns accuracy, not R^2.
    y_pred = pipe.predict(X_test)
    rmse.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    acc.append(accuracy_score(y_test, y_pred))


In [None]:
# Types of Model's Accuracy rate and rmse
d = pd.DataFrame(columns = ['Models' ,'rmse' ,'Accuracy' ,'Cross_val_score'] )
d.rmse = rmse
d.Models = models_name
d.Accuracy = acc
d

### Cross-validation

In [None]:
# Cross-validation
from sklearn.model_selection import cross_val_score

# Mean 6-fold cross-validation score of every pipeline. `cross_val_score`
# clones and refits the estimator on each fold, so it is run on the training
# split; the test split stays a held-out set used only for the 'Accuracy'
# column (previously the folds were drawn from the test split).
CV = [
    np.mean(cross_val_score(model[i], X_train, y_train, cv=6))
    for i in range(len(model))
]
d['Cross_val_score'] = CV
d

In [None]:
# Display the summary table `d` again.
d

In [None]:
# `ConfusionMatrixDisplay.from_estimator` (scikit-learn >= 1.0) replaces
# `plot_confusion_matrix`, which was removed in scikit-learn 1.2.
from sklearn.metrics import ConfusionMatrixDisplay

# One colormap per model, index-aligned with `model` / `models_name`
# (the list was previously defined but never used).
color = ['Accent', 'ocean', 'summer']

for i in range(len(model)):
    # `display_labels` follow the sorted class order, i.e. class 0 first and
    # class 1 second. Assuming stroke == 1 marks patients who had a stroke,
    # the "no stroke" label must come first (the labels used to be swapped).
    disp = ConfusionMatrixDisplay.from_estimator(
        model[i],
        X_test,
        y_test,
        display_labels=['Not Have a stroke', 'Have a Stroke'],
        cmap=color[i],
    )
    disp.ax_.set_title(models_name[i])
    # `plt.show` without parentheses only referenced the function; call it so
    # each figure is rendered before the next one is created.
    plt.show()
