## Model Training


## Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [8]:

import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import sklearn

import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, classification_report, roc_curve,precision_recall_curve, auc,confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.impute import KNNImputer

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# from catboost import CatBoostClassifier

## Import the CSV Data as Pandas DataFrame

In [10]:
def read_data():
    df = pd.read_csv('data/strokeprediction.csv')
    df.head()
    return df

In [11]:
df=read_data()

## Preparing X and Y variables

In [12]:
X = df.drop(columns=['stroke','id'],axis=1)

In [13]:
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked


In [14]:
print("Categories in 'gender' variable:     ",end=" " )
print(df['gender'].unique())

print("Categories in 'Residence_type' variable:  ",end=" ")
print(df['Residence_type'].unique())

print("Categories in 'smoking_status' variable:",end=" " )
print(df['smoking_status'].unique())

print("Categories in 'work_type' variable:     ",end=" " )
print(df['work_type'].unique())


Categories in 'gender' variable:      ['Male' 'Female' 'Other']
Categories in 'Residence_type' variable:   ['Urban' 'Rural']
Categories in 'smoking_status' variable: ['formerly smoked' 'never smoked' 'smokes' 'Unknown']
Categories in 'work_type' variable:      ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']


In [15]:
y = df['stroke']

In [16]:
y

0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 5110, dtype: int64

In [17]:
le = LabelEncoder()
X = X.apply(le.fit_transform)

In [18]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,88,0,1,1,2,1,3850,239,1
1,0,82,0,0,1,3,0,3588,418,2
2,1,101,0,1,1,2,0,2483,198,2
3,0,70,0,0,1,2,1,3385,217,3
4,0,100,1,0,1,3,0,3394,113,2
...,...,...,...,...,...,...,...,...,...,...
5105,0,101,1,0,1,2,1,1360,418,2
5106,0,102,0,0,1,3,1,3030,273,2
5107,0,56,0,0,1,3,0,1314,179,2
5108,1,72,0,0,1,2,0,3363,129,1


In [19]:
X.shape

(5110, 10)

In [20]:
#removing null values
imputer = KNNImputer(n_neighbors=4, weights="uniform")
imputer.fit_transform(X)

array([[1.000e+00, 8.800e+01, 0.000e+00, ..., 3.850e+03, 2.390e+02,
        1.000e+00],
       [0.000e+00, 8.200e+01, 0.000e+00, ..., 3.588e+03, 4.180e+02,
        2.000e+00],
       [1.000e+00, 1.010e+02, 0.000e+00, ..., 2.483e+03, 1.980e+02,
        2.000e+00],
       ...,
       [0.000e+00, 5.600e+01, 0.000e+00, ..., 1.314e+03, 1.790e+02,
        2.000e+00],
       [1.000e+00, 7.200e+01, 0.000e+00, ..., 3.363e+03, 1.290e+02,
        1.000e+00],
       [0.000e+00, 6.500e+01, 0.000e+00, ..., 1.454e+03, 1.350e+02,
        0.000e+00]])

In [21]:
X.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
dtype: int64

## Train test split

In [22]:
# separate dataset into train and test
def sllit_data(X,y):
    x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=23)
    sm = SMOTE()
    X_res, y_res = sm.fit_resample(x_train,y_train)
    return X_res,y_res,x_train,x_test,y_train,y_test

In [23]:
X_res,y_res,x_train,x_test,y_train,y_test=sllit_data(X,y)

## Create an Evaluate Function to give all metrics after model Training

In [33]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

## Compare Models

In [33]:
def comapare_model(X_res,y_res,x_train,x_test,y_train,y_test):  
    models = {
        "Logistic Regression": LogisticRegression(),
        "Random Forest Classifier": RandomForestClassifier(),
        "KNearest Neighbor": KNeighborsClassifier(),
        "Decision Tree ": DecisionTreeClassifier(),
        "Random Forest Classifier": RandomForestClassifier(),
        "XGBClassifier": XGBClassifier(), 
        "AdaBoost Classifier": AdaBoostClassifier(),
        "SVM" : SVC(),
    }
    model_list = []
    acc =[]

    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_res, y_res) # Train model

        # Make predictions
        y_train_pred = model.predict(X_res)
        y_test_pred = model.predict(x_test)
        
        print(list(models.keys())[i])
        print('Validation Acuuracy: ',accuracy_score(y_test,model.predict(x_test)))
        print('Training Accuracy: ',accuracy_score(y_train,model.predict(x_train)))
        print('############################################')

        
        # print(list(models.keys())[i])
        model_list.append(list(models.keys())[i])
        
        # print('Model performance for Training set')
        # print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
        # print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
        # print("- R2 Score: {:.4f}".format(model_train_r2))

        # print('----------------------------------')
        
        # print('Model performance for Test set')
        # print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
        # print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
        # print("- R2 Score: {:.4f}".format(model_test_r2))
        acc.append(accuracy_score(y_test,model.predict(x_test)))
        
        # print('='*35)
        print('\n')
        concatenate(model_list,acc)
        

## Results

In [34]:
def concatenate(model_list,acc):
    print(pd.DataFrame(list(zip(model_list, acc)), columns=['Model Name', 'Accuracy']).sort_values(by=["Accuracy"],ascending=False))

In [35]:
comapare_model(X_res,y_res,x_train,x_test,y_train,y_test)

Logistic Regression
Validation Acuuracy:  0.723091976516634
Training Accuracy:  0.7377690802348337
############################################


            Model Name  Accuracy
0  Logistic Regression  0.723092
Random Forest Classifier
Validation Acuuracy:  0.8610567514677103
Training Accuracy:  1.0
############################################


                 Model Name  Accuracy
1  Random Forest Classifier  0.861057
0       Logistic Regression  0.723092
KNearest Neighbor
Validation Acuuracy:  0.7426614481409002
Training Accuracy:  0.8336594911937377
############################################


                 Model Name  Accuracy
1  Random Forest Classifier  0.861057
2         KNearest Neighbor  0.742661
0       Logistic Regression  0.723092
Decision Tree 
Validation Acuuracy:  0.8463796477495108
Training Accuracy:  1.0
############################################


                 Model Name  Accuracy
1  Random Forest Classifier  0.861057
3            Decision Tree   0.846380

## XG BOOST

## Model Training and Testing

In [38]:
def train_model(x_train,y_train,x_test,y_test):
    model_xgb = XGBClassifier()
    model_xgb.fit(x_train,y_train)
    y_pred=model_xgb.predict(x_test)
    pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred})
    print("Confusion matrix",confusion_matrix(y_test,y_pred))
    print("Accuracy",accuracy_score(y_test,y_pred))
    return pred_df

## Actual and Predicted Values

In [39]:
pred_df=train_model(x_train,y_train,x_test,y_test)
pred_df

Confusion matrix [[958   9]
 [ 50   5]]
Accuracy 0.9422700587084148


Unnamed: 0,Actual Value,Predicted Value
4893,0,0
4819,0,0
2732,0,0
1413,0,0
3875,0,0
...,...,...
330,0,0
2960,0,0
2120,0,0
744,0,0
