# <u><b>Advanced Analysis For Stroke Prediction Dataset</b></u>

In [None]:
# Data manipulation libraries
import pandas as pd 
import numpy as np 

# Visualization libraries
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as ex
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

In [None]:
# Load the stroke dataset from a CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run on another machine until it points at a local copy of the file.
data = pd.read_csv('D:/3rd Year - 2nd Semester/ST 3082 - Statistical Learning I/Data Analysis Final Project/Stroke Prediction/healthcare-dataset-stroke-data.csv')
# Preview the first five rows.
data.head(5)

In [None]:
# Drop the row identifier, then replace the hyphen/spaces in category labels
# with underscores so the dummy-column names created later are identifier-like.
data.drop(columns=['id'], inplace=True)
data['work_type'] = data['work_type'].replace({'Self-employed': 'Self_employed'})
data['smoking_status'] = data['smoking_status'].replace(
    {'formerly smoked': 'formerly_smoked', 'never smoked': 'never_smoked'}
)

In [None]:
# Remove the rows whose gender is 'Other', then display the remaining counts.
other_gender_rows = data.index[data['gender'] == 'Other']
data.drop(other_gender_rows, inplace=True)
data["gender"].value_counts()

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder

# Impute missing BMI values with a decision tree trained on the rows where BMI
# is known, using age, gender, marital status and residence type as predictors.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook - confirm this is acceptable.
DT_bmi_pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    # The step is named 'lr' although it holds a DecisionTreeRegressor.
    ('lr', DecisionTreeRegressor(random_state=42)),
])

bmi_predictors = ['age', 'gender', 'ever_married', 'Residence_type']
binary_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'Residence_type': {'Urban': 0, 'Rural': 1},
    'ever_married': {'No': 0, 'Yes': 1},
}

# Work on a copy so the integer encoding does not touch `data` itself.
X = data[['age', 'gender', 'ever_married', 'Residence_type', 'bmi']].copy()
for column, codes in binary_codes.items():
    X[column] = X[column].replace(codes).astype(np.uint8)

# Rows without a BMI are the ones to predict; the rest form the training set.
bmi_is_missing = X['bmi'].isna()
Missing = X[bmi_is_missing]
X = X[~bmi_is_missing]
Y = X.pop('bmi')

DT_bmi_pipe.fit(X, Y)
predicted_bmi = pd.Series(
    DT_bmi_pipe.predict(Missing[bmi_predictors]),
    index=Missing.index,
)
# Write the predictions back into the original frame, aligned on the row index.
data.loc[Missing.index, 'bmi'] = predicted_bmi

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the dummies are not redundant.
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
data.head()

In [None]:
# Shuffle the rows before model development.
# random_state is passed explicitly: without it the row order changes on every
# run, so any result that depends on it is not reproducible.
data = data.sample(frac=1, random_state=42)
data.head()

In [None]:
from sklearn.model_selection import train_test_split

np.random.seed(42)  # seed NumPy's global RNG for reproducibility

# Separate the predictors from the 'stroke' target.
X = data.drop(['stroke'], axis=1)
y = data['stroke']

# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of 'stroke' the same in both splits, so the test set is representative of the
# original class balance instead of depending on the luck of the draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Display the shape of the training labels.
y_train.shape

# Using SMOTE

In [None]:
# Using SMOTE
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE; the test split is left untouched.
sampler = SMOTE(random_state=42)
X_train, y_train = sampler.fit_resample(X_train, y_train)

# y_tr exists only to feed the class-count bar chart below.
y_tr = pd.DataFrame({'stroke': y_train})
sns.countplot(data=y_tr, x='stroke')
plt.show()

In [None]:
### Libraries for Model Fitting

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from sklearn.ensemble import AdaBoostClassifier

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

### Defining functions to easily create the confusion matrix and ROC curve for each model

In [None]:
### Function For ROC Curve
#Inputs are y_test and y_prob
#Make sure to calculate y_prob in each model before generating ROC curve

def plot_ROC(y_test, y_prob):
    """Plot the ROC curve for a set of predicted scores and show it.

    Parameters
    ----------
    y_test : true binary labels.
    y_prob : predicted score/probability of the positive class for each label.

    The area under the curve is shown in the legend, and a dashed diagonal is
    drawn as the chance-level reference. Nothing is returned.

    Note: sns.set_theme(style='white') changes the global seaborn/matplotlib
    theme, so plots drawn after this call inherit the white style as well.
    """
    from sklearn import metrics

    fpr, tpr, _ = metrics.roc_curve(y_test, y_prob)
    auc_value = metrics.auc(fpr, tpr)

    sns.set_theme(style='white')
    plt.figure(figsize=(3, 3))
    # The model's ROC curve, labelled with its AUC.
    plt.plot(fpr, tpr, color='#b01717', label='AUC = %0.3f' % auc_value)
    plt.legend(loc='lower right')
    # Chance-level diagonal.
    plt.plot([0, 1], [0, 1], linestyle='--', color='#174ab0')
    plt.axis('tight')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

In [None]:
### Function For Confusion matrix

def plot_confusion_matrix(y_test, y_pred):
    """Show the confusion matrix as an annotated heatmap.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, same length as ``y_test``.

    The plot title reports the accuracy rounded to two decimals. Rows of the
    heatmap correspond to the true labels and columns to the predictions.
    Nothing is returned.
    """
    matrix = confusion_matrix(y_test, y_pred)
    accuracy = round(accuracy_score(y_test, y_pred), 2)

    sns.heatmap(matrix, annot=True, fmt=".0f")
    plt.xlabel('y_pred')
    plt.ylabel('y_test')
    plt.title('Accuracy Score: {0}'.format(accuracy), size=10)
    plt.show(block=True)

## <u>Model Fitting</u>

### <u>Logistic Regression</u>

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression model on the training data and evaluate it on the
# held-out test set.
# max_iter is raised from the default of 100: the features are not
# standardised, so the solver may need more iterations to converge.
LR = LogisticRegression(max_iter=1000)
logistic_model = LR.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
# Computed once and reused for printing below.
class_report = classification_report(y_test, y_pred)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print('\n')
print("=== Classification Report ===")
print(class_report)

In [None]:
# ROC curve of the logistic-regression model on the test set.
# Column 1 of predict_proba holds the probability of the second class.
y_prob = logistic_model.predict_proba(X_test)[:, 1]
plot_ROC(y_test, y_prob)

In [None]:
# Confusion-matrix heatmap for the test-set predictions, drawn with the
# plot_confusion_matrix helper defined earlier in the notebook.

plot_confusion_matrix(y_test, y_pred)

In [None]:
# Check for overfitting by scoring the model on the data it was trained on.
# The predictions are stored in y_train_pred so that the test-set `y_pred`
# used by the plotting cells is not overwritten (overwriting it makes
# plot_confusion_matrix(y_test, y_pred) fail on a length mismatch if re-run).
y_train_pred = logistic_model.predict(X_train)
class_report = classification_report(y_train, y_train_pred)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_train, y_train_pred))
print('\n')
print("=== Classification Report ===")
print(class_report)

## <u>KNeighborsClassifier</u>

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier evaluated on the test set.
# KNN is distance-based, so the features are standardised first; otherwise the
# columns with the largest numeric range dominate the distance and the 0/1
# dummy columns barely matter. Wrapping both steps in a Pipeline applies the
# same scaling in predict / predict_proba.
knn_model = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]).fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
# Column 1 of predict_proba holds the probability of the second class.
y_prob = knn_model.predict_proba(X_test)[:, 1]

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve and confusion-matrix heatmap for the KNN model on the test set.
# Column 1 of predict_proba holds the probability of the second class.
y_prob = knn_model.predict_proba(X_test)[:, 1]
plot_ROC(y_test, y_prob)
plot_confusion_matrix(y_test, y_pred)

In [None]:
# Check for overfitting by scoring the KNN model on the data it was trained on.
# The predictions are stored in y_train_pred so that the test-set `y_pred`
# used by the plotting cells is not overwritten (overwriting it makes
# plot_confusion_matrix(y_test, y_pred) fail on a length mismatch if re-run).
y_train_pred = knn_model.predict(X_train)
class_report = classification_report(y_train, y_train_pred)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_train, y_train_pred))
print('\n')
print("=== Classification Report ===")
print(class_report)

## <u>SVM</u>

In [None]:
# Support-vector classifier evaluated on the test set.
# The default RBF kernel works on distances between samples, so the features
# are standardised first; unscaled, the columns with the largest numeric range
# dominate the kernel. probability=True enables predict_proba for the ROC
# curve. `svc` is a Pipeline, so the scaling is re-applied at prediction time.
svc = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('svc', SVC(random_state=42, probability=True)),
])
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve and confusion-matrix heatmap for the SVM on the test set.
# Column 1 of predict_proba holds the probability of the second class.
y_prob = svc.predict_proba(X_test)[:, 1]
plot_ROC(y_test, y_prob)
plot_confusion_matrix(y_test, y_pred)

## <u>Random Forest Classifier</u>

In [None]:
# Baseline random forest (trees capped at depth 5) evaluated on the test set.
rf = RandomForestClassifier(max_depth=5, random_state=42)
rf.fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of the second class.
y_prob = rf.predict_proba(X_test)[:, 1]
y_pred = rf.predict(X_test)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve and confusion-matrix heatmap for the random forest on the test set.
# Column 1 of predict_proba holds the probability of the second class.
y_prob = rf.predict_proba(X_test)[:, 1]
plot_ROC(y_test, y_prob)
plot_confusion_matrix(y_test, y_pred)

In [None]:
# Feature importance of the baseline random forest.
# The table is built directly from X_train's columns, so it does not depend on
# a hard-coded feature count and keeps working if the set of columns changes.
f_imp3 = pd.DataFrame({
    'feature': X_train.columns.to_list(),
    'importance': rf.feature_importances_,
})
f_imp3 = f_imp3.sort_values('importance', ascending=False)
# Show the 12 most important features, shaded by importance.
f_imp3[0:12].style.background_gradient(cmap='Blues')

In [None]:
from sklearn.inspection import (partial_dependence, 
                                PartialDependenceDisplay)
# Partial-dependence plots of the random forest for age and BMI, one figure
# each. The trailing semicolons suppress the notebook's echo of the result.
var = 'age'
PartialDependenceDisplay.from_estimator(rf, X_train, features=[var]);
PartialDependenceDisplay.from_estimator(rf, X_train, features=['bmi']);

In [None]:
# Partial-dependence plot of the random forest for average glucose level.
PartialDependenceDisplay.from_estimator(rf, X_train, ['avg_glucose_level']);

In [None]:
from sklearn.model_selection import RandomizedSearchCV


# Randomised hyper-parameter search for the random forest.
# 'auto' is not offered for max_features: scikit-learn deprecated it in 1.1 and
# removed it in 1.3, where every candidate using it fails to fit. For
# classifiers it was an alias of 'sqrt', which is still in the list.
# NOTE(review): X_train has already been SMOTE-resampled, so synthetic samples
# can end up in the validation folds and the CV scores are likely optimistic.
param_distributions = {
    'n_estimators': np.arange(10, 200, 10),  # Number of trees in the forest
    'max_depth': [None] + list(np.arange(5, 30, 5)),  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2', None],  # Features considered when looking for the best split
}

# 100 sampled parameter combinations, each scored with 5-fold cross-validation.
random_search = RandomizedSearchCV(rf, param_distributions, n_iter=100, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Best hyper-parameter combination found by the search.
best_params = random_search.best_params_
print("Best Hyperparameters: ", best_params)


In [None]:
# Refit a random forest with the hyper-parameters selected by the randomised
# search and evaluate it on the test set.
tuned_keys = (
    'n_estimators',
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
    'max_features',
)
best_rf = RandomForestClassifier(
    random_state=42,
    **{key: best_params[key] for key in tuned_keys},
)
best_rf.fit(X_train, y_train)

# Test-set predictions; column 1 of predict_proba is the second class.
y_prob = best_rf.predict_proba(X_test)[:, 1]
y_pred = best_rf.predict(X_test)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

In [None]:
# Feature importance of the tuned random forest.
# The table is built directly from X_train's columns, so it does not depend on
# a hard-coded feature count and keeps working if the set of columns changes.
f_imp3 = pd.DataFrame({
    'feature': X_train.columns.to_list(),
    'importance': best_rf.feature_importances_,
})
f_imp3 = f_imp3.sort_values('importance', ascending=False)
# Show the 12 most important features, shaded by importance.
f_imp3[0:12].style.background_gradient(cmap='Blues')

## <u>XGBoost</u>

In [None]:
# Baseline XGBoost classifier (depth 5, log-loss metric) evaluated on the test set.
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=5,
    random_state=42,
)
xgb.fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of the second class.
y_prob = xgb.predict_proba(X_test)[:, 1]
y_pred = xgb.predict(X_test)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve and confusion-matrix heatmap for XGBoost on the test set.
# Column 1 of predict_proba holds the probability of the second class.
y_prob = xgb.predict_proba(X_test)[:, 1]
plot_ROC(y_test, y_prob)
plot_confusion_matrix(y_test, y_pred)

In [None]:
# Feature importance of the baseline XGBoost model.
# The table is built directly from X_train's columns, so it does not depend on
# a hard-coded feature count and keeps working if the set of columns changes.
f_imp3 = pd.DataFrame({
    'feature': X_train.columns.to_list(),
    'importance': xgb.feature_importances_,
})
f_imp3 = f_imp3.sort_values('importance', ascending=False)
# Show the 12 most important features, shaded by importance.
f_imp3[0:12].style.background_gradient(cmap='Blues')

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
import numpy as np

# Randomised search over tree depth, learning rate and number of trees for
# XGBoost. The search reuses the `xgb` estimator created in the earlier cell.
param_dist = {
    'max_depth': np.arange(3, 7),
    'learning_rate': np.logspace(-3, 0, num=100),
    'n_estimators': np.arange(100, 400, 50),
}

# 10 sampled parameter combinations, each scored with 5-fold cross-validation.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best hyper-parameter combination found by the search.
best_params = random_search.best_params_
print("Best Hyperparameters: ", best_params)

In [None]:
# Refit XGBoost with the hyper-parameters selected by the randomised search
# and evaluate it on the test set.
tuned_keys = ('max_depth', 'learning_rate', 'n_estimators')
best_xgb = XGBClassifier(
    random_state=42,
    objective='binary:logistic',
    eval_metric='logloss',
    **{key: best_params[key] for key in tuned_keys},
)
best_xgb.fit(X_train, y_train)

# Test-set predictions; column 1 of predict_proba is the second class.
y_prob = best_xgb.predict_proba(X_test)[:, 1]
y_pred = best_xgb.predict(X_test)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

## <u>AdaBoost Classifier</u>

In [None]:
# AdaBoost with 2000 boosting rounds, evaluated on the test set.
Ada = AdaBoostClassifier(random_state=42, n_estimators=2000)
Ada.fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of the second class.
y_prob = Ada.predict_proba(X_test)[:, 1]
y_pred = Ada.predict(X_test)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve and confusion-matrix heatmap for AdaBoost on the test set.
# Column 1 of predict_proba holds the probability of the second class.
y_prob = Ada.predict_proba(X_test)[:, 1]
plot_ROC(y_test, y_prob)
plot_confusion_matrix(y_test, y_pred)

## <u>CatBoost Classifier</u>

In [None]:
# CatBoost with default settings (training log silenced), evaluated on the test set.
cat = CatBoostClassifier(logging_level='Silent')
cat.fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of the second class.
y_prob = cat.predict_proba(X_test)[:, 1]
y_pred = cat.predict(X_test)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve and confusion-matrix heatmap for CatBoost on the test set.
# Column 1 of predict_proba holds the probability of the second class.
y_prob = cat.predict_proba(X_test)[:, 1]
plot_ROC(y_test, y_prob)
plot_confusion_matrix(y_test, y_pred)