# Importing Libraries

In [None]:
import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn import metrics
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, and the two calls install the same blanket 'ignore' filter.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Creating 5 <b>Stratified K Fold cross validation</b> sets for better testing

In [None]:
# Path to the raw stroke dataset CSV.
TRAINING_PATH='../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'

# Load the raw data and preview the first rows.
df=pd.read_csv(TRAINING_PATH)
df.head()

In [None]:
# Directory the fold-annotated CSV is written to.
TRAINING_FOLDS_PATH='./'

import pandas as pd
from sklearn import model_selection

# Fixed seed so the shuffle - and therefore the fold assignment every later
# cell depends on - is identical on each run.
RANDOM_STATE=42

df_train=pd.read_csv(TRAINING_PATH)

# -1 marks rows that have not been assigned to a fold yet.
df_train['kfolds']=-1

# Shuffle the rows once before splitting.
df_train=df_train.sample(frac=1,random_state=RANDOM_STATE).reset_index(drop=True)

# Stratify on the target so each fold keeps the overall stroke / no-stroke ratio.
strat_kf=model_selection.StratifiedKFold(n_splits=5)

# Only the validation indices are needed: each row is a validation row in
# exactly one split, and that split's number becomes its fold id.
for fold,(_,val_) in enumerate(strat_kf.split(X=df_train,y=df_train['stroke'])):
    df_train.loc[val_,'kfolds']=fold

# The index is written as well; it comes back as an 'Unnamed: 0' column when
# the file is reloaded, and a later cell drops it.
df_train.to_csv(TRAINING_FOLDS_PATH+'train_folds.csv')

# 📋 <b>Data Exploration</b>


In [None]:
# From here on TRAINING_PATH points at the fold-annotated CSV written earlier,
# not at the raw dataset.
TRAINING_PATH='./train_folds.csv'
# Directory prefix used when fitted models are dumped to disk.
MODEL_PATH='./'
# NOTE(review): not referenced anywhere else in the visible notebook.
SUBMISSION_FILES_PATH='./Submissions/'

In [None]:
# Reload the data, now including the 'kfolds' column, and preview it.
df=pd.read_csv(TRAINING_PATH)
df.head()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

## Handling Null Values

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Total number of rows, for comparison with the null counts.
len(df)

### As we can see, BMI has 201 null values out of a total of 5110 rows, and these need to be handled. Null values can be handled either by dropping those rows entirely or by replacing them with a constant value such as the mean, median or mode. I proceeded with replacing the NaNs with the mean value.

In [None]:
# Replace missing BMI values with the column mean. The mean is used as-is
# (not truncated to an integer), and the result is assigned back to the column
# instead of relying on an in-place fill of a column selection.
# NOTE(review): the mean is computed over all rows, including rows that later
# serve as validation folds.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [None]:
# Re-check the per-column null counts after imputing 'bmi'.
df.isna().sum()

In [None]:
# Drop columns that carry no predictive information:
#   'Unnamed: 0' - the index column that came back from the saved folds CSV;
#   'id'         - presumably the per-record identifier of the source dataset
#                  (TODO confirm). Left in, it would be fed to every model as a
#                  feature.
# errors='ignore' keeps the cell re-runnable and tolerant of either column
# being absent.
df = df.drop(columns=['Unnamed: 0', 'id'], errors='ignore')
df.head()

In [None]:
# Total number of unique values in each column; low counts point to
# categorical columns, high counts to continuous ones.
df.nunique()

## Scaling the Continuous Variables using <b>MinMaxScaler</b>

In [None]:
# Checking for any numerical data. If present, it has to be scaled etc.

# NOTE(review): `columns` is not used again in the visible notebook.
columns = df.columns
# NOTE(review): `_get_numeric_data` is a private pandas method; the public
# `select_dtypes` is the supported way to pick numeric columns.
numerical_columns = df._get_numeric_data().columns
numerical_columns

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the three continuous columns with a MinMaxScaler and write the
# scaled values back into the same columns of df.
# NOTE(review): the scaler is fitted on every row, including the rows that are
# later used as validation folds.
scaler = MinMaxScaler()
df_2 = df.loc[:, ['age', 'avg_glucose_level', 'bmi']]
scaled_values = pd.DataFrame(
    scaler.fit_transform(df_2),
    index=df.index,
    columns=df_2.columns,
)
df.loc[:, df_2.columns] = scaled_values
df.head()

In [None]:
import seaborn as sns
# Bar chart of how many rows fall into each smoking_status category.
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="smoking_status", data=df)

## Handling the Categorical Features

### 1. Features like smoking_status, gender, work_type and Residence_type have to be <b>One Hot Encoded</b> since they are not ordinal (i.e. their values don't denote any rank).
### 2. The ever_married feature can be <b>Label Encoded</b>: its value is either Yes or No, so we can encode these as 1 and 0 respectively.

In [None]:
# One-hot encode the nominal categorical columns:
# smoking_status, gender, work_type and Residence_type.
# dtype=int forces 0/1 integer indicator columns. Recent pandas versions
# default to bool indicators, which would turn `df.values` (float + int + bool
# columns) into an object array when it is passed to the models.

df=pd.get_dummies(data=df,columns=['smoking_status','gender','work_type','Residence_type'],dtype=int)
df.head()

In [None]:
# Put the feature columns first and the target / fold-id columns last, so the
# last two column positions are always 'stroke' and 'kfolds'.

tail_columns = ['stroke', 'kfolds']
head_columns = [name for name in df.columns if name not in tail_columns]
df = df[head_columns + tail_columns]
df.head()

In [None]:
# Binary-encode ever_married: No -> 0, Yes -> 1.
# The explicit astype(int) guarantees an integer column even on pandas versions
# where `replace` leaves the result as object dtype. Values that are already
# 0/1 pass through unchanged, so the cell can be re-run safely.
df['ever_married'] = df['ever_married'].replace({'No':0,'Yes':1}).astype(int)
df.head()

# ✅ Let's choose the optimal features from the dataset using some <b>Feature Selection</b> techniques

### <b>Greedy Feature Selection</b> - After choosing a model and a scoring function (here, accuracy), we try the features one at a time and keep a feature in our optimal feature set only if it improves the score. Hence, the optimal dataset can be different for different models.

### <b>Models Considered:</b>
### 1. XGBoost Classifier
### 2. Random Forest Classifier
### 3. Decision Tree Classifier
#### Note: Feature selection was taking too long for the SVM Classifier, so the entire dataset was used as the optimal dataset for SVM.

In [None]:
def run(fold,df,models,target_name, save_model, print_details=False):
    """Fit every model on all folds except ``fold`` and score it on ``fold``.

    Args:
        fold: Value of the ``kfolds`` column that marks the validation rows.
        df: DataFrame holding the feature columns, the ``target_name`` column
            and a ``kfolds`` column.
        models: Mapping of display name -> estimator exposing ``fit`` and
            ``predict``. Each estimator object is fitted in place (not copied).
        target_name: Name of the target column.
        save_model: If true, dump each fitted estimator under ``MODEL_PATH``
            with joblib; the file name embeds the validation F1, the
            validation accuracy and the fold.
        print_details: If true, print training/validation accuracy per model.

    Returns:
        Tuple ``(accuracy, confusion_matrices, classification_report)`` of
        three lists holding one validation-fold entry per model, in the
        iteration order of ``models``.
    """
    # Columns that must never reach the estimator: the label itself and the
    # fold-assignment bookkeeping column. Dropping only the target would feed
    # 'kfolds' to the model as if it were a feature.
    non_feature_columns = [target_name, 'kfolds']

    # Training and validation sets
    df_train = df[df['kfolds'] != fold].reset_index(drop=True)
    df_valid = df[df['kfolds'] == fold].reset_index(drop=True)

    x_train = df_train.drop(columns=non_feature_columns).values
    y_train = df_train[target_name].values

    x_valid = df_valid.drop(columns=non_feature_columns).values
    y_valid = df_valid[target_name].values

    # One entry per model, all measured on the validation fold.
    accuracy = []
    confusion_matrices = []
    classification_report = []

    for model_name, clf in models.items():
        clf.fit(x_train, y_train)

        preds_train = clf.predict(x_train)
        preds_valid = clf.predict(x_valid)

        acc_train = metrics.accuracy_score(y_train, preds_train)
        acc_valid = metrics.accuracy_score(y_valid, preds_valid)

        # Validation F1 is only used to label the saved model file.
        f1_valid = metrics.f1_score(y_valid, preds_valid)

        accuracy.append(acc_valid)
        confusion_matrices.append(metrics.confusion_matrix(y_valid, preds_valid))
        classification_report.append(metrics.classification_report(y_valid, preds_valid))

        if print_details:
            print(f'Model => {model_name} => Fold = {fold} => Training Accuracy = {acc_train} => Validation Accuracy = {acc_valid}')

        if save_model:
            joblib.dump(clf, f"{MODEL_PATH}{model_name}_F1_{f1_valid}_ACC_{acc_valid}_FOLD_{fold}.bin")

    if print_details:
        print('\n--------------------------------------------------------------------------------------------\n')

    return accuracy,confusion_matrices,classification_report

In [None]:
def greedy_feature_selection(fold,df,models,target_name):
    """Greedily build a feature subset using validation accuracy on one fold.

    In each round every not-yet-selected feature is tried on top of the
    features selected so far, and the one giving the highest accuracy (as
    returned by ``run`` for the first model in ``models``) is added. Selection
    stops when

    * a round's best score is lower than the previous round's (only checked
      once more than two rounds have completed) - that last feature is then
      discarded again; or
    * no candidate is left, or none scores above 0 - everything selected so
      far is kept.

    Args:
        fold: Fold id used as the validation fold for every evaluation.
        df: DataFrame with feature columns, the target column and ``kfolds``.
        models: Mapping name -> estimator; only the first model's accuracy is
            used for the selection.
        target_name: Name of the target column.

    Returns:
        Tuple ``(best_scores, df_selected)``: the best score of every kept
        round, and ``df`` restricted to the kept feature columns followed by
        the target and ``kfolds`` columns.
    """
    # Positions of the two bookkeeping columns; they are appended to every
    # candidate subset because `run` needs them.
    target_index = df.columns.get_loc(target_name)
    kfolds_index = df.columns.get_loc('kfolds')
    bookkeeping = [target_index, kfolds_index]

    # Every other column position is a candidate feature. This does not rely
    # on the target and kfolds columns being the last two columns.
    candidate_features = [
        position for position in range(df.shape[1])
        if position not in bookkeeping
    ]

    # good_features => positions of the selected features, in selection order
    # best_scores   => best score reached in each selection round
    good_features = []
    best_scores = []

    while True:
        # Best candidate of this round and the score it reached.
        this_feature = None
        best_score = 0

        for feature in candidate_features:
            if feature in good_features:
                continue

            trial_columns = good_features + [feature] + bookkeeping
            accuracy, _, _ = run(
                fold, df.iloc[:, trial_columns], models,
                save_model=False, target_name=target_name,
            )

            if accuracy[0] > best_score:
                this_feature = feature
                best_score = accuracy[0]

        if this_feature is None:
            # Nothing was added this round, so another pass could never change
            # the outcome. Stop here instead of looping forever.
            break

        good_features.append(this_feature)
        best_scores.append(best_score)

        if len(best_scores) > 2 and best_scores[-1] < best_scores[-2]:
            # The latest addition made the score worse: undo it and stop.
            good_features.pop()
            best_scores.pop()
            break

    return best_scores, df.iloc[:, good_features + bookkeeping]

### 1. XGBoost Classifier

In [None]:
print('Greedy Feature Selection : ')
print('\n')
# Select the feature subset for XGBoost, scoring candidates on fold 4 only.
models={'XGB': XGBClassifier()}
best_scores,df_optimal_XGB=greedy_feature_selection(fold=4,df=df,models=models,target_name='stroke')
print(df_optimal_XGB.head())

### 2. Random Forest Classifier

In [None]:
# Select the feature subset for the Random Forest, scoring candidates on fold 4 only.
models={'RFC' : RandomForestClassifier()}
best_scores,df_optimal_RFC=greedy_feature_selection(fold=4,df=df,models=models,target_name='stroke')
print(df_optimal_RFC.head())


### 3. Decision Tree Classifier

In [None]:
# Select the feature subset for the Decision Tree, scoring candidates on fold 4 only.
models={'DT' : DecisionTreeClassifier()}
best_scores,df_optimal_DT=greedy_feature_selection(fold=4,df=df,models=models,target_name='stroke')
print(df_optimal_DT.head())

# 🔎 Finding the best hyperparameters for the models using the Optuna library for <b>Hyperparameter Tuning</b>

### <b>Models Considered:</b>
### 1. XGBoost Classifier
### 2. Random Forest Classifier
### 3. Decision Tree Classifier
### 4. SVM Classifier
### -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### 1. Random Forest Classifier

In [None]:
import optuna
from functools import partial

def optimize_rfc(trial,df,total_folds,target_name):
    """Optuna objective for a RandomForestClassifier.

    Samples one hyperparameter set from ``trial``, then for every fold fits the
    model on the remaining folds and scores it on that fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        df: DataFrame with feature columns, the target column and ``kfolds``.
        total_folds: Number of folds; fold ids ``0 .. total_folds - 1`` are used.
        target_name: Name of the target column.

    Returns:
        Mean validation accuracy over the folds.
    """
    criterion = trial.suggest_categorical("criterion", ['gini','entropy'])
    n_estimators = trial.suggest_int('n_estimators', 100, 1500)
    max_depth = trial.suggest_int("max_depth", 3, 30)
    # suggest_float replaces the deprecated suggest_uniform (same uniform range).
    max_features = trial.suggest_float("max_features", 0.01, 1.0)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 100)

    model = RandomForestClassifier(
        n_estimators = n_estimators,
        max_depth = max_depth,
        max_features = max_features,
        min_samples_leaf = min_samples_leaf,
        min_samples_split = min_samples_split,
        criterion = criterion
    )

    # The label and the fold-id bookkeeping column are not features.
    non_feature_columns = [target_name, 'kfolds']
    accuracies = []

    for fold in range(total_folds):
        df_train = df[df['kfolds'] != fold]
        df_valid = df[df['kfolds'] == fold]

        x_train = df_train.drop(columns=non_feature_columns).values
        y_train = df_train[target_name].values

        x_valid = df_valid.drop(columns=non_feature_columns).values
        y_valid = df_valid[target_name].values

        # The same estimator object is refitted for each fold.
        model.fit(x_train, y_train)
        preds = model.predict(x_valid)

        accuracies.append(metrics.accuracy_score(y_valid, preds))

    return np.mean(accuracies)

# Bind the fixed arguments so that Optuna only has to supply `trial`.
optimization_function_rfc = partial(optimize_rfc, df = df_optimal_RFC, total_folds = 5,target_name = 'stroke')
# Maximize the objective's return value (mean validation accuracy) over 15 trials.
# NOTE(review): no sampler seed is passed here, so results may differ between runs.
study_rfc = optuna.create_study(direction = 'maximize')
study_rfc.optimize(optimization_function_rfc, n_trials=15)

In [None]:
# Hyperparameters of the best Random Forest trial.
rfc_best_params = study_rfc.best_trial.params
rfc_best_params

### 2. XGBoost Classifier

In [None]:
def optimize_xgb(trial,df,total_folds,target_name):
    """Optuna objective for an XGBClassifier.

    Samples one hyperparameter set from ``trial``, then for every fold fits the
    model on the remaining folds and scores it on that fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        df: DataFrame with feature columns, the target column and ``kfolds``.
        total_folds: Number of folds; fold ids ``0 .. total_folds - 1`` are used.
        target_name: Name of the target column.

    Returns:
        Mean validation accuracy over the folds.
    """
    # suggest_float replaces the deprecated suggest_uniform (same uniform ranges).
    learning_rate = trial.suggest_float("learning_rate", 0.01, 1.0)
    gamma = trial.suggest_float("gamma", 0.05, 1.0)
    max_depth = trial.suggest_int("max_depth", 3, 30)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    reg_lambda = trial.suggest_float("reg_lambda", 0.01, 1.0)
    reg_alpha = trial.suggest_float("reg_alpha", 0.01, 1.0)

    model = XGBClassifier(
        learning_rate = learning_rate,
        gamma = gamma,
        max_depth = max_depth,
        min_child_weight = min_child_weight,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        reg_lambda = reg_lambda,
        reg_alpha = reg_alpha
    )

    # The label and the fold-id bookkeeping column are not features.
    non_feature_columns = [target_name, 'kfolds']
    accuracies = []

    for fold in range(total_folds):
        df_train = df[df['kfolds'] != fold]
        df_valid = df[df['kfolds'] == fold]

        x_train = df_train.drop(columns=non_feature_columns).values
        y_train = df_train[target_name].values

        x_valid = df_valid.drop(columns=non_feature_columns).values
        y_valid = df_valid[target_name].values

        # The same estimator object is refitted for each fold.
        model.fit(x_train, y_train)
        preds = model.predict(x_valid)

        accuracies.append(metrics.accuracy_score(y_valid, preds))

    return np.mean(accuracies)

# Bind the fixed arguments so that Optuna only has to supply `trial`.
optimization_function_xgb = partial(optimize_xgb, df = df_optimal_XGB, total_folds = 5,target_name = 'stroke')
# Maximize the objective's return value (mean validation accuracy) over 15 trials.
# NOTE(review): no sampler seed is passed here, so results may differ between runs.
study_xgb = optuna.create_study(direction = 'maximize')
study_xgb.optimize(optimization_function_xgb, n_trials=15)

In [None]:
# Hyperparameters of the best XGBoost trial.
xgb_best_params = study_xgb.best_trial.params
xgb_best_params

### 3. SVM Classifier

In [None]:
def optimize_svc(trial,df,total_folds,target_name):
    """Optuna objective for an SVC.

    Samples ``C`` from ``trial`` (``gamma`` and ``class_weight`` each have a
    single allowed value), then for every fold fits the model on the remaining
    folds and scores it on that fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        df: DataFrame with feature columns, the target column and ``kfolds``.
        total_folds: Number of folds; fold ids ``0 .. total_folds - 1`` are used.
        target_name: Name of the target column.

    Returns:
        Mean validation accuracy over the folds.
    """
    # suggest_float replaces the deprecated suggest_uniform (same uniform range).
    C = trial.suggest_float("C", 0.001, 1000)
    # Single-choice categoricals: registered with the trial so that they show
    # up in `best_trial.params` together with C.
    gamma = trial.suggest_categorical("gamma", ['auto'])
    class_weight = trial.suggest_categorical("class_weight", ['balanced'])

    model = SVC(
        C = C,
        gamma = gamma,
        class_weight = class_weight
    )

    # The label and the fold-id bookkeeping column are not features.
    non_feature_columns = [target_name, 'kfolds']
    accuracies = []

    for fold in range(total_folds):
        df_train = df[df['kfolds'] != fold]
        df_valid = df[df['kfolds'] == fold]

        x_train = df_train.drop(columns=non_feature_columns).values
        y_train = df_train[target_name].values

        x_valid = df_valid.drop(columns=non_feature_columns).values
        y_valid = df_valid[target_name].values

        # The same estimator object is refitted for each fold.
        model.fit(x_train, y_train)
        preds = model.predict(x_valid)

        accuracies.append(metrics.accuracy_score(y_valid, preds))

    return np.mean(accuracies)

# Bind the fixed arguments so that Optuna only has to supply `trial`.
# The full `df` is used here; no feature-selected subset is passed for the SVM.
optimization_function_svc = partial(optimize_svc, df = df, total_folds = 5,target_name = 'stroke')
# Maximize the objective's return value (mean validation accuracy) over 15 trials.
# NOTE(review): no sampler seed is passed here, so results may differ between runs.
study_svc = optuna.create_study(direction = 'maximize')
study_svc.optimize(optimization_function_svc, n_trials=15)

In [None]:
# Hyperparameters of the best SVM trial.
svc_best_params = study_svc.best_trial.params
svc_best_params

### 4. Decision Tree Classifier

In [None]:
def optimize_dt(trial,df,total_folds,target_name):
    """Optuna objective for a DecisionTreeClassifier.

    Samples one hyperparameter set from ``trial``, then for every fold fits the
    model on the remaining folds and scores it on that fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        df: DataFrame with feature columns, the target column and ``kfolds``.
        total_folds: Number of folds; fold ids ``0 .. total_folds - 1`` are used.
        target_name: Name of the target column.

    Returns:
        Mean validation accuracy over the folds.
    """
    criterion = trial.suggest_categorical("criterion", ['gini','entropy'])
    max_depth = trial.suggest_int("max_depth", 3, 30)
    # suggest_float replaces the deprecated suggest_uniform (same uniform range).
    max_features = trial.suggest_float("max_features", 0.01, 1.0)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 100)

    model = DecisionTreeClassifier(
        max_depth = max_depth,
        max_features = max_features,
        min_samples_leaf = min_samples_leaf,
        min_samples_split = min_samples_split,
        criterion = criterion
    )

    # The label and the fold-id bookkeeping column are not features.
    non_feature_columns = [target_name, 'kfolds']
    accuracies = []

    for fold in range(total_folds):
        df_train = df[df['kfolds'] != fold]
        df_valid = df[df['kfolds'] == fold]

        x_train = df_train.drop(columns=non_feature_columns).values
        y_train = df_train[target_name].values

        x_valid = df_valid.drop(columns=non_feature_columns).values
        y_valid = df_valid[target_name].values

        # The same estimator object is refitted for each fold.
        model.fit(x_train, y_train)
        preds = model.predict(x_valid)

        accuracies.append(metrics.accuracy_score(y_valid, preds))

    return np.mean(accuracies)

# Bind the fixed arguments so that Optuna only has to supply `trial`.
optimization_function_dt = partial(optimize_dt, df = df_optimal_DT, total_folds = 5,target_name = 'stroke')
# Maximize the objective's return value (mean validation accuracy) over 15 trials.
# NOTE(review): no sampler seed is passed here, so results may differ between runs.
study_dt = optuna.create_study(direction = 'maximize')
study_dt.optimize(optimization_function_dt, n_trials=15)

In [None]:
# Hyperparameters of the best Decision Tree trial.
dt_best_params = study_dt.best_trial.params
dt_best_params

# Now it's time to run the models with their best hyperparameters!

In [None]:
# Build one classifier per model type from the best parameters found by its
# Optuna study. The parameter names suggested in each objective match the
# constructor keyword names, so the dicts can be unpacked directly.
XGB_model=XGBClassifier(**xgb_best_params)
SVM_model=SVC(**svc_best_params)
RFC_model=RandomForestClassifier(**rfc_best_params)
DT_model=DecisionTreeClassifier(**dt_best_params)

In [None]:
# Evaluate the tuned Random Forest with each of the 5 folds as the validation
# fold, printing the details and saving every fitted model.
models = {'Random Forest Classifier': RFC_model}

fold_outputs = [
    run(fold_id, df_optimal_RFC, models=models, target_name='stroke',
        save_model=True, print_details=True)
    for fold_id in range(5)
]

# Regroup the per-fold (accuracy, confusion matrices, reports) triples into
# three parallel lists with one entry per fold.
accuracies = [output[0] for output in fold_outputs]
confusion_matrices = [output[1] for output in fold_outputs]
classification_reports = [output[2] for output in fold_outputs]

In [None]:
# Evaluate the tuned Decision Tree with each of the 5 folds as the validation
# fold, printing the details and saving every fitted model.
models = {'Decision Tree Classifier': DT_model}

fold_outputs = [
    run(fold_id, df_optimal_DT, models=models, target_name='stroke',
        save_model=True, print_details=True)
    for fold_id in range(5)
]

# Regroup the per-fold (accuracy, confusion matrices, reports) triples into
# three parallel lists with one entry per fold.
accuracies = [output[0] for output in fold_outputs]
confusion_matrices = [output[1] for output in fold_outputs]
classification_reports = [output[2] for output in fold_outputs]

In [None]:
# Evaluate the tuned SVM on the full (not feature-selected) dataset with each
# of the 5 folds as the validation fold, printing the details and saving every
# fitted model.
models = {'SVM Classifier': SVM_model}

fold_outputs = [
    run(fold_id, df, models=models, target_name='stroke',
        save_model=True, print_details=True)
    for fold_id in range(5)
]

# Regroup the per-fold (accuracy, confusion matrices, reports) triples into
# three parallel lists with one entry per fold.
accuracies = [output[0] for output in fold_outputs]
confusion_matrices = [output[1] for output in fold_outputs]
classification_reports = [output[2] for output in fold_outputs]

In [None]:
# Evaluate the tuned XGBoost model with each of the 5 folds as the validation
# fold, printing the details and saving every fitted model.
models = {'XGB Classifier': XGB_model}

fold_outputs = [
    run(fold_id, df_optimal_XGB, models=models, target_name='stroke',
        save_model=True, print_details=True)
    for fold_id in range(5)
]

# Regroup the per-fold (accuracy, confusion matrices, reports) triples into
# three parallel lists with one entry per fold.
accuracies = [output[0] for output in fold_outputs]
confusion_matrices = [output[1] for output in fold_outputs]
classification_reports = [output[2] for output in fold_outputs]

# Conclusion:

## Random Forest, Decision Tree and XGBoost Classifier all performed about the same, with accuracies of 95% on both the training and validation sets. Hence, no overfitting was observed.
## SVM had an accuracy of 94% on the validation set. This slightly lower accuracy (compared with the other 3 models) could be attributed to the fact that feature selection could not be applied because of time constraints. However, an important thing to note is that there was some overfitting here, as the training accuracy was surprisingly 100% while the validation accuracy was 94%.

## If you like my work, an upvote would be great!