In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder # Handling categorical data
from sklearn.impute import SimpleImputer # handling missing values
from sklearn.pipeline import Pipeline # Pipelines
from sklearn.compose import ColumnTransformer
# Model 
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
df = pd.read_csv("data/cleaned_data.csv")
df.head()

Unnamed: 0,Client_Income,Car_Owned,Bike_Owned,Active_Loan,House_Own,Child_Count,Client_Education,Client_Marital_Status,Client_Gender,Loan_Contract_Type,...,Workphone_Working,Application_Process_Day,Application_Process_Hour,Client_Permanent_Match_Tag,Client_Contact_Work_Tag,Type_Organization,Score_Source_2,Phone_Change,Credit_Bureau,Default
0,6750.0,no,no,yes,no,0.0,Secondary,M,Male,CL,...,no,6.0,17.0,Yes,Yes,Public Sector,0.478787,63.0,1.0,no
1,20250.0,yes,no,yes,yes,0.0,Graduation,M,Male,CL,...,yes,3.0,10.0,Yes,Yes,Public Sector,0.215068,755.0,1.0,no
2,18000.0,no,no,yes,no,1.0,Graduation dropout,W,Male,CL,...,no,4.0,12.0,Yes,Yes,Public Sector,0.552795,277.0,0.0,no
3,15750.0,no,no,yes,yes,0.0,Secondary,M,Male,CL,...,no,2.0,15.0,Yes,Yes,Unknown,0.135182,1700.0,3.0,no
4,33750.0,yes,no,yes,no,2.0,Secondary,M,Female,CL,...,no,3.0,12.0,Yes,Yes,Business,0.301182,674.0,1.0,no


In [3]:
X  = df.drop(labels=['Default'], axis=1)
Y  = df['Default']

In [4]:
# Define which column should be ordinal encoded and which should be one hot encoded
numerical_columns = X.select_dtypes(exclude="object").columns
categorical_columns = X.select_dtypes(include="object").columns

In [5]:
# Numerical pipeline

num_pipeline = Pipeline(
      steps=[
          ('imputer',SimpleImputer(strategy='median')),
          ('scalar',StandardScaler())
      ]

)


# Categorical pipeline

cat_pipeline = Pipeline(
      steps=[
          ('imputer',SimpleImputer(strategy='most_frequent')),
          ("OneHotEncoder",OneHotEncoder(sparse=False,drop='first')),
          ('scalar',StandardScaler())
      ]

)

proccessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline', cat_pipeline, categorical_columns)
])

In [6]:
# apply label encoder to dependent variable
encoded_y = LabelEncoder()
encoded_y = encoded_y.fit_transform(Y)

In [7]:
encoded_y.shape

(121856,)

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, test_size=0.2, random_state=42)   

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((97484, 23), (24372, 23), (97484,), (24372,))

In [10]:
X_train = pd.DataFrame(proccessor.fit_transform(X_train), columns=proccessor.get_feature_names_out())
X_test = pd.DataFrame(proccessor.transform(X_test), columns= proccessor.get_feature_names_out())



In [11]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)
X_test, y_test = smote.fit_resample(X_test, y_test)

In [12]:
X_resampled_smote.shape, y_resampled_smote.shape, X_test.shape, y_test.shape

((179172, 38), (179172,), (44850, 38), (44850,))

In [13]:
## Evaluating the Model
def evaluate_classification_model(true, predicted, prob_predictions=None):
    # Accuracy
    accuracy = accuracy_score(true, predicted)
    
    # Precision
    precision = precision_score(true, predicted)
    
    # Recall
    recall = recall_score(true, predicted)
    
    # F1-Score
    f1 = f1_score(true, predicted)
    
    # Confusion Matrix
    cm = confusion_matrix(true, predicted)
    
    return accuracy, precision, recall, f1,cm


In [15]:
from sklearn.model_selection import GridSearchCV         
## Define models
models = {
                   "LogisticRegression": LogisticRegression(),
                   "LogisticRegressionCV": LogisticRegressionCV(cv=5),
                   "DecisionTreeClassifier": DecisionTreeClassifier(),
                   "RandomForestClassifier": RandomForestClassifier(),
                   "ExtratreesClassifier": ExtraTreesClassifier(),
                   "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=5),
                   "AdaBoostClassifier": AdaBoostClassifier(),
                   "GradientBoostingClassifier": GradientBoostingClassifier(),
                   "SVC": SVC(),
                   "XGBClassifier": XGBClassifier(),
                   "LGBMClassifier": LGBMClassifier(),
                   "CatBoostClassifier": CatBoostClassifier(silent=True)
}

            ## Define hyperparameter grids for each model
param_grids = {
                "LogisticRegression": {
                    'C': [0.1, 1, 10],
                    'solver': ['lbfgs', 'liblinear'],
                    'max_iter': [100, 200, 300]
                },
                "KNeighborsClassifier": {
                    'n_neighbors': [3, 5, 7, 9],
                    'weights': ['uniform', 'distance'],
                    'algorithm': ['auto', 'ball_tree', 'kd_tree']
                },
                "AdaBoostClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 1]
                },
                "GradientBoostingClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.5],
                    'max_depth': [3, 5, 7]
                },
                "SVC": {
                    'C': [0.1, 1, 10],
                    'kernel': ['linear', 'rbf', 'poly'],
                    'gamma': ['scale', 'auto']
                },
                "XGBClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 7]
                },
                "LGBMClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 7]
                },
                "CatBoostClassifier": {
                    'iterations': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'depth': [3, 5, 7]
                },
                 "LogisticRegressionCV": {
                    'Cs': [0.1, 1, 10],
                     'cv': [3, 5]
            },
                "DecisionTreeClassifier": {
                    'criterion': ['gini', 'entropy'],
                    'splitter': ['best', 'random'],
                    'max_depth': [3, 5, 7]
                },
                "RandomForestClassifier": {
                    'n_estimators': [50, 100, 200],
                    'criterion': ['gini', 'entropy'],
                    'max_depth': [3, 5, 7]
                },
                "DecisionTreeClassifier": {
                    'max_depth': [10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                },
                "RandomForestClassifier": {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5],
                },
                "ExtratreesClassifier": {
                    'n_estimators': [50, 100],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2]
                },
            }

train_model_list = []
model_list = []
accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    param_grid = param_grids[list(models.keys())[i]]
    grid_search = GridSearchCV(model, param_grid, cv=2, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_resampled_smote, y_resampled_smote)

    model.set_params(**grid_search.best_params_)
    model.fit(X_resampled_smote, y_resampled_smote)  

    # Make Prediction

    y_pred = model.predict(X_test)

    accuracy, precision, recall, f1,cm =  evaluate_classification_model(y_test,y_pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Training Performance")
    print("Accuracy", accuracy)
    print("Precision", precision)
    print("Recall score", recall)
    print("f1_score", f1)
    print("Confusion Matrix", cm)


    accuracy_list.append(accuracy)

    print("="*35)

    print("\n")

            

Fitting 2 folds for each of 18 candidates, totalling 36 fits
LogisticRegression
Model Training Performance
Accuracy 0.6562095875139353
Precision 0.6547534900159039
Recall score 0.6609141583054626
f1_score 0.6578194003683895
Confusion Matrix [[14610  7815]
 [ 7604 14821]]


Fitting 2 folds for each of 6 candidates, totalling 12 fits


4 fits failed out of a total of 12.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "e:\ML projects\BrainStrokePrediction\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\ML projects\BrainStrokePrediction\venv\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "e:\ML projects\BrainStrokePrediction\venv\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "e:\ML projects\BrainStrokePrediction\venv\lib\site-packages\sklearn\utils\_param_validation.py", lin

LogisticRegressionCV
Model Training Performance
Accuracy 0.6562318840579711
Precision 0.6547824166114424
Recall score 0.6609141583054626
f1_score 0.6578339991122946
Confusion Matrix [[14611  7814]
 [ 7604 14821]]


Fitting 2 folds for each of 27 candidates, totalling 54 fits
DecisionTreeClassifier
Model Training Performance
Accuracy 0.8696098104793757
Precision 0.9067576188840359
Recall score 0.8239464882943144
f1_score 0.8633708705200691
Confusion Matrix [[20525  1900]
 [ 3948 18477]]


Fitting 2 folds for each of 18 candidates, totalling 36 fits
RandomForestClassifier
Model Training Performance
Accuracy 0.9177480490523969
Precision 0.9912943150828614
Recall score 0.8428985507246377
f1_score 0.9110934374472803
Confusion Matrix [[22259   166]
 [ 3523 18902]]


Fitting 2 folds for each of 24 candidates, totalling 48 fits
ExtratreesClassifier
Model Training Performance
Accuracy 0.8487848383500557
Precision 0.9882639365753169
Recall score 0.705953177257525
f1_score 0.823587555925502
Confu

KeyboardInterrupt: 