In [33]:
import pandas as pd
import numpy as np

In [34]:
train_df = pd.read_csv(r'E:\ML projects\AutomobileLoanDefaultPrediction\artifacts\data_preprocessing\preprocessed_train.csv')
test_df = pd.read_csv(r'E:\ML projects\AutomobileLoanDefaultPrediction\artifacts\data_preprocessing\preprocessed_test.csv')

In [35]:
train_df.shape, test_df.shape

((82640, 24), (36080, 24))

In [36]:
X_train, y_train, X_test, y_test = train_df.drop(['Default'], axis=1), train_df['Default'], test_df.drop(['Default'], axis=1), test_df['Default']

In [37]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((82640, 23), (36080, 23), (82640,), (36080,))

In [38]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder # Handling categorical data
from sklearn.impute import SimpleImputer # handling missing values
from sklearn.pipeline import Pipeline # Pipelines
from sklearn.compose import ColumnTransformer
# Model 
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [39]:
# Define which column should be ordinal encoded and which should be one hot encoded
numerical_columns = X_train.select_dtypes(exclude="object").columns
categorical_columns = X_train.select_dtypes(include="object").columns

In [40]:
# Numerical pipeline

num_pipeline = Pipeline(
      steps=[
          ('imputer',SimpleImputer(strategy='median')),
          ('scalar',StandardScaler())
      ]

)


# Categorical pipeline

cat_pipeline = Pipeline(
      steps=[
          ('imputer',SimpleImputer(strategy='most_frequent')),
          ("OneHotEncoder",OneHotEncoder(sparse=False,drop='first')),
          ('scalar',StandardScaler())
      ]

)

proccessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline', cat_pipeline, categorical_columns)
])

In [41]:
# apply label encoder to dependent variable
encoded_y = LabelEncoder()
encoded_y_train = encoded_y.fit_transform(y_train)
encoded_y_test = encoded_y.transform(y_test)


In [42]:
encoded_y_test.shape, encoded_y_train.shape

((36080,), (82640,))

In [47]:
type(encoded_y_train)

numpy.ndarray

In [43]:
X_train = proccessor.fit_transform(X_train)
X_test = proccessor.transform(X_test)



In [44]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, encoded_y_train)
X_test, y_test = smote.fit_resample(X_test, encoded_y_test)

In [45]:
X_resampled_smote.shape, y_resampled_smote.shape, X_test.shape, y_test.shape

((151872, 38), (151872,), (66358, 38), (66358,))

In [46]:
type(X_resampled_smote)

numpy.ndarray

In [50]:
train_arr = np.c_[X_resampled_smote, np.array(y_resampled_smote)]
test_arr = np.c_[X_test, np.array(y_test)]

In [51]:
train_arr.shape, test_arr.shape

((151872, 39), (66358, 39))

In [52]:
type(train_arr)

numpy.ndarray

In [48]:
## Evaluating the Model
def evaluate_classification_model(true, predicted, prob_predictions=None):
    # Accuracy
    accuracy = accuracy_score(true, predicted)
    
    # Precision
    precision = precision_score(true, predicted)
    
    # Recall
    recall = recall_score(true, predicted)
    
    # F1-Score
    f1 = f1_score(true, predicted)
    
    # Confusion Matrix
    cm = confusion_matrix(true, predicted)
    
    return accuracy, precision, recall, f1,cm


In [54]:
from sklearn.model_selection import GridSearchCV         
## Define models
models = {
                   "LogisticRegression": LogisticRegression(),
                   "LogisticRegressionCV": LogisticRegressionCV(cv=5),
                   "DecisionTreeClassifier": DecisionTreeClassifier(),
                   "RandomForestClassifier": RandomForestClassifier(),
                   "ExtratreesClassifier": ExtraTreesClassifier(),
                   "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=5),
                   "AdaBoostClassifier": AdaBoostClassifier(),
                   "GradientBoostingClassifier": GradientBoostingClassifier(),
                   "SVC": SVC(),
                   "XGBClassifier": XGBClassifier(),
                   "LGBMClassifier": LGBMClassifier(),
                   "CatBoostClassifier": CatBoostClassifier(silent=True)
}

            ## Define hyperparameter grids for each model
param_grids = {
                "LogisticRegression": {
                    'C': [0.1, 1, 10],
                    'solver': ['lbfgs', 'liblinear'],
                    'max_iter': [100, 200, 300]
                },
                "KNeighborsClassifier": {
                    'n_neighbors': [3, 5, 7, 9],
                    'weights': ['uniform', 'distance'],
                    'algorithm': ['auto', 'ball_tree', 'kd_tree']
                },
                "AdaBoostClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 1]
                },
                "GradientBoostingClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.5],
                    'max_depth': [3, 5, 7]
                },
                "SVC": {
                    'C': [0.1, 1, 10],
                    'kernel': ['linear', 'rbf', 'poly'],
                    'gamma': ['scale', 'auto']
                },
                "XGBClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 7]
                },
                "LGBMClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 7]
                },
                "CatBoostClassifier": {
                    'iterations': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'depth': [3, 5, 7]
                },
                 "LogisticRegressionCV": {
                    'Cs': [0.1, 1, 10],
                     'cv': [3, 5]
            },
                "DecisionTreeClassifier": {
                    'max_depth': [10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                },
                "RandomForestClassifier": {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5],
                },
                "ExtratreesClassifier": {
                    'n_estimators': [50, 100],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2]
                },
            }

train_model_list = []
model_list = []
accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    param_grid = param_grids[list(models.keys())[i]]
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_resampled_smote, y_resampled_smote)

    model.set_params(**grid_search.best_params_)
    model.fit(X_resampled_smote, y_resampled_smote)  

    # Make Prediction

    y_pred = model.predict(X_test)

    accuracy, precision, recall, f1,cm =  evaluate_classification_model(y_test,y_pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Training Performance")
    print("Accuracy", accuracy)
    print("Precision", precision)
    print("Recall score", recall)
    print("f1_score", f1)
    print("Confusion Matrix", cm)


    accuracy_list.append(accuracy)

    print("="*35)

    print("\n")

            

Fitting 3 folds for each of 18 candidates, totalling 54 fits
LogisticRegression
Model Training Performance
Accuracy 0.6490852647759124
Precision 0.6490403446925186
Recall score 0.6492359625064047
f1_score 0.6491381388621023
Confusion Matrix [[21531 11648]
 [11638 21541]]


Fitting 3 folds for each of 6 candidates, totalling 18 fits


6 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "e:\ML projects\BrainStrokePrediction\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\ML projects\BrainStrokePrediction\venv\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "e:\ML projects\BrainStrokePrediction\venv\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "e:\ML projects\BrainStrokePrediction\venv\lib\site-packages\sklearn\utils\_param_validation.py", lin

LogisticRegressionCV
Model Training Performance
Accuracy 0.6490249856837156
Precision 0.6490070216677214
Recall score 0.6490852647759124
f1_score 0.6490461408637473
Confusion Matrix [[21532 11647]
 [11643 21536]]


Fitting 3 folds for each of 27 candidates, totalling 81 fits
DecisionTreeClassifier
Model Training Performance
Accuracy 0.8625335302450345
Precision 0.9038171014200825
Recall score 0.8114168600620875
f1_score 0.8551281644061874
Confusion Matrix [[30314  2865]
 [ 6257 26922]]


Fitting 3 folds for each of 18 candidates, totalling 54 fits


KeyboardInterrupt: 