In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder # Handling categorical data
from sklearn.impute import SimpleImputer # handling missing values
from sklearn.pipeline import Pipeline # Pipelines
from sklearn.compose import ColumnTransformer
# Model 
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
df = pd.read_csv(r'E:\ML projects\LoanPredictionBasedOnCustomerBehavior\notebooks\data\Training Data.csv')
df.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [3]:
df.shape

(252000, 13)

In [4]:
df['Risk_Flag'].value_counts()## Imbalanced data

Risk_Flag
0    221004
1     30996
Name: count, dtype: int64

In [5]:
X  = df.drop(labels=['Risk_Flag'], axis=1)
Y  = df['Risk_Flag']
X = X[X.columns[~X.columns.isin(['Customer_ID'])]] # Drop Customer_ID
X = X[:100000] # For testing purpose
Y = Y[:100000] # For testing purpose
X.shape, Y.shape    

((100000, 12), (100000,))

In [6]:
# Define which column should be ordinal encoded and which should be one hot encoded
numerical_columns = X.select_dtypes(exclude="object").columns
categorical_columns = X.select_dtypes(include="object").columns

In [7]:
# Numerical pipeline

num_pipeline = Pipeline(
      steps=[
          ('imputer',SimpleImputer(strategy='median')),
          ('scalar',StandardScaler())
      ]

)


# Categorical pipeline

cat_pipeline = Pipeline(
      steps=[
          ('imputer',SimpleImputer(strategy='most_frequent')),
          ("OneHotEncoder",OneHotEncoder(sparse=False,drop='first')),
          ('scalar',StandardScaler())
      ]

)

proccessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline', cat_pipeline, categorical_columns)
])

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)   

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((80000, 12), (20000, 12), (80000,), (20000,))

In [10]:
X_train = pd.DataFrame(proccessor.fit_transform(X_train), columns=proccessor.get_feature_names_out())
X_test = pd.DataFrame(proccessor.transform(X_test), columns= proccessor.get_feature_names_out())



In [11]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)

In [12]:
X_resampled_smote.shape, y_resampled_smote.shape, X_test.shape, y_test.shape

((139098, 404), (139098,), (20000, 404), (20000,))

In [13]:
## Evaluating the Model
def evaluate_classification_model(true, predicted, prob_predictions=None):
    # Accuracy
    accuracy = accuracy_score(true, predicted)
    
    # Precision
    precision = precision_score(true, predicted)
    
    # Recall
    recall = recall_score(true, predicted)
    
    # F1-Score
    f1 = f1_score(true, predicted)
    
    # Confusion Matrix
    cm = confusion_matrix(true, predicted)
    
    return accuracy, precision, recall, f1,cm


In [14]:
from sklearn.model_selection import GridSearchCV         
## Define models
models = {
                   "LogisticRegression": LogisticRegression(),
                   "LogisticRegressionCV": LogisticRegressionCV(cv=5),
                   "DecisionTreeClassifier": DecisionTreeClassifier(),
                   "RandomForestClassifier": RandomForestClassifier(),
                   "ExtratreesClassifier": ExtraTreesClassifier(),
                   "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=5),
                   "AdaBoostClassifier": AdaBoostClassifier(),
                   "GradientBoostingClassifier": GradientBoostingClassifier(),
                   "SVC": SVC(),
                   "XGBClassifier": XGBClassifier(),
                   "LGBMClassifier": LGBMClassifier(),
                   "CatBoostClassifier": CatBoostClassifier(silent=True)
}

            ## Define hyperparameter grids for each model
param_grids = {
                "LogisticRegression": {
                    'C': [0.1, 1, 10],
                    'solver': ['lbfgs', 'liblinear'],
                    'max_iter': [100, 200, 300]
                },
                "KNeighborsClassifier": {
                    'n_neighbors': [3, 5, 7, 9],
                    'weights': ['uniform', 'distance'],
                    'algorithm': ['auto', 'ball_tree', 'kd_tree']
                },
                "AdaBoostClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 1]
                },
                "GradientBoostingClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.5],
                    'max_depth': [3, 5, 7]
                },
                "SVC": {
                    'C': [0.1, 1, 10],
                    'kernel': ['linear', 'rbf', 'poly'],
                    'gamma': ['scale', 'auto']
                },
                "XGBClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 7]
                },
                "LGBMClassifier": {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 7]
                },
                "CatBoostClassifier": {
                    'iterations': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'depth': [3, 5, 7]
                },
                 "LogisticRegressionCV": {
                    'Cs': [0.1, 1, 10],
                     'cv': [3, 5]
            },
                "DecisionTreeClassifier": {
                    'criterion': ['gini', 'entropy'],
                    'splitter': ['best', 'random'],
                    'max_depth': [3, 5, 7]
                },
                "RandomForestClassifier": {
                    'n_estimators': [50, 100, 200],
                    'criterion': ['gini', 'entropy'],
                    'max_depth': [3, 5, 7]
                },
                "DecisionTreeClassifier": {
                    'max_depth': [10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                },
                "RandomForestClassifier": {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5],
                },
                "ExtratreesClassifier": {
                    'n_estimators': [50, 100],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2]
                },
            }

train_model_list = []
model_list = []
accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    print(model)
    print("="*35)
    param_grid = param_grids[list(models.keys())[i]]
    grid_search = GridSearchCV(model, param_grid, cv=2, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_resampled_smote, y_resampled_smote)

    model.set_params(**grid_search.best_params_)
    model.fit(X_resampled_smote, y_resampled_smote)  

    # Make Prediction

    y_pred = model.predict(X_test)

    accuracy, precision, recall, f1,cm =  evaluate_classification_model(y_test,y_pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Training Performance")
    print("Accuracy", accuracy)
    print("Precision", precision)
    print("Recall score", recall)
    print("f1_score", f1)
    print("Confusion Matrix", cm)


    accuracy_list.append(accuracy)

    print("="*35)

    print("\n")

            

LogisticRegression()
Fitting 2 folds for each of 18 candidates, totalling 36 fits
LogisticRegression
Model Training Performance
Accuracy 0.60595
Precision 0.19286125503742085
Recall score 0.6578947368421053
f1_score 0.2982815421600926
Confusion Matrix [[10444  7010]
 [  871  1675]]


LogisticRegressionCV(cv=5)
Fitting 2 folds for each of 6 candidates, totalling 12 fits


4 fits failed out of a total of 12.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "e:\ML projects\AutomobileLoanDefaultPrediction\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\ML projects\AutomobileLoanDefaultPrediction\venv\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "e:\ML projects\AutomobileLoanDefaultPrediction\venv\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "e:\ML projects\AutomobileLoanDefaultPrediction\venv\lib\site-packages\

LogisticRegressionCV
Model Training Performance
Accuracy 0.60655
Precision 0.1930573174950986
Recall score 0.6575019638648861
f1_score 0.29847552821610057
Confusion Matrix [[10457  6997]
 [  872  1674]]


DecisionTreeClassifier()
Fitting 2 folds for each of 27 candidates, totalling 54 fits
DecisionTreeClassifier
Model Training Performance
Accuracy 0.7142
Precision 0.28656073256127124
Recall score 0.835820895522388
f1_score 0.4267950260730045
Confusion Matrix [[12156  5298]
 [  418  2128]]


RandomForestClassifier()
Fitting 2 folds for each of 18 candidates, totalling 36 fits
RandomForestClassifier
Model Training Performance
Accuracy 0.93885
Precision 0.7071719386157219
Recall score 0.8868813825608798
f1_score 0.7868966718940582
Confusion Matrix [[16519   935]
 [  288  2258]]


ExtraTreesClassifier()
Fitting 2 folds for each of 24 candidates, totalling 48 fits
ExtratreesClassifier
Model Training Performance
Accuracy 0.93535
Precision 0.6815415821501014
Recall score 0.9238020424194815
f1

KeyboardInterrupt: 