## Model Training
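- In this notebook, a purchase classifier and a revenue regressor are trained for each of the three products (consumer loan, credit card, mutual fund) using the already processed datasets.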

### Import Libraries
- Import all essential libraries

In [31]:
import xgboost as xgb
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
import pickle
import warnings
warnings.filterwarnings('ignore')

### Data Loading From csv
- This section loads the train and test data used for training and evaluation.

In [2]:
# Consumer Loan: feature matrix, purchase labels and revenue targets for the
# train and test splits. The CSV files carry no header row.
(
    consumer_loan_train,
    consumer_loan_train_classification,
    consumer_loan_train_revenue,
    consumer_loan_test,
    consumer_loan_test_classification,
    consumer_loan_test_revenue,
) = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        'processed_dataset_to_train/consumer_loan_train.csv',
        'processed_dataset_to_train/consumer_loan_revenue_train_classification.csv',
        'processed_dataset_to_train/consumer_loan_revenue_train_revenue.csv',
        'processed_dataset_to_train/consumer_loan_test.csv',
        'processed_dataset_to_train/consumer_loan_revenue_test_classification.csv',
        'processed_dataset_to_train/consumer_loan_revenue_test_revenue.csv',
    )
)
# XGBoost DMatrix views: *_reg pairs the features with revenue, *_cls with the labels.
consumer_loan_train_reg = xgb.DMatrix(data=consumer_loan_train, label=consumer_loan_train_revenue)
consumer_loan_test_reg = xgb.DMatrix(data=consumer_loan_test, label=consumer_loan_test_revenue)
consumer_loan_train_cls = xgb.DMatrix(data=consumer_loan_train, label=consumer_loan_train_classification)
consumer_loan_test_cls = xgb.DMatrix(data=consumer_loan_test, label=consumer_loan_test_classification)


# Credit Card: same six files and the same DMatrix pairing as above.
(
    credit_card_train,
    credit_card_train_classification,
    credit_card_train_revenue,
    credit_card_test,
    credit_card_test_classification,
    credit_card_test_revenue,
) = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        'processed_dataset_to_train/credit_card_train.csv',
        'processed_dataset_to_train/credit_card_revenue_train_classification.csv',
        'processed_dataset_to_train/credit_card_revenue_train_revenue.csv',
        'processed_dataset_to_train/credit_card_test.csv',
        'processed_dataset_to_train/credit_card_revenue_test_classification.csv',
        'processed_dataset_to_train/credit_card_revenue_test_revenue.csv',
    )
)
credit_card_train_reg = xgb.DMatrix(data=credit_card_train, label=credit_card_train_revenue)
credit_card_test_reg = xgb.DMatrix(data=credit_card_test, label=credit_card_test_revenue)
credit_card_train_cls = xgb.DMatrix(data=credit_card_train, label=credit_card_train_classification)
credit_card_test_cls = xgb.DMatrix(data=credit_card_test, label=credit_card_test_classification)

# Mutual Fund: same six files and the same DMatrix pairing as above.
(
    mutual_fund_train,
    mutual_fund_train_classification,
    mutual_fund_train_revenue,
    mutual_fund_test,
    mutual_fund_test_classification,
    mutual_fund_test_revenue,
) = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        'processed_dataset_to_train/mutual_fund_train.csv',
        'processed_dataset_to_train/mutual_fund_revenue_train_classification.csv',
        'processed_dataset_to_train/mutual_fund_revenue_train_revenue.csv',
        'processed_dataset_to_train/mutual_fund_test.csv',
        'processed_dataset_to_train/mutual_fund_revenue_test_classification.csv',
        'processed_dataset_to_train/mutual_fund_revenue_test_revenue.csv',
    )
)
mutual_fund_train_reg = xgb.DMatrix(data=mutual_fund_train, label=mutual_fund_train_revenue)
mutual_fund_test_reg = xgb.DMatrix(data=mutual_fund_test, label=mutual_fund_test_revenue)
mutual_fund_train_cls = xgb.DMatrix(data=mutual_fund_train, label=mutual_fund_train_classification)
mutual_fund_test_cls = xgb.DMatrix(data=mutual_fund_test, label=mutual_fund_test_classification)

- Helper function that computes and prints the classification metrics

In [3]:
def classification_accuracy(y_val,y_pred,y_pred_proba):
    """Print a summary of binary-classification quality.

    Parameters
    ----------
    y_val : true labels.
    y_pred : predicted hard labels, compared against ``y_val`` for accuracy,
        precision, recall, F1, the confusion matrix and the per-class report.
    y_pred_proba : predicted scores for the positive class, used only for ROC AUC.

    Returns
    -------
    None. Every metric is computed first and then written to stdout.
    """
    # Scalar scores, kept in the order in which they are reported.
    scalar_scores = (
        ('Accuracy', accuracy_score(y_val, y_pred)),
        ('Precision', precision_score(y_val, y_pred)),
        ('Recall', recall_score(y_val, y_pred)),
        ('F1-score', f1_score(y_val, y_pred)),
        ('ROC AUC', roc_auc_score(y_val, y_pred_proba)),
    )
    # Structured summaries: confusion matrix and per-class precision/recall/F1.
    matrix = confusion_matrix(y_val, y_pred)
    per_class_report = classification_report(y_val, y_pred)

    for label, score in scalar_scores:
        print(f'{label}: {score:.4f}')
    print('\nConfusion Matrix:')
    print(matrix)
    print('\nClassification Report:')
    print(per_class_report)

### Consumer Loan Model Training

##### XG Boost classifier model Training
- To classify if Consumer Loan can be sold to the Client

In [4]:
# Class-imbalance weight for the consumer-loan classifier:
# (number of negative labels) / (number of positive labels), truncated to an int.
# Labels are expected to be 0/1, so their sum is the positive count.
# The label frame is reduced to a plain scalar first: calling int() on a
# one-element pandas Series is deprecated and raised a warning in this cell.
n_positive = consumer_loan_train_classification.to_numpy().sum()
n_negative = consumer_loan_train_classification.shape[0] - n_positive
scale_pos_weight_val = int(n_negative / n_positive)
scale_pos_weight_val

  scale_pos_weight_val = int ((consumer_loan_train_classification.shape[0]-consumer_loan_train_classification.sum())/ consumer_loan_train_classification.sum())


2

- Model training

In [5]:
# Hyper-parameters of the consumer-loan purchase classifier.
cl_classifier_params = dict(
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight_val,  # up-weights the positive class
    n_estimators=200,        # maximum number of boosting rounds
    max_depth=4,             # shallow trees to limit overfitting
    min_child_weight=2,      # minimum sum of instance weight required in a child
    subsample=0.8,           # fraction of rows sampled per boosting round
    colsample_bytree=0.8,    # fraction of features sampled per tree
    learning_rate=0.5,       # step-size shrinkage applied to every tree
    gamma=0,                 # no minimum loss reduction required to split
    reg_alpha=0.1,           # L1 regularization (Lasso)
    reg_lambda=1.0,          # L2 regularization (Ridge)
    # NOTE(review): 250 rounds of patience exceeds n_estimators=200, so training
    # can never be cut short by early stopping - confirm the intended value.
    early_stopping_rounds=250,
)
xgb_classifier_CL = xgb.XGBClassifier(**cl_classifier_params)

# NOTE(review): the test split doubles as the early-stopping eval set, so
# metrics reported on it afterwards may be optimistic.
xgb_classifier_CL.fit(
    consumer_loan_train,
    consumer_loan_train_classification,
    eval_set=[(consumer_loan_test, consumer_loan_test_classification)],
    verbose=False,
)

In [6]:
# Save the fitted consumer-loan classifier.
# The context manager closes the file handle deterministically; the previous
# bare open() call never closed it.
with open('model/xgb_classifier_CL.pkl', "wb") as model_file:
    pickle.dump(xgb_classifier_CL, model_file)

- Classification Metrics

In [7]:
# Evaluate the consumer-loan classifier on the test split: positive-class
# probabilities feed ROC AUC, hard labels feed the remaining metrics.
y_pred_proba = xgb_classifier_CL.predict_proba(consumer_loan_test)[:, 1]
y_pred = xgb_classifier_CL.predict(consumer_loan_test)
classification_accuracy(
    y_val=consumer_loan_test_classification,
    y_pred=y_pred,
    y_pred_proba=y_pred_proba,
)

Accuracy: 0.6875
Precision: 0.2692
Recall: 0.3889
F1-score: 0.3182
ROC AUC: 0.5897

Confusion Matrix:
[[59 19]
 [11  7]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.84      0.76      0.80        78
         1.0       0.27      0.39      0.32        18

    accuracy                           0.69        96
   macro avg       0.56      0.57      0.56        96
weighted avg       0.74      0.69      0.71        96



##### XG Boost regression model Training
- Train a regression model to predict the revenue.

In [8]:
# Restrict the consumer-loan regression data to rows with revenue > 0.
#
# Rows are selected with a positional boolean mask instead of temporarily
# attaching the target under a hard-coded column label: that label had to match
# the feature count exactly (otherwise a real feature would be overwritten and
# then dropped), and the cell failed when executed a second time.
# np.asarray(...).ravel() accepts both the one-column frame loaded from CSV and
# the Series this cell produces, so re-running the cell is a no-op.
train_revenue_values = np.asarray(consumer_loan_train_revenue).ravel()
test_revenue_values = np.asarray(consumer_loan_test_revenue).ravel()
train_keep = train_revenue_values > 0
test_keep = test_revenue_values > 0

# .loc with a boolean array raises if features and targets differ in length.
consumer_loan_train = consumer_loan_train.loc[train_keep]
consumer_loan_test = consumer_loan_test.loc[test_keep]

# Targets keep the row index of the matching feature rows.
consumer_loan_train_revenue = pd.Series(train_revenue_values[train_keep], index=consumer_loan_train.index)
consumer_loan_test_revenue = pd.Series(test_revenue_values[test_keep], index=consumer_loan_test.index)

# NOTE(review): from here on consumer_loan_train / consumer_loan_test hold only
# the positive-revenue rows; re-running the classifier cells above afterwards
# would train on this subset.
print(consumer_loan_train.shape)

(271, 9)


- Model training

In [9]:
# Hyper-parameters of the consumer-loan revenue regressor.
cl_regressor_params = dict(
    objective='reg:squarederror',  # squared-error regression loss
    learning_rate=0.01,            # small step size per boosting round
    n_estimators=500,              # maximum number of boosting rounds
    max_depth=4,                   # shallow trees to limit model complexity
    min_child_weight=4,            # minimum sum of instance weight required in a child
    subsample=0.8,                 # fraction of rows sampled per boosting round
    colsample_bytree=0.8,          # fraction of features sampled per tree
    gamma=0.5,                     # minimum loss reduction required to make a split
    early_stopping_rounds=250,     # patience, measured on the eval set passed to fit()
)
xgb_regressor_CL = xgb.XGBRegressor(**cl_regressor_params)

# Fit on the positive-revenue training rows.
# NOTE(review): the test split doubles as the early-stopping eval set, so the
# test metrics reported afterwards may be optimistic.
xgb_regressor_CL.fit(
    consumer_loan_train,
    consumer_loan_train_revenue,
    eval_set=[(consumer_loan_test, consumer_loan_test_revenue)],
    verbose=False,
)

- Model evaluation accuracy

In [10]:
# Score the consumer-loan regressor on the rows it was trained on.
y_pred = xgb_regressor_CL.predict(consumer_loan_train)
mse, r2 = (
    metric(consumer_loan_train_revenue, y_pred)
    for metric in (mean_squared_error, r2_score)
)
print(f'Mean Squared Error: {mse:.4f}', f'R²: {r2:.4f}', sep='\n')

Mean Squared Error: 102.3393
R²: 0.0065


In [11]:
# Score the consumer-loan regressor on the test split.
y_pred = xgb_regressor_CL.predict(consumer_loan_test)
mse, r2 = (
    metric(consumer_loan_test_revenue, y_pred)
    for metric in (mean_squared_error, r2_score)
)
print(f'Mean Squared Error: {mse:.4f}', f'R²: {r2:.4f}', sep='\n')

Mean Squared Error: 33.1067
R²: 0.0008


- Save model

In [12]:
# Save the fitted consumer-loan regressor.
# The context manager closes the file handle deterministically; the previous
# bare open() call never closed it.
with open('model/xgb_regressor_CL.pkl', "wb") as model_file:
    pickle.dump(xgb_regressor_CL, model_file)

### Credit Card Model Training

##### XG Boost classifier model Training
- To classify if Credit Card can be sold to the Client

In [13]:
#This value will be used to tackle the unbalanced dataset.
# Class-imbalance weight for the credit-card classifier:
# (number of negative labels) / (number of positive labels), truncated to an int.
# Labels are expected to be 0/1, so their sum is the positive count.
# The label frame is reduced to a plain scalar first: calling int() on a
# one-element pandas Series is deprecated and raised a warning in this cell.
n_positive = credit_card_train_classification.to_numpy().sum()
n_negative = credit_card_train_classification.shape[0] - n_positive
scale_pos_weight_val = int(n_negative / n_positive)
scale_pos_weight_val

  scale_pos_weight_val = int ((credit_card_train_classification.shape[0]-credit_card_train_classification.sum())/ credit_card_train_classification.sum())


3

- Model training

In [14]:
# Hyper-parameters of the credit-card purchase classifier.
cc_classifier_params = dict(
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight_val,  # up-weights the positive class
    n_estimators=200,        # maximum number of boosting rounds
    max_depth=4,             # shallow trees to limit overfitting
    min_child_weight=2,      # minimum sum of instance weight required in a child
    subsample=0.8,           # fraction of rows sampled per boosting round
    colsample_bytree=0.8,    # fraction of features sampled per tree
    learning_rate=0.5,       # step-size shrinkage applied to every tree
    gamma=0,                 # no minimum loss reduction required to split
    reg_alpha=0.1,           # L1 regularization (Lasso)
    reg_lambda=1.0,          # L2 regularization (Ridge)
    # NOTE(review): 250 rounds of patience exceeds n_estimators=200, so training
    # can never be cut short by early stopping - confirm the intended value.
    early_stopping_rounds=250,
)
xgb_classifier_CC = xgb.XGBClassifier(**cc_classifier_params)

# NOTE(review): the test split doubles as the early-stopping eval set, so
# metrics reported on it afterwards may be optimistic.
xgb_classifier_CC.fit(
    credit_card_train,
    credit_card_train_classification,
    eval_set=[(credit_card_test, credit_card_test_classification)],
    verbose=False,
)

In [15]:
# Save the fitted credit-card classifier.
# The context manager closes the file handle deterministically; the previous
# bare open() call never closed it.
with open('model/xgb_classifier_CC.pkl', "wb") as model_file:
    pickle.dump(xgb_classifier_CC, model_file)

- Classification metrics

In [16]:
# Evaluate the credit-card classifier on the test split: positive-class
# probabilities feed ROC AUC, hard labels feed the remaining metrics.
y_pred_proba = xgb_classifier_CC.predict_proba(credit_card_test)[:, 1]
y_pred = xgb_classifier_CC.predict(credit_card_test)
classification_accuracy(
    y_val=credit_card_test_classification,
    y_pred=y_pred,
    y_pred_proba=y_pred_proba,
)

Accuracy: 0.6771
Precision: 0.5152
Recall: 0.5312
F1-score: 0.5231
ROC AUC: 0.6631

Confusion Matrix:
[[48 16]
 [15 17]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      0.75      0.76        64
         1.0       0.52      0.53      0.52        32

    accuracy                           0.68        96
   macro avg       0.64      0.64      0.64        96
weighted avg       0.68      0.68      0.68        96



##### XG Boost regression model Training
- Train a regression model to predict the revenue.

In [17]:
# Restrict the credit-card regression data to rows with revenue > 0.
#
# Rows are selected with a positional boolean mask instead of temporarily
# attaching the target under a hard-coded column label: that label had to match
# the feature count exactly (otherwise a real feature would be overwritten and
# then dropped), and the cell failed when executed a second time.
# np.asarray(...).ravel() accepts both the one-column frame loaded from CSV and
# the Series this cell produces, so re-running the cell is a no-op.
train_revenue_values = np.asarray(credit_card_train_revenue).ravel()
test_revenue_values = np.asarray(credit_card_test_revenue).ravel()
train_keep = train_revenue_values > 0
test_keep = test_revenue_values > 0

# .loc with a boolean array raises if features and targets differ in length.
credit_card_train = credit_card_train.loc[train_keep]
credit_card_test = credit_card_test.loc[test_keep]

# Targets keep the row index of the matching feature rows.
credit_card_train_revenue = pd.Series(train_revenue_values[train_keep], index=credit_card_train.index)
credit_card_test_revenue = pd.Series(test_revenue_values[test_keep], index=credit_card_test.index)

# NOTE(review): from here on credit_card_train / credit_card_test hold only the
# positive-revenue rows; re-running the classifier cells above afterwards would
# train on this subset.
print(credit_card_train.shape)

(205, 9)


- Model training

In [18]:
# Hyper-parameters of the credit-card revenue regressor.
cc_regressor_params = dict(
    objective='reg:squarederror',  # squared-error regression loss
    learning_rate=0.1,             # step-size shrinkage per boosting round
    n_estimators=1000,             # maximum number of boosting rounds
    max_depth=8,                   # maximum depth of each tree
    min_child_weight=1,            # minimum sum of instance weight required in a child
    subsample=1,                   # every row is used in each boosting round
    colsample_bytree=1,            # every feature is available to each tree
    gamma=0,                       # no minimum loss reduction required to split
    # NOTE(review): patience equals n_estimators, so early stopping cannot
    # shorten training - confirm the intended value.
    early_stopping_rounds=1000,
)
xgb_regressor_CC = xgb.XGBRegressor(**cc_regressor_params)

# Fit on the positive-revenue training rows.
# NOTE(review): the test split doubles as the early-stopping eval set, so the
# test metrics reported afterwards may be optimistic.
xgb_regressor_CC.fit(
    credit_card_train,
    credit_card_train_revenue,
    eval_set=[(credit_card_test, credit_card_test_revenue)],
    verbose=False,
)

- Model training metrics

In [19]:
# Score the credit-card regressor on the rows it was trained on.
y_pred = xgb_regressor_CC.predict(credit_card_train)
mse, r2 = (
    metric(credit_card_train_revenue, y_pred)
    for metric in (mean_squared_error, r2_score)
)
print(f'Mean Squared Error: {mse:.4f}', f'R²: {r2:.4f}', sep='\n')

Mean Squared Error: 659.5908
R²: 0.4068


In [20]:
# Score the credit-card regressor on the test split.
y_pred = xgb_regressor_CC.predict(credit_card_test)
mse, r2 = (
    metric(credit_card_test_revenue, y_pred)
    for metric in (mean_squared_error, r2_score)
)
print(f'Mean Squared Error: {mse:.4f}', f'R²: {r2:.4f}', sep='\n')

Mean Squared Error: 1671.6245
R²: -0.0484


- Save model

In [21]:
# Save the fitted credit-card regressor.
# The context manager closes the file handle deterministically; the previous
# bare open() call never closed it.
with open('model/xgb_regressor_CC.pkl', "wb") as model_file:
    pickle.dump(xgb_regressor_CC, model_file)

### Mutual Fund Model Training

##### XG Boost classifier model Training
- To classify if Mutual Fund can be sold to the Client

In [22]:
#This value will be used to tackle the unbalanced dataset.
# Class-imbalance weight for the mutual-fund classifier:
# (number of negative labels) / (number of positive labels), truncated to an int.
# Labels are expected to be 0/1, so their sum is the positive count.
# The label frame is reduced to a plain scalar first: calling int() on a
# one-element pandas Series is deprecated and raised a warning in this cell.
n_positive = mutual_fund_train_classification.to_numpy().sum()
n_negative = mutual_fund_train_classification.shape[0] - n_positive
scale_pos_weight_val = int(n_negative / n_positive)
scale_pos_weight_val

  scale_pos_weight_val = int ((mutual_fund_train_classification.shape[0]-mutual_fund_train_classification.sum())/ mutual_fund_train_classification.sum())


4

- Model training

In [23]:
# Hyper-parameters of the mutual-fund purchase classifier.
mf_classifier_params = dict(
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight_val,  # up-weights the positive class
    n_estimators=200,        # maximum number of boosting rounds
    max_depth=4,             # shallow trees to limit overfitting
    min_child_weight=2,      # minimum sum of instance weight required in a child
    subsample=0.8,           # fraction of rows sampled per boosting round
    colsample_bytree=0.8,    # fraction of features sampled per tree
    learning_rate=0.5,       # step-size shrinkage applied to every tree
    gamma=0,                 # no minimum loss reduction required to split
    reg_alpha=0.1,           # L1 regularization (Lasso)
    reg_lambda=1.0,          # L2 regularization (Ridge)
    # NOTE(review): 250 rounds of patience exceeds n_estimators=200, so training
    # can never be cut short by early stopping - confirm the intended value.
    early_stopping_rounds=250,
)
xgb_classifier_MF = xgb.XGBClassifier(**mf_classifier_params)

# NOTE(review): the test split doubles as the early-stopping eval set, so
# metrics reported on it afterwards may be optimistic.
xgb_classifier_MF.fit(
    mutual_fund_train,
    mutual_fund_train_classification,
    eval_set=[(mutual_fund_test, mutual_fund_test_classification)],
    verbose=False,
)

In [24]:
# Save the fitted mutual-fund classifier.
# The context manager closes the file handle deterministically; the previous
# bare open() call never closed it.
with open('model/xgb_classifier_MF.pkl', "wb") as model_file:
    pickle.dump(xgb_classifier_MF, model_file)

- Classification model metrics

In [25]:
# Evaluate the mutual-fund classifier on the test split: positive-class
# probabilities feed ROC AUC, hard labels feed the remaining metrics.
y_pred_proba = xgb_classifier_MF.predict_proba(mutual_fund_test)[:, 1]
y_pred = xgb_classifier_MF.predict(mutual_fund_test)
classification_accuracy(
    y_val=mutual_fund_test_classification,
    y_pred=y_pred,
    y_pred_proba=y_pred_proba,
)

Accuracy: 0.6250
Precision: 0.3226
Recall: 0.4000
F1-score: 0.3571
ROC AUC: 0.5673

Confusion Matrix:
[[50 21]
 [15 10]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.70      0.74        71
         1.0       0.32      0.40      0.36        25

    accuracy                           0.62        96
   macro avg       0.55      0.55      0.55        96
weighted avg       0.65      0.62      0.64        96



##### XG Boost regression model Training
- Train a regression model to predict the revenue.

In [26]:
# Restrict the mutual-fund regression data to rows with revenue > 0.
#
# Rows are selected with a positional boolean mask instead of temporarily
# attaching the target under a hard-coded column label: that label had to match
# the feature count exactly (otherwise a real feature would be overwritten and
# then dropped), and the cell failed when executed a second time.
# np.asarray(...).ravel() accepts both the one-column frame loaded from CSV and
# the Series this cell produces, so re-running the cell is a no-op.
train_revenue_values = np.asarray(mutual_fund_train_revenue).ravel()
test_revenue_values = np.asarray(mutual_fund_test_revenue).ravel()
train_keep = train_revenue_values > 0
test_keep = test_revenue_values > 0

# .loc with a boolean array raises if features and targets differ in length.
mutual_fund_train = mutual_fund_train.loc[train_keep]
mutual_fund_test = mutual_fund_test.loc[test_keep]

# Targets keep the row index of the matching feature rows.
mutual_fund_train_revenue = pd.Series(train_revenue_values[train_keep], index=mutual_fund_train.index)
mutual_fund_test_revenue = pd.Series(test_revenue_values[test_keep], index=mutual_fund_test.index)

# NOTE(review): from here on mutual_fund_train / mutual_fund_test hold only the
# positive-revenue rows; re-running the classifier cells above afterwards would
# train on this subset.
print(mutual_fund_train.shape)


(168, 10)


- Model training

In [27]:
# Hyper-parameters of the mutual-fund revenue regressor.
mf_regressor_params = dict(
    objective='reg:squarederror',  # squared-error regression loss
    learning_rate=0.1,             # step-size shrinkage per boosting round
    n_estimators=1000,             # maximum number of boosting rounds
    max_depth=8,                   # maximum depth of each tree
    min_child_weight=1,            # minimum sum of instance weight required in a child
    subsample=1,                   # every row is used in each boosting round
    colsample_bytree=1,            # every feature is available to each tree
    gamma=0,                       # no minimum loss reduction required to split
    # NOTE(review): patience equals n_estimators, so early stopping cannot
    # shorten training - confirm the intended value.
    early_stopping_rounds=1000,
)
xgb_regressor_MF = xgb.XGBRegressor(**mf_regressor_params)

# Fit on the positive-revenue training rows.
# NOTE(review): the test split doubles as the early-stopping eval set, so the
# test metrics reported afterwards may be optimistic.
xgb_regressor_MF.fit(
    mutual_fund_train,
    mutual_fund_train_revenue,
    eval_set=[(mutual_fund_test, mutual_fund_test_revenue)],
    verbose=False,
)

- Model evaluation metrics

In [28]:
# Score the mutual-fund regressor on the rows it was trained on.
y_pred = xgb_regressor_MF.predict(mutual_fund_train)
mse, r2 = (
    metric(mutual_fund_train_revenue, y_pred)
    for metric in (mean_squared_error, r2_score)
)
print(f'Mean Squared Error: {mse:.4f}', f'R²: {r2:.4f}', sep='\n')

Mean Squared Error: 408.6343
R²: 0.1080


In [29]:
# Score the mutual-fund regressor on the test split.
y_pred = xgb_regressor_MF.predict(mutual_fund_test)
mse, r2 = (
    metric(mutual_fund_test_revenue, y_pred)
    for metric in (mean_squared_error, r2_score)
)
print(f'Mean Squared Error: {mse:.4f}', f'R²: {r2:.4f}', sep='\n')

Mean Squared Error: 176.4631
R²: -0.0246


- Save model

In [30]:
# Save the fitted mutual-fund regressor.
# The context manager closes the file handle deterministically; the previous
# bare open() call never closed it.
with open('model/xgb_regressor_MF.pkl', "wb") as model_file:
    pickle.dump(xgb_regressor_MF, model_file)