## **Credit Dataset**

In [1]:
#include libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#load data and checking contents
data = pd.read_csv("DT-Credit.csv")
print(data)

#check for null values
print("\nChecking NULL values:\n",data.isnull().sum())

      Income  Limit  Rating  Cards  Age  Education  Own Student Married  \
0     14.891   3606     283      2   34         11   No      No     Yes   
1    106.025   6645     483      3   82         15  Yes     Yes     Yes   
2    104.593   7075     514      4   71         11   No      No      No   
3    148.924   9504     681      3   36         11  Yes      No      No   
4     55.882   4897     357      2   68         16   No      No     Yes   
..       ...    ...     ...    ...  ...        ...  ...     ...     ...   
395   12.096   4100     307      3   32         13   No      No     Yes   
396   13.364   3838     296      5   65         17   No      No      No   
397   57.872   4171     321      5   67         12  Yes      No     Yes   
398   37.728   2525     192      1   44         13   No      No     Yes   
399   18.701   5524     415      5   64          7  Yes      No      No   

    Region  Balance  
0    South      333  
1     West      903  
2     West      580  
3     West 

In [2]:
#preprocessing data
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)
print("\n")

data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

print(data)

Categorical Columns: ['Own', 'Student', 'Married', 'Region']


      Income  Limit  Rating  Cards  Age  Education  Balance  Own_Yes  \
0     14.891   3606     283      2   34         11      333    False   
1    106.025   6645     483      3   82         15      903     True   
2    104.593   7075     514      4   71         11      580    False   
3    148.924   9504     681      3   36         11      964     True   
4     55.882   4897     357      2   68         16      331    False   
..       ...    ...     ...    ...  ...        ...      ...      ...   
395   12.096   4100     307      3   32         13      560    False   
396   13.364   3838     296      5   65         17      480    False   
397   57.872   4171     321      5   67         12      138     True   
398   37.728   2525     192      1   44         13        0    False   
399   18.701   5524     415      5   64          7      966     True   

     Student_Yes  Married_Yes  Region_South  Region_West  
0          Fa

In [11]:
#selecting target variable and features
X = data.drop(['Balance'], axis = 1)
y = data['Balance']

#splitting data for training, validating and testing
X_train = X.iloc[:280]
X_validation = X.iloc[280:340]
X_test = X.iloc[340:]
print(X_validation)
y_train = y.iloc[:280]
y_validation = y.iloc[280:340]
y_test = y.iloc[340:]

      Income  Limit  Rating  Cards  Age  Education  Own_Yes  Student_Yes  \
280   53.401   5319     377      3   35         12     True        False   
281   36.142   1852     183      3   33         13     True        False   
282   63.534   8100     581      2   50         17     True        False   
283   49.927   6396     485      3   75         17     True        False   
284   14.711   2047     167      2   67          6    False        False   
285   18.967   1626     156      2   41         11     True        False   
286   18.036   1552     142      2   48         15     True        False   
287   60.449   3098     272      4   69          8    False        False   
288   16.711   5274     387      3   42         16     True        False   
289   10.852   3907     296      2   30          9    False        False   
290   26.370   3235     268      5   78         11    False        False   
291   24.088   3665     287      4   56         13     True        False   
292   51.532

### **Evaluation Metrics**

In [12]:
def mean_sq_error(y_true, y_pred):
  if len(y_true) != len(y_pred):
    raise ValueError("Lengths of y_true and y_pred must be the same.")

  # Calculate the squared differences between true and predicted values
  squared_errors = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]

  # Calculate the mean of squared errors
  mse = sum(squared_errors) / len(y_true)

  return mse

### **Decision Tree**

In [13]:
#Training the dataset
from sklearn.tree import DecisionTreeRegressor

#Define the Decision Tree Classifier
dt_regressor = DecisionTreeRegressor()

dt = dt_regressor.fit(X_train, y_train)

In [14]:
#Validating the model using validation set
y_pred = dt.predict(X_validation)

mse = mean_sq_error(y_validation, y_pred)
print('Mean Squared Error: ', mse)

Mean Squared Error:  17886.0


In [15]:
#Validating
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer

# Define the hyperparameter grid for grid search
param_grid = {
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

scoring = make_scorer(mean_squared_error, greater_is_better=False)

# Initialize GridSearchCV with cross-validation (e.g., 5-fold)
grid_search = GridSearchCV(estimator=dt_regressor, param_grid=param_grid, scoring=scoring, cv=5)

# Fit GridSearchCV on the combined training and validation sets
X_train_new = X.iloc[:2550]
y_train_new = y.iloc[:2550]
grid_search.fit(X_train_new, y_train_new)

# Get the best model on the test set
best_dt_model = grid_search.best_estimator_

print(best_dt_model)

DecisionTreeRegressor(max_depth=10, min_samples_leaf=2)


In [16]:
#Testing best model using test set
y_pred = best_dt_model.predict(X_test)

mse = mean_sq_error(y_test, y_pred)
print('Mean Squared Error: ', mse)

Mean Squared Error:  1642.9366240412012


### **XGB Regressor**

In [17]:
#Training the dataset
import xgboost as xgb

#Define the XGBRegressor
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

xgbr = xgb_regressor.fit(X_train, y_train)

In [18]:
#Validating the model using validation set
y_pred = dt.predict(X_validation)

mse = mean_sq_error(y_validation, y_pred)
print('Mean Squared Error: ', mse)

Mean Squared Error:  17886.0


In [19]:
#Validating
#Define the hyperparameter grid for grid searching
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}

grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid, cv=3,
                           scoring='neg_mean_squared_error')
grid_search.fit(X_train_new, y_train_new)

# Get the best XGBClassifier model from GridSearchCV
best_xgb_model = grid_search.best_estimator_

print(best_xgb_model)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=300, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)


In [20]:
#Testing best model using test set
y_pred = best_xgb_model.predict(X_test)

mse = mean_sq_error(y_test, y_pred)
print('Mean Squared Error: ', mse)

Mean Squared Error:  214.41414806754548
