In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor 
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('final_df.csv')
df.shape

(100000, 29)

### Dropping columns that are not requried for training

In [3]:
df.drop(columns=['Unnamed: 0', 'ID', 'Customer_ID', 'Name', 'SSN'], inplace=True, axis=1)



In [4]:
X = df.drop(columns=['Credit Score'])
y = df['Credit Score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
X_train.shape, y_train.shape

((80000, 23), (80000,))

In [6]:
X_test.shape, y_test.shape

((20000, 23), (20000,))

In [7]:
# Define your XGBRegressor model
model_rf = RandomForestRegressor()

# Define columns for one-hot encoding
cat_col = X_train.select_dtypes(include='object').columns

# Define columns for standardization
num_col = X_train.select_dtypes('number').columns.tolist()

# Define the pipeline for numerical features
num_pipe = Pipeline(steps=[
    ('scale', StandardScaler())
])

# Define the pipeline for categorical features
cat_pipe = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

# Combine the transformers in a ColumnTransformer
process = ColumnTransformer(transformers=[
    ('num_tran', num_pipe, num_col),
    ('cat_tran', cat_pipe, cat_col)
], remainder='passthrough')

# Create the final pipeline
final_rf = Pipeline(steps=[
    ('process', process),
    ('model', model_rf)
])

# Fit the final model
final_rf.fit(X_train, y_train)

In [8]:
y_pred_rf = final_rf.predict(X_test)

In [9]:
print(f"mean_absolute_error : {mean_absolute_error(y_test, y_pred_rf):.2f}")
print(f"mean_squared_error : {mean_squared_error(y_test, y_pred_rf):.2f}")
print(f"r2_score : {r2_score(y_test, y_pred_rf):.2f}")

mean_absolute_error : 6.31
mean_squared_error : 79.31
r2_score : 0.99


In [10]:
# Define the XGBRegressor model
model_xgb = XGBRegressor(verbose=1)

# Define columns for one-hot encoding
cat_col = X_train.select_dtypes('object').columns.tolist()

# Define columns for standardization
num_col = X_train.select_dtypes('number').columns.tolist()

# Define the pipeline for numerical features
num_pipe = Pipeline(steps=[
    ('scale', StandardScaler())
])

# Define the pipeline for categorical features
cat_pipe = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

# Combine the transformers in a ColumnTransformer
process = ColumnTransformer(transformers=[
    ('num_tran', num_pipe, num_col),
    ('cat_tran', cat_pipe, cat_col)
], remainder='passthrough')

# Create the final pipeline
final_xgb = Pipeline(steps=[
    ('process', process),
    ('model', model_xgb)
])

# Preprocess data to check if it works
try:
    X_processed = process.fit_transform(X_train)
    print("Preprocessing successful!")
except Exception as e:
    print(f"Error during preprocessing: {e}")

# Fit the final model
try:
    final_xgb.fit(X_train, y_train)
except Exception as e:
    print(f"Error during model fitting: {e}")


Preprocessing successful!
Parameters: { "verbose" } are not used.



In [11]:
y_pred_xgb = final_xgb.predict(X_test)
print(f"mean_absolute_error : {mean_absolute_error(y_test, y_pred_xgb):.2f}")
print(f"mean_squared_error : {mean_squared_error(y_test, y_pred_xgb):.2f}")
print(f"r2_score : {r2_score(y_test, y_pred_xgb):.2f}")

mean_absolute_error : 5.16
mean_squared_error : 55.37
r2_score : 0.99


### With this MSE, MAE and r2 score

1. The model is highly accurate, with an R² of 0.99 indicating it explains almost all of the variance in the data.
2. The average prediction error is 5.16 units (MAE), and while larger errors are slightly penalized (MSE = 55.37), they are still relatively small given the R² score.