In [60]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [61]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = 'Cleaned/train.csv'
train_df = pd.read_csv(file_path)
train_df.head(10)

Unnamed: 0,Customer_ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Payment_Behaviour,Monthly_Balance,Credit_Score
0,3392,1,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,4.0,Good,809.98,26.82262,265.0,No,49.574949,High_spent_Small_value_payments,312.494089,Good
1,3392,2,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,4.0,Good,809.98,31.94496,266.0,No,49.574949,Low_spent_Large_value_payments,284.629162,Good
2,3392,3,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,4.0,Good,809.98,28.609352,267.0,No,49.574949,Low_spent_Medium_value_payments,331.209863,Good
3,3392,4,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,4.0,Good,809.98,31.377862,268.0,No,49.574949,Low_spent_Small_value_payments,223.45131,Good
4,3392,5,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,4.0,Good,809.98,24.797347,269.0,No,49.574949,High_spent_Medium_value_payments,341.489231,Good
5,3392,6,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,4.0,Good,809.98,27.262259,270.0,No,49.574949,High_spent_Medium_value_payments,340.479212,Good
6,3392,7,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,4.0,Good,809.98,22.537593,271.0,No,49.574949,Low_spent_Small_value_payments,244.565317,Good
7,3392,8,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,4.0,Good,809.98,23.933795,272.0,No,49.574949,High_spent_Medium_value_payments,358.124168,Standard
8,8625,1,28.0,Teacher,34847.84,3037.986667,4.0,6.0,1.0,credit-builder loan,...,2.0,Good,605.03,24.464031,319.0,No,18.816215,Low_spent_Small_value_payments,470.690627,Standard
9,8625,2,28.0,Teacher,34847.84,3037.986667,4.0,6.0,1.0,credit-builder loan,...,2.0,Good,605.03,38.550848,320.0,No,18.816215,High_spent_Large_value_payments,484.591214,Good


In [79]:
train_df.tail()

Unnamed: 0,Customer_ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Payment_Behaviour,Monthly_Balance,Credit_Score
99995,37932,4,25.0,Mechanic,39628.99,3359.415833,6.0,7.0,2.0,"auto loan,student loan",...,3.0,Good,502.38,34.663572,378.0,No,35.104023,High_spent_Large_value_payments,479.866228,Poor
99996,37932,5,25.0,Mechanic,39628.99,3359.415833,6.0,7.0,2.0,"auto loan,student loan",...,3.0,Good,502.38,40.565631,379.0,No,35.104023,High_spent_Medium_value_payments,496.65161,Poor
99997,37932,6,25.0,Mechanic,39628.99,3359.415833,6.0,7.0,2.0,"auto loan,student loan",...,3.0,Good,502.38,41.255522,380.0,No,35.104023,High_spent_Large_value_payments,516.809083,Poor
99998,37932,7,25.0,Mechanic,39628.99,3359.415833,6.0,7.0,2.0,"auto loan,student loan",...,3.0,Good,502.38,33.638208,381.0,No,35.104023,Low_spent_Large_value_payments,319.164979,Standard
99999,37932,8,25.0,Mechanic,39628.99,3359.415833,6.0,7.0,2.0,"auto loan,student loan",...,3.0,Good,502.38,34.192463,382.0,No,35.104023,Low_spent_Small_value_payments,393.673696,Poor


In [62]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = train_df["Credit_Score"]

# Separate the X variable, the features
X = train_df.drop(columns=["Credit_Score"])


In [63]:
# Identify numerical and categorical columns
numerical_columns = X.select_dtypes(include=['number']).columns
categorical_columns = X.select_dtypes(exclude=['number']).columns


In [64]:
# Create transformers for numerical and categorical columns
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

In [65]:
# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])


In [66]:
# Create a pipeline with the column transformer
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [67]:
# Apply the preprocessor to features
X_transformed = pipeline.fit_transform(X)

In [68]:
# Review the y variable Series
y.head()

0    Good
1    Good
2    Good
3    Good
4    Good
Name: Credit_Score, dtype: object

In [69]:
# Review the X variable DataFrame
## DO Not USE!
X.head()

Unnamed: 0,Customer_ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,...,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Payment_Behaviour,Monthly_Balance
0,3392,1,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,11.27,4.0,Good,809.98,26.82262,265.0,No,49.574949,High_spent_Small_value_payments,312.494089
1,3392,2,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,11.27,4.0,Good,809.98,31.94496,266.0,No,49.574949,Low_spent_Large_value_payments,284.629162
2,3392,3,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,11.27,4.0,Good,809.98,28.609352,267.0,No,49.574949,Low_spent_Medium_value_payments,331.209863
3,3392,4,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,6.27,4.0,Good,809.98,31.377862,268.0,No,49.574949,Low_spent_Small_value_payments,223.45131
4,3392,5,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,11.27,4.0,Good,809.98,24.797347,269.0,No,49.574949,High_spent_Medium_value_payments,341.489231


In [70]:
# Convert the sparse matrix to a dense NumPy array
X_transformed_array = X_transformed.toarray()

# Convert the array to a DataFrame
X_transformed_df = pd.DataFrame(X_transformed_array, columns=pipeline.named_steps['preprocessor'].get_feature_names_out())

# Display the first few rows of the transformed DataFrame
X_transformed_df.head()

Unnamed: 0,num__Customer_ID,num__Month,num__Age,num__Annual_Income,num__Monthly_Inhand_Salary,num__Num_Credit_Card,num__Interest_Rate,num__Num_of_Loan,num__Num_of_Delayed_Payment,num__Changed_Credit_Limit,...,cat__Payment_of_Min_Amount_NM,cat__Payment_of_Min_Amount_No,cat__Payment_of_Min_Amount_Yes,cat__Payment_Behaviour_High_spent_Large_value_payments,cat__Payment_Behaviour_High_spent_Medium_value_payments,cat__Payment_Behaviour_High_spent_Small_value_payments,cat__Payment_Behaviour_Low_spent_Large_value_payments,cat__Payment_Behaviour_Low_spent_Medium_value_payments,cat__Payment_Behaviour_Low_spent_Small_value_payments,cat__Payment_Behaviour_nan
0,-1.575309,-1.527525,-0.958344,-0.819625,-0.744544,-0.741899,-1.319266,0.190946,-1.005915,0.12099,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-1.575309,-1.091089,-0.958344,-0.819625,-0.744544,-0.741899,-1.319266,0.190946,-1.484859,0.12099,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-1.575309,-0.654654,-0.958344,-0.819625,-0.744544,-0.741899,-1.319266,0.190946,-1.005915,0.12099,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.575309,-0.218218,-0.958344,-0.819625,-0.744544,-0.741899,-1.319266,0.190946,-1.484859,-0.635503,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.575309,0.218218,-0.958344,-0.819625,-0.744544,-0.741899,-1.319266,0.190946,-1.484859,0.12099,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [71]:
# Check the balance of our target values
label_counts = y.value_counts()
label_counts

Credit_Score
Standard    53174
Poor        28998
Good        17828
Name: count, dtype: int64

In [72]:
# Load Separate CSV file for testing 

In [73]:
# pause using this data set
#file_path = 'Cleaned/test.csv'
#test_df = pd.read_csv(file_path)
#test_df.head(10)

Unnamed: 0,Customer_ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,...,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Payment_Behaviour,Monthly_Balance
0,3392,9,23.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,11.27,4.0,Good,809.98,35.030402,273.0,No,49.574949,Low_spent_Small_value_payments,186.266702
1,3392,10,24.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,13.27,4.0,Good,809.98,33.053114,274.0,No,49.574949,High_spent_Medium_value_payments,361.444004
2,3392,11,24.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,12.27,4.0,Good,809.98,33.811894,275.0,No,49.574949,Low_spent_Medium_value_payments,264.675446
3,3392,12,24.0,Scientist,19114.12,1824.843333,4.0,3.0,4.0,"auto loan,credit-builder loan,personal loan,ho...",...,11.27,4.0,Good,809.98,32.430559,276.0,No,49.574949,High_spent_Medium_value_payments,343.826873
4,8625,9,28.0,Teacher,34847.84,3037.986667,4.0,6.0,1.0,credit-builder loan,...,5.42,5.0,Good,605.03,25.926822,327.0,No,18.816215,High_spent_Large_value_payments,485.298434
5,8625,10,28.0,Teacher,34847.84,3037.986667,4.0,6.0,1.0,credit-builder loan,...,5.42,5.0,Good,605.03,30.1166,328.0,No,18.816215,Low_spent_Large_value_payments,303.355083
6,8625,11,28.0,Teacher,34847.84,3037.986667,4.0,6.0,1.0,credit-builder loan,...,5.42,5.0,Good,605.03,30.996424,329.0,No,18.816215,High_spent_Large_value_payments,452.302307
7,8625,12,28.0,Teacher,34847.84,3037.986667,4.0,6.0,1.0,credit-builder loan,...,7.42,5.0,Good,605.03,33.875167,330.0,No,18.816215,High_spent_Large_value_payments,421.447964
8,11708,9,35.0,Engineer,143162.64,12187.22,5.0,8.0,3.0,"auto loan,auto loan,not specified",...,7.1,3.0,Good,1303.01,35.229707,221.0,No,246.992319,Low_spent_Medium_value_payments,854.226027
9,11708,10,35.0,Engineer,143162.64,12187.22,5.0,8.0,3.0,"auto loan,auto loan,not specified",...,2.1,3.0,Good,1303.01,35.685836,222.0,No,246.992319,Low_spent_Large_value_payments,788.11455


In [74]:
# Separate the y variable, the labels
#y_test = test_df["Credit_Score"]

# Separate the X variable, the features
#X_test = test_df.drop(columns=["Credit_Score"])

KeyError: 'Credit_Score'

In [85]:
#y_temp = y[0:10000]

In [86]:
# Test 

#X_transformed_df_temp = X_transformed_df[0:10000]

In [89]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X_transformed_df, y, random_state=1)

In [90]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
lr_model = LogisticRegression(random_state=1, max_iter=1000)

# Fit the model using training data
lr_model.fit(X_train, y_train)

In [91]:
# Make a prediction using the testing data
y_prediction = lr_model.predict(X_test)

In [92]:
# Print the balanced_accuracy score of the model
balanced_acc = balanced_accuracy_score(y_test, y_prediction)
print(balanced_acc)

0.7216071212417056


In [93]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, y_prediction)

array([[ 2984,    69,  1414],
       [  482,  5162,  1599],
       [ 1458,  1411, 10421]])

In [94]:
# Print the classification report for the model
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

        Good       0.61      0.67      0.64      4467
        Poor       0.78      0.71      0.74      7243
    Standard       0.78      0.78      0.78     13290

    accuracy                           0.74     25000
   macro avg       0.72      0.72      0.72     25000
weighted avg       0.75      0.74      0.74     25000



In [95]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)