<a href="https://colab.research.google.com/github/sheunq/Credit-Score/blob/main/credit_score_with_stacking_classifier_ensembles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Problem Statement
You are working as a data scientist in a global finance company. Over the years, the company has collected basic bank details and gathered a lot of credit-related information. The management wants to build an intelligent system to segregate the people into credit score brackets to reduce the manual efforts.
## Task
Given a person’s credit-related information, build a machine learning model that can classify the credit score.

In [12]:
import sys
assert sys.version_info>=(3,7)

import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from imblearn.over_sampling import SMOTE
import re

from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer
from sklearn.ensemble import (
BaggingClassifier,ExtraTreesClassifier,
RandomForestClassifier, StackingClassifier,
HistGradientBoostingClassifier)
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix, precision_score,recall_score,f1_score 

import pickle
import warnings
warnings.filterwarnings('ignore')

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
data=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Credit score.csv')
data.sample(10)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
91243,142497,44713,4,Ryan Vlastelicaa,14.0,964343476.0,Engineer,16236.13,1142.010833,7.0,...,Bad,3300.77,24.421816,163.0,Yes,66.560642,27.370787,Low_spent_Small_value_payments,213.932182,Standard
83942,131544,19730,7,Sarah Whitel,37.0,176952803.0,Mechanic,129762.48,10974.54,0.0,...,Good,1073.62,26.04028,397.0,No,0.0,107.385452,High_spent_Large_value_payments,298.951023,Good
85032,133182,3150,1,Gianluca Semeraror,39.0,109270987.0,Writer,17213.61,1350.4675,9.0,...,Bad,2818.55,40.118424,10.0,Yes,70.428792,39.755165,Low_spent_Small_value_payments,172.340863,Standard
99120,154314,19913,1,Sagarikat,38.0,11555907.0,Doctor,47689.47,3787.1225,9.0,...,Bad,4675.57,32.208922,129.0,Yes,194.62752,44.294517,High_spent_Small_value_payments,329.478254,Standard
79756,125266,2719,5,Aditi Shrivastavaz,19.0,601743826.0,Teacher,72690.14,6129.511667,4.0,...,Standard,37.07,38.177604,374.0,Yes,235.229676,105.520054,High_spent_Medium_value_payments,392.866193,Standard
51726,83220,13882,7,Lucy Hornbyx,26.0,140539808.0,Musician,54539.52,4540.96,8.0,...,Bad,4779.68,24.775902,79.0,Yes,189.919502,59.454051,High_spent_Medium_value_payments,229.365482,Poor
34470,57336,18328,7,Prodhane,36.0,265884094.0,Accountant,43287.28,3839.273333,7.0,...,Standard,1393.19,38.760461,369.0,Yes,111.362146,33.13113,High_spent_Large_value_payments,479.434056,Standard
41171,67389,29286,4,George Georgiopoulosb,23.0,63498444.0,Teacher,46422.1,4085.508333,8.0,...,Standard,832.68,38.088849,278.0,No,66.49089,40.434932,Low_spent_Medium_value_payments,387.12886,Standard
19535,34933,24126,8,"OGrady""z",31.0,956171632.0,Architect,29020.27,2456.355833,2.0,...,Good,536.36,24.603357,384.0,No,12.127443,73.008389,Low_spent_Small_value_payments,349.114224,Poor
70925,112019,26704,6,Stephen Greyu,28.0,646894326.0,Scientist,46174.58,3652.881667,8.0,...,Good,1268.49,33.229001,258.0,NM,99.836244,48.926132,Low_spent_Large_value_payments,362.402346,Standard


In [8]:
# Data Decription
data.describe(include='all')

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
count,100000.0,100000.0,100000.0,100000,100000.0,100000.0,100000,100000.0,100000.0,100000.0,...,100000,100000.0,100000.0,100000.0,100000,100000.0,100000.0,100000,100000.0,100000
unique,,,,10128,,,15,,,,...,3,,,,3,,,6,,3
top,,,,Jessicad,,,Lawyer,,,,...,Standard,,,,Yes,,,Low_spent_Small_value_payments,,Standard
freq,,,,48,,,7096,,,,...,45848,,,,52326,,,28616,,53174
mean,80631.5,25982.66664,4.5,,33.31634,500461700.0,,50505.123449,4197.270835,5.36882,...,,1426.220376,32.285173,221.22046,,107.699208,55.101315,,392.697586,
std,43301.486619,14340.543051,2.291299,,10.764812,290826700.0,,38299.422093,3186.432497,2.593314,...,,1155.129026,5.116875,99.680716,,132.267056,39.006932,,201.652719,
min,5634.0,1006.0,1.0,,14.0,81349.0,,7005.93,303.645417,0.0,...,,0.23,20.0,1.0,,0.0,0.0,,0.00776,
25%,43132.75,13664.5,2.75,,24.0,245168600.0,,19342.9725,1626.594167,3.0,...,,566.0725,28.052567,144.0,,29.268886,27.959111,,267.615983,
50%,80631.5,25777.0,4.5,,33.0,500688600.0,,36999.705,3095.905,5.0,...,,1166.155,32.305784,219.0,,66.462304,45.15655,,333.865366,
75%,118130.25,38385.0,6.25,,42.0,756002700.0,,71683.47,5957.715,7.0,...,,1945.9625,36.496663,302.0,,147.392573,71.295797,,463.215683,


In [9]:
# Data Info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  int64  
 1   Customer_ID               100000 non-null  int64  
 2   Month                     100000 non-null  int64  
 3   Name                      100000 non-null  object 
 4   Age                       100000 non-null  float64
 5   SSN                       100000 non-null  float64
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  float64
 8   Monthly_Inhand_Salary     100000 non-null  float64
 9   Num_Bank_Accounts         100000 non-null  float64
 10  Num_Credit_Card           100000 non-null  float64
 11  Interest_Rate             100000 non-null  float64
 12  Num_of_Loan               100000 non-null  float64
 13  Type_of_Loan              100000 non-null  ob

In [10]:
# Checking for missing values
data.isnull().sum()

ID                          0
Customer_ID                 0
Month                       0
Name                        0
Age                         0
SSN                         0
Occupation                  0
Annual_Income               0
Monthly_Inhand_Salary       0
Num_Bank_Accounts           0
Num_Credit_Card             0
Interest_Rate               0
Num_of_Loan                 0
Type_of_Loan                0
Delay_from_due_date         0
Num_of_Delayed_Payment      0
Changed_Credit_Limit        0
Num_Credit_Inquiries        0
Credit_Mix                  0
Outstanding_Debt            0
Credit_Utilization_Ratio    0
Credit_History_Age          0
Payment_of_Min_Amount       0
Total_EMI_per_month         0
Amount_invested_monthly     0
Payment_Behaviour           0
Monthly_Balance             0
Credit_Score                0
dtype: int64

## Data Preprocessing

In [13]:
# cleaning Outstanding Debt
data['Outstanding_Debt']=data['Outstanding_Debt'].str.extract('(\d+)')
# cleaning Annual income
data['Annual_Income']=data['Annual_Income'].str.extract('(\d+)')
# cleaning changed credit limit
data['Changed_Credit_Limit']=data['Changed_Credit_Limit'].str.extract('(\d+)')
# Cleaning Number of loan
data['Num_of_Loan']=data['Num_of_Loan'].str.extract('(\d+)')
# Cleaning Age
data['Age']=data['Age'].str.extract('(\d+)')
# Cleaning Amount invested monthly
data['Amount_invested_monthly']=(data['Amount_invested_monthly'].astype('str')).str.extract('(\d+)')
# Cleaning Monthly Balance
data['Monthly_Balance']=(data['Monthly_Balance'].astype('str')).str.extract('(\d+)')
# Cleaning Number of Delayed Payment 
data['Num_of_Delayed_Payment']=data['Num_of_Delayed_Payment'].str.extract('(\d+)')
# Filling the None values with the median of Num of Delayed payment through simpleimputer
si=SimpleImputer(strategy='median')
si=si.fit_transform(data[['Num_of_Delayed_Payment']])
data[['Num_of_Delayed_Payment']]=si

AttributeError: ignored

In [None]:
# Credit_History_Age filtering
def History_age(age):
    try : 
        years = int("".join(re.findall('[0-9]',''.join(age.split("and")[0]))))
        month = int("".join(re.findall('[0-9]',''.join(age.split("and")[1]))))
        return years*12 + month
    except :
        return np.nan

In [None]:
data['Credit_History_Age'] = data['Credit_History_Age'].apply(History_age)

In [None]:
# filling empty value with most frequent value
si_type_of_loan=SimpleImputer(strategy='most_frequent')
si_type_of_loan=si_type_of_loan.fit(data[['Type_of_Loan']])
data[['Type_of_Loan']]=si_type_of_loan.transform(data[['Type_of_Loan']])

In [None]:
''' 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
     'Interest_Rate', 'Delay_from_due_date', 'Num_Credit_Inquiries',
       'Credit_Utilization_Ratio', 'Total_EMI_per_month','Monthly_Balance',
       'Amount_invested_monthly','Credit_History_Age','Changed_Credit_Limit '''

si=SimpleImputer()
si=si.fit(data[['Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Delay_from_due_date', 'Num_Credit_Inquiries',
       'Credit_Utilization_Ratio', 'Total_EMI_per_month','Monthly_Balance','Amount_invested_monthly','Credit_History_Age','Changed_Credit_Limit']])
si=si.transform(data[['Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Delay_from_due_date', 'Num_Credit_Inquiries',
       'Credit_Utilization_Ratio', 'Total_EMI_per_month','Monthly_Balance','Amount_invested_monthly','Credit_History_Age','Changed_Credit_Limit']])
data[['Monthly_Inhand_Salary','Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate','Delay_from_due_date', 'Num_Credit_Inquiries',
       'Credit_Utilization_Ratio','Total_EMI_per_month','Monthly_Balance',
      'Amount_invested_monthly','Credit_History_Age','Changed_Credit_Limit']]=si

In [None]:
data.isnull().sum()

In [None]:
# Deleting columns that are not relevant for the model
del data['ID']
del data['Name']
del data['Customer_ID']
del data['SSN']
del data['Month']

In [None]:
data.isnull().sum()

In [None]:
data.select_dtypes(include='object').head()

## OCCUPATION

In [None]:
print(data.select_dtypes(include='object')['Occupation'].value_counts())
fig=px.bar(data.select_dtypes(include='object')['Occupation'].value_counts())
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

In [None]:
#cleaning Occupation
data['Occupation'].replace('_______','others',inplace=True)

In [None]:
# visualization after cleaning
print(data.select_dtypes(include='object')['Occupation'].value_counts())
fig=px.bar(data.select_dtypes(include='object')['Occupation'].value_counts())
fig.show()

## Credit Mix

In [None]:
print(data.select_dtypes(include='object')['Credit_Mix'].value_counts().sort_values())
fig=px.bar(data.select_dtypes(include='object')['Credit_Mix'].value_counts().sort_values())
fig.show()

In [None]:
# Cleaning Credit Mix
# Good
good=data[['Credit_Mix','Credit_Score']][data['Credit_Score']=='Good']['Credit_Mix'].replace('_','Good')
# Standard
standard=data[['Credit_Mix','Credit_Score']][data['Credit_Score']=='Standard']['Credit_Mix'].replace('_','Standard')
# poor/Bad
bad=data[['Credit_Mix','Credit_Score']][data['Credit_Score']=='Poor']['Credit_Mix'].replace('_','Bad')
g_s_b=pd.concat([good,standard,bad])
g_s_b.sort_index(inplace=True)
data['Credit_Mix']=g_s_b

In [None]:
# Visualizationc After Cleaning Credit Mix
print(data.select_dtypes(include='object')['Credit_Mix'].value_counts().sort_values())
fig=px.bar(data.select_dtypes(include='object')['Credit_Mix'].value_counts().sort_values())
fig.show()

In [None]:
print(data['Payment_of_Min_Amount'].value_counts())
fig=px.bar(data['Payment_of_Min_Amount'].value_counts())
fig.show()

In [None]:
# Cleaning payment of minimum amount
data['Payment_of_Min_Amount'].replace('NM','No',inplace=True)

In [None]:
# Visualization after cleaning
print(data['Payment_of_Min_Amount'].value_counts())
fig=px.bar(data['Payment_of_Min_Amount'].value_counts())
fig.show()

In [None]:
print(data['Payment_Behaviour'].value_counts())
fig=px.bar(data['Payment_Behaviour'].value_counts())
fig.show()

In [None]:
# cleaning payment behaviour 
data['Payment_Behaviour'].replace('!@9#%8','other',inplace=True)

In [None]:
# visualization after cleaning
print(data['Payment_Behaviour'].value_counts())
fig=px.bar(data['Payment_Behaviour'].value_counts())
fig.show()

In [None]:
print(data['Credit_Score'].value_counts())
fig=px.bar(data['Credit_Score'].value_counts())
fig.show()

In [None]:
# Age visualization
fig=px.scatter(data['Age'].value_counts())
fig.show()

In [None]:
# taking care of outliers
lo_age=LocalOutlierFactor()
lo_age=lo_age.fit(data[['Age']])
data['Age']=pd.Series(lo_age.negative_outlier_factor_)

In [None]:
# visualization after taking care of outlier
fig=px.scatter(data['Age'].value_counts())
fig.show()

In [None]:
# Converting object to float
data[['Annual_Income','Num_of_Loan','Changed_Credit_Limit','Outstanding_Debt']]=data[['Annual_Income','Num_of_Loan','Changed_Credit_Limit','Outstanding_Debt']].astype('float64')

In [None]:
data.sample(5)

In [None]:
# Cleaning type of loan
data['Type_of_Loan']=pd.DataFrame(data['Type_of_Loan'].value_counts().head(9).index[:])

In [None]:
data['Type_of_Loan']

In [None]:
si=SimpleImputer(strategy='most_frequent')
data['Type_of_Loan']=si.fit_transform(data[['Type_of_Loan']])

In [None]:
data['Type_of_Loan'].value_counts()

## Splitting the data into features and label

In [None]:
x,y=data.drop('Credit_Score',axis=1),data['Credit_Score']

In [None]:
x.select_dtypes(include=['object']).sample(5)

In [None]:
dummies=pd.get_dummies(x.select_dtypes(include=['object']))
dummies

In [None]:
del x['Occupation']
del x['Type_of_Loan'] 
del x['Credit_Mix']
del x['Payment_of_Min_Amount']
del x['Payment_Behaviour']

In [None]:
x=x.join(dummies)

In [None]:
x.sample(5)

In [None]:
y=y.map({'Good':0,
      'Standard':1,
      'Poor':2})

In [None]:
y.value_counts()

In [None]:
smote=SMOTE()
x_smote,y_smote=smote.fit_resample(x,y)

In [None]:
y_smote.value_counts(normalize=True)

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x_smote,y_smote,test_size=0.2,random_state=42)

In [None]:
scalar = PowerTransformer(method='yeo-johnson', standardize=True).fit(x_train)

In [None]:
def credit_function(y_pred,y_test):
    accuracy=accuracy_score(y_pred,y_test)
    recall=recall_score(y_test,y_pred,average='macro')
    precision=precision_score(y_pred,y_test,average='macro')
    f1=f1_score(y_pred,y_test,average='macro')
    
    print(f'Accuracy Score: {accuracy*100:.2f}%')
    print(f'Recall Score: {recall*100:.2f}%')
    print(f'Precision Score: {precision*100:.2f}%')
    print(f'f1 Score Score: {f1*100:.2f}%')    

In [None]:
bagging = BaggingClassifier(n_jobs=-1)
extraTrees = ExtraTreesClassifier(max_depth=10, n_jobs=-1)
randomForest = RandomForestClassifier(n_jobs=-1)
histGradientBoosting = HistGradientBoostingClassifier()
XGB = XGBClassifier(n_jobs=-1)

model = StackingClassifier([
    ('bagging', bagging),
    ('extraTress', extraTrees),
    ('randomforest', randomForest),
    ('histGradientBoosting', histGradientBoosting),
    ('XGB', XGB)
    ], n_jobs=-1)
    
model=model.fit(x_train, y_train)
y_pred=model.predict(x_test)

In [None]:
credit_function(y_pred,y_test)

In [None]:
print(f'Classification report {classification_report(y_pred,y_test)}')

## Save Model

In [None]:
pickle.dump(model,open('stackingclassifier.pkl','wb'))