# Credit Score Classification

# Problem Statement:

To develop a predictive model for classifying individuals into credit score categories based on their financial data and other relevant factors. Accurate credit scoring is crucial for the financial institutions to assess the reliability of applicants and manage risk effectively as it leads to less default.

In [1]:
# Importing Related Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Libraries for KNN IMPUTER 
#from sklearn.impute import KNNImputer
#from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import MinMaxScaler

In [2]:
# Reading the file
df = pd.read_csv('train.csv')

In [3]:
print('Rows:',df.shape[0])
print('Columns:',df.shape[1])

Rows: 100000
Columns: 28


In [4]:
# Displaying First 5 Rows of the data
df.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [5]:
# Checking for the null columns
df.isnull().sum()

ID                              0
Customer_ID                     0
Month                           0
Name                         9985
Age                             0
SSN                             0
Occupation                      0
Annual_Income                   0
Monthly_Inhand_Salary       15002
Num_Bank_Accounts               0
Num_Credit_Card                 0
Interest_Rate                   0
Num_of_Loan                     0
Type_of_Loan                11408
Delay_from_due_date             0
Num_of_Delayed_Payment       7002
Changed_Credit_Limit            0
Num_Credit_Inquiries         1965
Credit_Mix                      0
Outstanding_Debt                0
Credit_Utilization_Ratio        0
Credit_History_Age           9030
Payment_of_Min_Amount           0
Total_EMI_per_month             0
Amount_invested_monthly      4479
Payment_Behaviour               0
Monthly_Balance              1200
Credit_Score                    0
dtype: int64

In [6]:
# Five point summary of the Numerical columns
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Monthly_Inhand_Salary,84998.0,4194.17085,3183.686167,303.645417,1625.568229,3093.745,5957.448333,15204.633333
Num_Bank_Accounts,100000.0,17.09128,117.404834,-1.0,3.0,6.0,7.0,1798.0
Num_Credit_Card,100000.0,22.47443,129.05741,0.0,4.0,5.0,7.0,1499.0
Interest_Rate,100000.0,72.46604,466.422621,1.0,8.0,13.0,20.0,5797.0
Delay_from_due_date,100000.0,21.06878,14.860104,-5.0,10.0,18.0,28.0,67.0
Num_Credit_Inquiries,98035.0,27.754251,193.177339,0.0,3.0,6.0,9.0,2597.0
Credit_Utilization_Ratio,100000.0,32.285173,5.116875,20.0,28.052567,32.305784,36.496663,50.0
Total_EMI_per_month,100000.0,1403.118217,8306.04127,0.0,30.30666,69.249473,161.224249,82331.0


In [7]:
# Summary of Categorical Columns
df.describe(include = object).T

Unnamed: 0,count,unique,top,freq
ID,100000,100000,0x1602,1
Customer_ID,100000,12500,CUS_0xd40,8
Month,100000,8,January,12500
Name,90015,10139,Langep,44
Age,100000,1788,38,2833
SSN,100000,12501,#F%$D@*&8,5572
Occupation,100000,16,_______,7062
Annual_Income,100000,18940,36585.12,16
Num_of_Loan,100000,434,3,14386
Type_of_Loan,88592,6260,Not Specified,1408


## Data cleaning

In [8]:
# Checking for the unique Customer_ID
df.Customer_ID.nunique()

12500

#### Type_of_Loan

In [9]:
# splitting the sub categories 
len(df.Type_of_Loan[0].split(','))

4

In [10]:
df.Num_of_Loan =df.Num_of_Loan.str.replace('_','')

In [11]:
df.Num_of_Loan = df.Num_of_Loan.replace('-','')

In [12]:
df.Num_of_Loan.isnull().sum()

0

In [13]:
df.Num_of_Loan.dtype

dtype('O')

In [14]:
df[df.Num_of_Loan == '-100']

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
31,0x162d,CUS_0xb891,August,Jasond,55,072-31-6145,Entrepreneur,30689.89,2612.490833,2,...,Good,632.46,27.332515,17 Years and 10 Months,No,16.415452,125.61725053231268,High_spent_Small_value_payments,379.21638114119577,Standard
34,0x1634,CUS_0x1cdb,March,Deepaa,21,615-06-7821,Developer,35547.71,2853.309167,7,...,Standard,943.86,23.462303,30 Years and 10 Months,Yes,0.000000,173.13865100158367,Low_spent_Medium_value_payments,392.1922656650829,Standard
39,0x1639,CUS_0x1cdb,August,Deepaa,21,615-06-7821,Developer,35547.71,2853.309167,7,...,Standard,943.86,25.862922,31 Years and 3 Months,Yes,0.000000,181.33090096186916,High_spent_Small_value_payments,364.00001570479753,Standard
53,0x164f,CUS_0x284a,June,Nadiaq,34,411-51-0676,Lawyer,131313.4,,0,...,Good,352.16,41.980170,31 Years and 0 Months,No,911.220179,86.56638801207531,High_spent_Large_value_payments,1140.0673399198365,Standard
61,0x165b,CUS_0x5407,June,Annk,30,500-92-6408,Media_Manager,34081.38,2611.115000,8,...,Standard,1704.18,31.170872,15 Years and 0 Months,Yes,70.478333,,!@9#%8,410.6743660782873,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99877,0x25f37,CUS_0x3855,June,Xolai,6476,963-55-7106,Scientist,118677.54,9963.795000,4,...,Good,995.47,38.556181,30 Years and 4 Months,No,237.106366,231.2765296325449,High_spent_Large_value_payments,767.996604,Good
99901,0x25f5b,CUS_0x4986,June,Charles Abbotta,34,971-61-8388,_______,41329.56,3421.130000,2,...,_,1245.01,32.846273,23 Years and 8 Months,No,20.553577,272.7925038930821,Low_spent_Large_value_payments,318.766919,Good
99902,0x25f5c,CUS_0x4986,July,Charles Abbotta,34,971-61-8388,Entrepreneur,41329.56,3421.130000,2,...,Good,1245.01,39.011550,23 Years and 9 Months,No,20.553577,462.1176338205343,Low_spent_Small_value_payments,149.44179,Good
99969,0x25fc3,CUS_0xf16,February,,45,868-70-2218,Media_Manager,16680.35,1528.029167,1,...,Good,897.16,34.085971,21 Years and 2 Months,No,41.113561,104.64623687765652,High_spent_Small_value_payments,267.043119,Good


In [15]:
df.Num_of_Loan.unique()

#num of loan has many absurd values so we will not use it and remove it 

array(['4', '1', '3', '967', '-100', '0', '2', '7', '5', '6', '8', '9',
       '1464', '622', '352', '472', '1017', '945', '146', '563', '341',
       '444', '720', '1485', '49', '737', '1106', '466', '728', '313',
       '843', '597', '617', '119', '663', '640', '92', '1019', '501',
       '1302', '39', '716', '848', '931', '1214', '186', '424', '1001',
       '1110', '1152', '457', '1433', '1187', '52', '1480', '1047',
       '1035', '1347', '33', '193', '699', '329', '1451', '484', '132',
       '649', '995', '545', '684', '1135', '1094', '1204', '654', '58',
       '348', '614', '1363', '323', '1406', '1348', '430', '153', '1461',
       '905', '1312', '1424', '1154', '95', '1353', '1228', '819', '1006',
       '795', '359', '1209', '590', '696', '1185', '1465', '911', '1181',
       '70', '816', '1369', '143', '1416', '455', '55', '1096', '1474',
       '420', '1131', '904', '89', '1259', '527', '1241', '449', '983',
       '418', '319', '23', '238', '638', '138', '235', '280', '1

In [16]:
df[df.Type_of_Loan.notnull()] 

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.822620,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.944960,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736786,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0x25fe9,CUS_0x942c,April,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,...,_,502.38,34.663572,31 Years and 6 Months,No,35.104023,60.97133255718485,High_spent_Large_value_payments,479.866228,Poor
99996,0x25fea,CUS_0x942c,May,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,...,_,502.38,40.565631,31 Years and 7 Months,No,35.104023,54.18595028760385,High_spent_Medium_value_payments,496.65161,Poor
99997,0x25feb,CUS_0x942c,June,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,...,Good,502.38,41.255522,31 Years and 8 Months,No,35.104023,24.02847744864441,High_spent_Large_value_payments,516.809083,Poor
99998,0x25fec,CUS_0x942c,July,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,...,Good,502.38,33.638208,31 Years and 9 Months,No,35.104023,251.67258219721603,Low_spent_Large_value_payments,319.164979,Standard


In [17]:
#checking for values in number of loan, if it also wierd and absurd or its normal

b = []
for i in range(df.shape[0]):
    if pd.isna(df.iloc[i,13]) == True:
        b.append(np.nan)
    else:
        b.append(len(df.Type_of_Loan[i].split(',')))
        
#now we will delete num of loan and keep this

In [18]:
b = pd.Series(b)

In [19]:
df['Num_of_loan2'] = b

In [20]:
#df['Num_of_loan2'] = df['Num_of_loan2'].astype(int)
# after fixing null values we will covert to int

In [21]:
df.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,Num_of_loan2
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good,4.0
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good,4.0
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good,4.0
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good,4.0
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good,4.0


In [22]:
df.Type_of_Loan.isnull().sum()

11408

In [23]:
df.Num_of_loan2.value_counts()

3.0    15752
2.0    15712
4.0    15456
1.0    11128
6.0     8144
7.0     7680
5.0     7528
9.0     3856
8.0     3336
Name: Num_of_loan2, dtype: int64

In [24]:
df.loc[(df.Num_of_loan2.isnull()) & (df.Num_of_Loan == '0'), 'Num_of_loan2'] = 0

In [25]:
df.loc[(df.Num_of_loan2.isnull())]

#we can remove these 478 remaining values as they have no pattern or nothong

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,Num_of_loan2
34,0x1634,CUS_0x1cdb,March,Deepaa,21,615-06-7821,Developer,35547.71,2853.309167,7,...,943.86,23.462303,30 Years and 10 Months,Yes,0.000000,173.13865100158367,Low_spent_Medium_value_payments,392.1922656650829,Standard,
39,0x1639,CUS_0x1cdb,August,Deepaa,21,615-06-7821,Developer,35547.71,2853.309167,7,...,943.86,25.862922,31 Years and 3 Months,Yes,0.000000,181.33090096186916,High_spent_Small_value_payments,364.00001570479753,Standard,
180,0x170e,CUS_0xac86,May,Nickb,20,028-16-4402,Entrepreneur,106733.13_,8873.427500,4,...,76.23,28.820554,33 Years and 1 Months,No,0.000000,345.60913541068396,High_spent_Medium_value_payments,791.733614589316,Good,
327,0x17e9,CUS_0xb5ff,August,Nateg,33,060-81-1328,Media_Manager,12909.895,853.824583,5,...,602.5,23.579851,17 Years and 8 Months,No,0.000000,60.052634766462795,Low_spent_Medium_value_payments,305.32982356687063,Standard,
355,0x1815,CUS_0xaedb,April,Olivia Oranr,19,272-47-1135,Musician,85554.03,7185.502500,4,...,1095.73,31.360245,19 Years and 10 Months,No,0.000000,303.6646866498075,Low_spent_Large_value_payments,684.8855633501927,Good,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99611,0x25da9,CUS_0x2829,April,Alistair Barre,31,172-85-1573,Journalist,64901.37,5298.447500,3,...,442.06,29.639741,31 Years and 1 Months,Yes,0.000000,162.17115972608323,High_spent_Medium_value_payments,617.67359,Standard,
99612,0x25daa,CUS_0x2829,May,Alistair Barre,32,172-85-1573,Journalist,64901.37,5298.447500,3,...,442.06,37.699808,,Yes,0.000000,97.41318994087194,High_spent_Large_value_payments,672.43156,Standard,
99630,0x25dc4,CUS_0x738b,July,Leahh,25,769-44-1950,Accountant,123828.52,9317.324054,4,...,1288.43,39.099750,32 Years and 7 Months,NM,999.719279,,Low_spent_Medium_value_payments,530.878992,Good,
99663,0x25df5,CUS_0x30fc,August,Zoran Radosavljevicl,28,209-55-1275,Manager,90781.53,7505.127500,7,...,494.51,36.809951,,Yes,0.000000,688.2568750926885,Low_spent_Medium_value_payments,342.255875,Standard,


In [26]:
#df.drop(df.loc[(df.Num_of_loan2.isnull())],axis = 0)
#df.shape
#df.Num_of_loan2.dropna()
#df['Num_of_loan2'].dropna(inplace=True,)
#df.drop(df.Num_of_loan2.isnull(), axis = 0).reset_index(drop = True)
df.dropna(subset = 'Num_of_loan2',inplace= True)

TypeError: Index(...) must be called with a collection of some kind, 'Num_of_loan2' was passed

In [None]:
df.Num_of_loan2.isnull().sum()

In [None]:
df.isnull().sum()
# we will remove name so no need for imputing null value for name
# type of loan also remove so no need to imoute

In [None]:
extracted_Data = df['Credit_History_Age'].str.extract(r'(\d+) Years and (\d+) Months')


In [None]:
df['Credit_History_Age'] = extracted_Data[0]+'.'+extracted_Data[1]
#converted extracted date to year.months
#now we can either round off or remove month

In [None]:
#df[df.Name == 'Aaron Maashoh']

In [None]:
df['Age'] = df.Age.str.replace('-','')
#cleaning age

In [None]:
df['Age'] = df.Age.str.replace('_','')
#now we have cleaned age

In [None]:
#as age has no null value we will try to change it to int
df['Age'] = df.Age.astype(int)

In [None]:
# as age has anomalous data we will try to check for age as value are absurd with ranging uptio 1000
#-----------for every name check other entries with same name-----------
df.Age


In [None]:
df[df.Name == 'Aaron Maashoh']

In [None]:
df.Age[df.Age < 100].unique()

#now we will make these values null and try to impute them

In [None]:
###do in classssssss-----------------------------------
df[df.Name == 'Costas Pitasw']

In [None]:
#column_name = 'Age' group_by = 'Customer_ID' user_friendly_name = 'Age' 
#Get Details
#get_column_details(df,'Age')
#Cleaning clean_numerical_field(df_train,group_by,column_name,strip='_',datatype='int')
#Plot Graph plot_displot(df_train,column_name,user_friendly_name,bins=40)

In [None]:
#df['Age'][df['Age'] > 60] = np.nan

In [None]:
##cleaning occupation

In [None]:
df['Occupation'].value_counts()

In [None]:
df[df.Occupation == '_______']

In [None]:
df[df.Name == 'Langep']

In [None]:
df[df.Customer_ID == 'CUS_0xb11c']

In [None]:
##create function
#customner ID of person who has ---- in occupation
#   check for other occupation of that same person
#   if not ---- then replace occupation else impute null

# customner ID of person who has --- or null or abnormal value
# them check for same id where not null or not ---- and fill value form there
import warnings
warnings.filterwarnings('ignore')

In [None]:
df[df.Occupation == '_______'].shape

In [None]:

job=""
for i in df.Customer_ID[df.Occupation == '_______']:
    job = df.Occupation[(df.Customer_ID ==i) & (df.Occupation != '_______')].unique()
    df.Occupation[df.Customer_ID == i] = job 
    #rint(i)

In [None]:
#df.Occupation[df.Customer_ID == 'CUS_0x6649']
df.Occupation[(df.Customer_ID == 'CUS_0xbffe') & (df.Occupation != '_______')]

In [None]:
job = df.Occupation[(df.Customer_ID == 'CUS_0x6649') & (df.Occupation != '_______')]
job
df.Occupation[df.Customer_ID == 'CUS_0x6649']

In [None]:
# Doing Treatment for the age Column

In [None]:
df.Age[df['Age']>90] = np.nan

In [None]:
df['Age'].isnull().sum()

In [None]:
df.Age[df.Age.isnull()]

In [None]:
age=""
Age1=""
for i in df.Customer_ID[df.Age.isnull()]:
    age = df.Age[(df.Customer_ID ==i) & (df.Age.notnull())]
    Age1 = list(age)
    df.Age[df.Customer_ID == i] = Age1[0]

In [None]:
df.loc[99972]

In [None]:
# successfully Treated the Null Valuedf.Age.isnull()]
df.Age.isnull().sum()

In [None]:
Credit_Age = df[df.Credit_History_Age.isnull()]

In [None]:
df.loc[df.Name == 'Langep', ['Customer_ID','Month', 'Credit_History_Age']]

In [None]:
df.loc[df.Name == 'Aaron Maashoh', ['Customer_ID','Month', 'Credit_History_Age']]

In [None]:
## Treating Annual Income
df.Annual_Income.value_counts()

In [None]:
df.Annual_Income = df.Annual_Income.str.replace('_','')

In [None]:
df.Annual_Income = df.Annual_Income.str.replace('-','')

In [None]:
df.Annual_Income = df.Annual_Income.astype(float)

In [None]:
df.Annual_Income.dtype

In [None]:
df.isnull().sum()

In [None]:
df.Num_of_Delayed_Payment