In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [2]:
credit = pd.read_csv('Datasets/train.csv')

In [3]:
credit.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,5634,3392,1,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,26.82262,265.0,No,49.574949,21.46538,High_spent_Small_value_payments,312.494089,Good
1,5635,3392,2,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.94496,266.0,No,49.574949,21.46538,Low_spent_Large_value_payments,284.629162,Good
2,5636,3392,3,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,28.609352,267.0,No,49.574949,21.46538,Low_spent_Medium_value_payments,331.209863,Good
3,5637,3392,4,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.377862,268.0,No,49.574949,21.46538,Low_spent_Small_value_payments,223.45131,Good
4,5638,3392,5,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,24.797347,269.0,No,49.574949,21.46538,High_spent_Medium_value_payments,341.489231,Good


In [4]:
credit.drop(columns = ['ID','Customer_ID','Name','SSN'], inplace = True)

In [4]:
def explore_data(df):
    print("########## Shape ##########")
    print(df.shape)
    print("########## IsNa ##########")
    print(df.isna().sum())
    print("########## Types ##########")
    print(df.dtypes)
    print("########## Describe of Numeric Columns ##########")
    print(df.describe().T)
    print("########## Info ###########")
    print(df.info())

In [5]:
explore_data(credit)

########## Shape ##########
(100000, 28)
########## IsNa ##########
ID                          0
Customer_ID                 0
Month                       0
Name                        0
Age                         0
SSN                         0
Occupation                  0
Annual_Income               0
Monthly_Inhand_Salary       0
Num_Bank_Accounts           0
Num_Credit_Card             0
Interest_Rate               0
Num_of_Loan                 0
Type_of_Loan                0
Delay_from_due_date         0
Num_of_Delayed_Payment      0
Changed_Credit_Limit        0
Num_Credit_Inquiries        0
Credit_Mix                  0
Outstanding_Debt            0
Credit_Utilization_Ratio    0
Credit_History_Age          0
Payment_of_Min_Amount       0
Total_EMI_per_month         0
Amount_invested_monthly     0
Payment_Behaviour           0
Monthly_Balance             0
Credit_Score                0
dtype: int64
########## Types ##########
ID                            int64
Customer_ID    

In [6]:
def grab_col_names(df, cat_th=10, num_th=20):
    cat = [col for col in df.columns if df[col].dtypes == 'O']
    num_but_cat = [col for col in df.columns if df[col].dtypes != 'O'
                  and df[col].nunique() < cat_th]
    cat_but_num = [col for col in df.columns if df[col].dtypes == 'O'
                  and df[col].nunique() > num_th]
    cat = cat + num_but_cat
    cat = [col for col in cat if col not in cat_but_num]
    num = [col for col in df.columns if df[col].dtypes != 'O']
    num = [col for col in num if col not in num_but_cat]
    return cat, cat_but_num, num

In [7]:
cat_cols, cat_but_num, num_cols = grab_col_names(credit)

In [8]:
cat_cols

['Occupation',
 'Credit_Mix',
 'Payment_of_Min_Amount',
 'Payment_Behaviour',
 'Credit_Score',
 'Month']

In [9]:
cat_but_num

['Name', 'Type_of_Loan']

In [11]:
credit = credit.drop(columns=cat_but_num, axis = 1)

In [10]:
num_cols

['ID',
 'Customer_ID',
 'Age',
 'SSN',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Monthly_Balance']

In [13]:
credit = credit.drop(columns=['ID', 'Customer_ID','SSN'])

In [14]:
credit.groupby('Occupation').size()

Occupation
Accountant       6744
Architect        6824
Developer        6720
Doctor           6568
Engineer         6864
Entrepreneur     6648
Journalist       6536
Lawyer           7096
Manager          6432
Mechanic         6776
Media_Manager    6720
Musician         6352
Scientist        6744
Teacher          6672
Writer           6304
dtype: int64

In [15]:
label_encoder = LabelEncoder()
category1 = credit['Credit_Mix']
numeric_values = label_encoder.fit_transform(category1)

In [16]:
credit['Credit_Mix'] = numeric_values

In [17]:
label_encoder = LabelEncoder()
category2 = credit['Payment_of_Min_Amount']
numeric_values = label_encoder.fit_transform(category2)

In [18]:
credit['Payment_of_Min_Amount'] = numeric_values

In [19]:
label_encoder = LabelEncoder()
category2 = credit['Payment_Behaviour']
numeric_values = label_encoder.fit_transform(category2)

In [20]:
credit['Payment_Behaviour'] = numeric_values

In [21]:
label_encoder = LabelEncoder()
category3 = credit['Credit_Score']
numeric_values = label_encoder.fit_transform(category3)

In [22]:
credit['Credit_Score'] = numeric_values

In [23]:
credit

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,1,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,1,809.98,26.822620,265.0,1,49.574949,21.465380,2,312.494089,0
1,2,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,1,809.98,31.944960,266.0,1,49.574949,21.465380,3,284.629162,0
2,3,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,1,809.98,28.609352,267.0,1,49.574949,21.465380,4,331.209863,0
3,4,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,5.0,...,1,809.98,31.377862,268.0,1,49.574949,21.465380,5,223.451310,0
4,5,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,6.0,...,1,809.98,24.797347,269.0,1,49.574949,21.465380,1,341.489231,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,4,25.0,Mechanic,39628.99,3359.415833,4.0,6.0,7.0,2.0,23.0,...,1,502.38,34.663572,378.0,1,35.104023,24.028477,0,479.866228,1
99996,5,25.0,Mechanic,39628.99,3359.415833,4.0,6.0,7.0,2.0,18.0,...,1,502.38,40.565631,379.0,1,35.104023,24.028477,1,496.651610,1
99997,6,25.0,Mechanic,39628.99,3359.415833,4.0,6.0,7.0,2.0,27.0,...,1,502.38,41.255522,380.0,1,35.104023,24.028477,0,516.809083,1
99998,7,25.0,Mechanic,39628.99,3359.415833,4.0,6.0,7.0,2.0,20.0,...,1,502.38,33.638208,381.0,1,35.104023,24.028477,3,319.164979,2


In [24]:
credit.drop('Occupation', axis = 1, inplace = True)

In [25]:
credit

Unnamed: 0,Month,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,1,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,...,1,809.98,26.822620,265.0,1,49.574949,21.465380,2,312.494089,0
1,2,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,4.0,...,1,809.98,31.944960,266.0,1,49.574949,21.465380,3,284.629162,0
2,3,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,...,1,809.98,28.609352,267.0,1,49.574949,21.465380,4,331.209863,0
3,4,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,5.0,4.0,...,1,809.98,31.377862,268.0,1,49.574949,21.465380,5,223.451310,0
4,5,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,6.0,4.0,...,1,809.98,24.797347,269.0,1,49.574949,21.465380,1,341.489231,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,4,25.0,39628.99,3359.415833,4.0,6.0,7.0,2.0,23.0,7.0,...,1,502.38,34.663572,378.0,1,35.104023,24.028477,0,479.866228,1
99996,5,25.0,39628.99,3359.415833,4.0,6.0,7.0,2.0,18.0,7.0,...,1,502.38,40.565631,379.0,1,35.104023,24.028477,1,496.651610,1
99997,6,25.0,39628.99,3359.415833,4.0,6.0,7.0,2.0,27.0,6.0,...,1,502.38,41.255522,380.0,1,35.104023,24.028477,0,516.809083,1
99998,7,25.0,39628.99,3359.415833,4.0,6.0,7.0,2.0,20.0,6.0,...,1,502.38,33.638208,381.0,1,35.104023,24.028477,3,319.164979,2


In [26]:
y = credit['Credit_Score'].values

In [27]:
y

array([0, 0, 0, ..., 1, 2, 1])

In [28]:
X = credit.drop(columns = ['Credit_Score'], axis = 1)

In [30]:
credit

Unnamed: 0,Month,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,1,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,...,1,809.98,26.822620,265.0,1,49.574949,21.465380,2,312.494089,0
1,2,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,4.0,...,1,809.98,31.944960,266.0,1,49.574949,21.465380,3,284.629162,0
2,3,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,...,1,809.98,28.609352,267.0,1,49.574949,21.465380,4,331.209863,0
3,4,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,5.0,4.0,...,1,809.98,31.377862,268.0,1,49.574949,21.465380,5,223.451310,0
4,5,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,6.0,4.0,...,1,809.98,24.797347,269.0,1,49.574949,21.465380,1,341.489231,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,4,25.0,39628.99,3359.415833,4.0,6.0,7.0,2.0,23.0,7.0,...,1,502.38,34.663572,378.0,1,35.104023,24.028477,0,479.866228,1
99996,5,25.0,39628.99,3359.415833,4.0,6.0,7.0,2.0,18.0,7.0,...,1,502.38,40.565631,379.0,1,35.104023,24.028477,1,496.651610,1
99997,6,25.0,39628.99,3359.415833,4.0,6.0,7.0,2.0,27.0,6.0,...,1,502.38,41.255522,380.0,1,35.104023,24.028477,0,516.809083,1
99998,7,25.0,39628.99,3359.415833,4.0,6.0,7.0,2.0,20.0,6.0,...,1,502.38,33.638208,381.0,1,35.104023,24.028477,3,319.164979,2


In [31]:
sd = StandardScaler()
sd.fit(X)

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [33]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)

In [34]:
training_prediction = knn.predict(X_train)
test_prediction = knn.predict(X_test)

In [35]:
y_test

array([0, 1, 0, ..., 1, 1, 2])

In [36]:
from sklearn import metrics
print("Precision, Recall, Confusion matrix, in training\n")

# Precision Recall scores
print(metrics.classification_report(y_train, training_prediction, digits=3))

# Confusion matrix
print(metrics.confusion_matrix(y_train, training_prediction))

Precision, Recall, Confusion matrix, in training

              precision    recall  f1-score   support

           0      0.837     0.874     0.855     13406
           1      0.860     0.906     0.882     21685
           2      0.905     0.865     0.885     39909

    accuracy                          0.879     75000
   macro avg      0.867     0.882     0.874     75000
weighted avg      0.880     0.879     0.879     75000

[[11713    28  1665]
 [   83 19652  1950]
 [ 2198  3181 34530]]


In [37]:
print("Precision, Recall, Confusion matrix, in testing\n")

# Precision Recall scores
print(metrics.classification_report(y_test, test_prediction, digits=3))

# Confusion matrix
print(metrics.confusion_matrix(y_test, test_prediction))

Precision, Recall, Confusion matrix, in testing

              precision    recall  f1-score   support

           0      0.695     0.732     0.713      4422
           1      0.773     0.800     0.787      7313
           2      0.803     0.773     0.787     13265

    accuracy                          0.774     25000
   macro avg      0.757     0.769     0.762     25000
weighted avg      0.775     0.774     0.774     25000

[[ 3239    37  1146]
 [   87  5852  1374]
 [ 1335  1678 10252]]
