In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from scipy.stats import zscore
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [11]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Supervised Learning Mini Project/Cleaned Train.csv')
df.head()

Unnamed: 0,ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,1,23,12,19114.12,1824.843333,3,4,3,4,...,4,1,809.98,26.82262,1,49.574949,80.415295,3,312.494089,2
1,0x1603,2,23,12,19114.12,1824.843333,3,4,3,4,...,4,1,809.98,31.94496,1,49.574949,118.280222,4,284.629162,2
2,0x1604,3,23,12,19114.12,1824.843333,3,4,3,4,...,4,1,809.98,28.609352,1,49.574949,81.699521,5,331.209863,2
3,0x1605,4,23,12,19114.12,1824.843333,3,4,3,4,...,4,1,809.98,31.377862,1,49.574949,199.458074,6,223.45131,2
4,0x1606,5,23,12,19114.12,1824.843333,3,4,3,4,...,4,1,809.98,24.797347,1,49.574949,41.420153,2,341.489231,2


In [12]:
df.set_index('ID', drop=True, inplace=True)

X = df.drop('Credit_Score', axis=1)
y = df[['Credit_Score']]
y['Credit_Score'] = y['Credit_Score'].astype('category')

In [13]:
# Converting Cateogry columns to 'category' data type
cat_col = ['Month', 'Occupation', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']
for i in cat_col:
    X[i] = X[i].astype('category')

In [14]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 0x1602 to 0x25fed
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   Month                     100000 non-null  category
 1   Age                       100000 non-null  int64   
 2   Occupation                100000 non-null  category
 3   Annual_Income             100000 non-null  float64 
 4   Monthly_Inhand_Salary     100000 non-null  float64 
 5   Num_Bank_Accounts         100000 non-null  int64   
 6   Num_Credit_Card           100000 non-null  int64   
 7   Interest_Rate             100000 non-null  int64   
 8   Num_of_Loan               100000 non-null  int64   
 9   Delay_from_due_date       100000 non-null  int64   
 10  Num_of_Delayed_Payment    100000 non-null  int64   
 11  Changed_Credit_Limit      100000 non-null  float64 
 12  Num_Credit_Inquiries      100000 non-null  int64   
 13  Credit_Mix                10

In [15]:
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X, y, test_size=0.15, random_state=42)
X_train_final.reset_index(drop=True, inplace=True)
X_test_final.reset_index(drop=True, inplace=True)
y_train_final.reset_index(drop=True, inplace=True)
y_test_final.reset_index(drop=True, inplace=True)

In [16]:
estimators = [DecisionTreeClassifier(criterion='gini', max_depth=7, max_features=21, min_samples_leaf=8),
              RandomForestClassifier(oob_score=True, n_estimators=150, min_samples_leaf=8, max_features=21, max_depth=7, criterion='gini'),
              GradientBoostingClassifier(validation_fraction=0.2, n_estimators=200, min_samples_leaf=7, max_features=21, max_depth=9,
                                         criterion='squared_error')]

all_predictions = pd.DataFrame()
trained_models = []
kf = KFold(10, shuffle=True)
checker = 0
for i in tqdm(range(len(estimators))):
    for train_index, test_index in kf.split(X_train_final):
        X_train, X_test = X_train_final.loc[train_index], X_train_final.loc[test_index]
        y_train, y_test = y_train_final.loc[train_index], y_train_final.loc[test_index]

        model = estimators[i]
        m = model.fit(X_train, y_train)
        trained_models.append(m)
        checker += 1
        print(checker, end='')

        model_pred_train = pd.DataFrame(model.predict(X_train), index=train_index)
        model_pred_test = pd.DataFrame(model.predict(X_test), index=test_index)

        a = pd.concat([model_pred_train, model_pred_test], axis=0)
        all_predictions = pd.concat([all_predictions, a.sort_index()], axis=1)

all_predictions.columns = list(range(30))

  0%|          | 0/3 [00:00<?, ?it/s]

123456789

 33%|███▎      | 1/3 [00:08<00:17,  8.86s/it]

1011121314151617181920

 67%|██████▋   | 2/3 [14:24<08:27, 507.25s/it]

21222324252627282930

100%|██████████| 3/3 [2:01:52<00:00, 2437.62s/it]


In [17]:
all_predictions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,0,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84995,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
84996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
84997,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
84998,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [18]:
trained_models[0]

In [20]:
y_train_final['Credit_Score'] = y_train_final['Credit_Score'].astype('int64')
y_test_final['Credit_Score'] = y_test_final['Credit_Score'].astype('int64')

In [21]:
# Main Model
model_main = XGBClassifier(subsample=0.6, scale_pos_weight=1, reg_lambda=0, reg_alpha=0.1,
                           n_estimators=150, min_child_weight=1, max_depth=7, learning_rate=0.01, gamma=0, colsample_bytree=1.0)
model_main.fit(all_predictions, y_train_final)

In [22]:
X_test_final.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
0,2,30,8,43391.96,3688.996667,1,5,10,0,9,...,10.72,0,1,1468.28,29.112468,1,0.0,53.969385,1,554.930282
1,1,18,12,14351.21,1274.934167,8,7,18,6,18,...,21.91,9,0,4766.87,39.849732,2,61.098718,118.080214,6,238.314485
2,1,41,2,130125.04,10871.753333,5,2,11,2,3,...,3.82,3,1,1195.05,21.542892,1,204.28565,1255.899302,6,819.979411
3,4,35,8,20107.21,1631.600833,6,6,32,2,15,...,10.4,11,2,2544.6,32.423759,2,27.106395,66.819775,6,359.233914
4,8,27,3,92186.19,7487.1825,1,2,12,4,1,...,9.74,4,1,809.01,39.543131,1,185.797654,115.520039,2,697.400557


In [23]:
test_predictions = pd.DataFrame(columns=list(range(30)))

dummy = []
for i in range(len(X_test_final)):
    for j in range(len(trained_models)):
        dummy.append(trained_models[j].predict(X_test_final[X_test_final.index == i])[0])
    test_predictions.loc[len(test_predictions.index)] = dummy
    dummy = []

In [24]:
test_predictions.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
9,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [25]:
y_pred = pd.DataFrame(model_main.predict(test_predictions), columns=['Credit_Score'], index=y_test_final.index)

In [26]:
y_test_final

Unnamed: 0,Credit_Score
0,2
1,0
2,2
3,0
4,2
...,...
14995,0
14996,1
14997,2
14998,2


In [27]:
print(y_pred.shape)
print(y_test_final.shape)

(15000, 1)
(15000, 1)


In [28]:
model_main.score(test_predictions, y_test_final)

0.8068

In [29]:
confusion_matrix(y_test_final, y_pred)

array([[3536,  842,   43],
       [ 855, 6560,  537],
       [  14,  607, 2006]])