In [1]:
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Single random seed, passed as random_state to train_test_split and
# RandomForestClassifier so the split and the model are reproducible.
seed = 42

# Data preparation

In [3]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Placeholder cleaning step: currently returns ``df`` unchanged.

    The notebook calls this on both the training and the test frame right
    after loading, so any cleaning added here applies to both.
    """
    return df

def prep_features(df):
    """Build the numeric feature matrix from a raw credit-data frame.

    Steps, in order:
      1. Drop the label and identifier columns (``Credit_Score``, ``ID``,
         ``Customer_ID``, ``SSN``) when they are present.
      2. One-hot encode ``Credit_Mix``, ``Payment_of_Min_Amount`` and
         ``Payment_Behaviour`` (first level of each dropped) and append the
         indicator columns at the end of the frame.
      3. Convert ``Credit_History_Age`` to a total number of months.
      4. Drop every column that is still of ``object`` dtype.

    The input frame is not modified; a new frame is returned.
    """
    id_and_label_cols = ["Credit_Score", "ID", "Customer_ID", "SSN"]
    categorical_cols = ["Credit_Mix", "Payment_of_Min_Amount", "Payment_Behaviour"]

    def months_of_history(text):
        # Token 0 is the year count and token 3 the month count, i.e. the
        # value is expected to look like "<years> Years and <months> Months".
        tokens = text.split()
        return int(tokens[0]) * 12 + int(tokens[3])

    features = df.drop(columns=id_and_label_cols, errors='ignore')

    indicators = pd.get_dummies(
        features[categorical_cols],
        prefix=categorical_cols,
        drop_first=True,
    )
    features = pd.concat([features.drop(columns=categorical_cols), indicators], axis=1)

    features["Credit_History_Age"] = features["Credit_History_Age"].apply(months_of_history)

    # Anything still stored as object cannot be fed to the model as-is.
    leftover_object_cols = list(features.select_dtypes(include='object'))
    return features.drop(columns=leftover_object_cols)

In [4]:
# Raw training frame: kept under `df` because the label column and the
# subtask answers are read from it later.
df = clean_df(pd.read_csv("train_data.csv"))

# Model-ready feature matrix derived from the raw frame.
df_train = prep_features(df)

In [5]:
# Preview the first rows of the engineered feature frame.
df_train.head()

Unnamed: 0,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_Credit_Inquiries,Credit_Utilization_Ratio,Credit_History_Age,Total_EMI_per_month,Credit_Mix_Good,Credit_Mix_Standard,Credit_Mix__,Payment_of_Min_Amount_No,Payment_of_Min_Amount_Yes,Payment_Behaviour_High_spent_Large_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments
0,4997.53,9,7,19,58,8.0,31.737712,166,339.971582,False,False,True,False,True,False,False,True,False,False,False
1,1334.804167,8,583,22,20,9.0,27.414562,27,67.975819,False,False,False,False,True,False,False,True,False,False,False
2,8600.05,8,6,4,18,2.0,36.51616,351,143.856735,True,False,False,True,False,True,False,False,False,False,False
3,11888.193333,5,2,4,3,2.0,36.955252,369,182.160424,True,False,False,True,False,False,True,False,False,False,False
4,1851.335,4,3,6,10,0.0,28.542585,242,25.580281,False,False,True,True,False,False,False,False,True,False,False


# Exploratory data analysis

In [6]:
# Summary statistics of the numeric feature columns.
# NOTE(review): the recorded output shows maxima far above the 75th percentile
# (e.g. Interest_Rate 5789 vs. 22, Num_Bank_Accounts 1798 vs. 8) and negative
# minima for Num_Bank_Accounts and Delay_from_due_date. These look like
# outliers or placeholder values; clean_df currently passes them through.
df_train.describe()

Unnamed: 0,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_Credit_Inquiries,Credit_Utilization_Ratio,Credit_History_Age,Total_EMI_per_month
count,44207.0,44207.0,44207.0,44207.0,44207.0,44207.0,44207.0,44207.0,44207.0
mean,4019.547959,16.771281,23.223109,77.300925,22.054652,26.791504,32.21341,213.094736,1451.435871
std,3092.839667,114.890173,130.872496,483.025792,15.253947,185.571018,5.055008,100.125007,8446.131137
min,303.645417,-1.0,0.0,1.0,-5.0,0.0,21.264254,2.0,4.462837
25%,1574.695833,4.0,4.0,8.0,10.0,3.0,28.035357,134.0,41.132527
50%,2986.346667,6.0,6.0,15.0,19.0,6.0,32.239814,212.0,78.369329
75%,5704.22,8.0,7.0,22.0,29.0,9.0,36.402076,292.0,168.955552
max,15204.633333,1798.0,1499.0,5789.0,67.0,2594.0,49.564519,404.0,82331.0


# Models

In [7]:
# Hold out 20% of the rows for validation. The label is read from the raw
# frame because prep_features removes Credit_Score from the feature matrix.
X_train, X_val, y_train, y_val = train_test_split(
    df_train,
    df["Credit_Score"],
    random_state=seed,
    test_size=0.2,
)

In [8]:
def evaluate(clf):
    """Return a pessimistic cross-validation score for ``clf``.

    Runs 3-fold cross-validation on the notebook-level ``X_train`` /
    ``y_train`` and returns the mean fold score minus the standard deviation
    of the fold scores, so a model with unstable folds scores lower.
    """
    fold_scores = cross_val_score(clf, X_train, y_train, cv=3)
    return fold_scores.mean() - fold_scores.std()

In [9]:
# Fit a 300-tree random forest on the training split (fit returns the estimator).
rf = RandomForestClassifier(random_state=seed, n_estimators=300).fit(X_train, y_train)

# Score on the held-out validation split.
print(rf.score(X_val, y_val))

# Cell output: mean minus standard deviation of the 3-fold CV scores.
evaluate(rf)

0.744288622483601


np.float64(0.7215864193836834)

In [10]:
# Model selected for the submission; the prediction cell calls clf.predict.
clf = rf

# Submission

In [11]:
test_df = pd.read_csv("test_data.csv")
test_df = clean_df(test_df)
features = prep_features(test_df)

# prep_features one-hot encodes the training and test files independently, so
# a category that is absent from one of them changes the produced columns.
# Check the schema here and fail with a readable message instead of a late
# error (or silently misaligned inputs) inside clf.predict.
missing_cols = df_train.columns.difference(features.columns).tolist()
extra_cols = features.columns.difference(df_train.columns).tolist()
assert not missing_cols and not extra_cols, (
    f"test features do not match training features: "
    f"missing={missing_cols}, extra={extra_cols}"
)

# Same set of columns is guaranteed above; also enforce the training order.
features = features[df_train.columns]

In [12]:
# Subtask 1: number of rows in the engineered training frame.
subtask1 = df_train.shape[0]

# Subtask 2: mean monthly in-hand salary, rounded down, over rows whose
# credit utilization ratio is at least 25.
high_utilization = df_train["Credit_Utilization_Ratio"] >= 25
subtask2 = math.floor(df_train.loc[high_utilization, "Monthly_Inhand_Salary"].mean())

# Subtask 3: number of distinct values in the raw Month column.
subtask3 = df["Month"].nunique()

# Subtask 4: number of distinct SSN values that end with "20".
ssn_values = df["SSN"]
subtask4 = ssn_values[ssn_values.str.endswith('20')].nunique()

In [13]:
# Scalar answers for subtasks 1-4; each uses datapointID 1.
subtasks12 = pd.DataFrame({
    "subtaskID": [1, 2, 3, 4],
    "datapointID": 1,
    "answer": [subtask1, subtask2, subtask3, subtask4]
})

# Per-row predictions for subtask 5, keyed by the test row ID.
# Deliberately not named `subtask3`: that name holds the scalar answer used
# above, and rebinding it to a DataFrame would corrupt the subtask-3 answer
# whenever this cell is run a second time.
subtask5 = pd.DataFrame({
    "subtaskID": 5,
    "datapointID": test_df["ID"],
    "answer": clf.predict(features)
})

submission = pd.concat([subtasks12, subtask5])
submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,1,44207
1,2,1,4106
2,3,1,8
3,4,1,109
0,5,0x1c2a8,0


In [14]:
# Write the combined answers to disk; index=False leaves the DataFrame index
# out of the CSV.
submission.to_csv("submission.csv", index=False)