In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
!mkdir data
%pip install kaggle
!cd data
!kaggle datasets download -d parisrohan/credit-score-classification
!unzip -f credit-score-classification.zip

In [None]:
# Both splits are read with identical parser settings.
csv_options = {"decimal": ".", "engine": "python"}

train_df = pd.read_csv("data/train.csv", **csv_options)
test_df = pd.read_csv("data/test.csv", **csv_options)

train_df

# EDA

Let's start with basic information about the dataset: the column types, how many NaNs each column has, etc.

In [None]:
# Print each column's dtype and non-null count.
train_df.info()

It seems that many numeric features were not cast to the correct data type. Because of that, we need to manually cast the following columns to the right type:

In [None]:
# Columns that should be converted to floating-point numbers.
problem_columns_float = [
    "Annual_Income",
    "Changed_Credit_Limit",
    "Outstanding_Debt",
    "Total_EMI_per_month",
    "Amount_invested_monthly",
    "Monthly_Balance",
]
# Preview the raw values before conversion.
train_df[problem_columns_float]

In [None]:
# Parse the columns as numbers; anything that cannot be parsed becomes NaN.
# NOTE(review): unparseable entries are replaced by the column median below --
# confirm they do not just carry stray characters that could be cleaned instead.
float_features = (
    train_df[problem_columns_float]
    .apply(pd.to_numeric, errors="coerce")
    .astype("float64")
)

# Assign the filled result back. Calling fillna(..., inplace=True) on
# train_df[problem_columns_float] only modifies a temporary copy, so the NaNs
# would silently stay in train_df.
train_df[problem_columns_float] = float_features.fillna(float_features.median())

train_df[problem_columns_float]

In [None]:
# Columns that should be converted to integers.
problem_columns_int = ["Num_of_Loan", "Num_of_Delayed_Payment", "Age"]
# Preview the raw values before conversion.
train_df[problem_columns_int]

In [None]:
# Parse the columns as numbers (unparseable entries become missing) and store
# them as nullable 32-bit integers so missing values can coexist with ints.
int_features = (
    train_df[problem_columns_int]
    .apply(pd.to_numeric, errors="coerce")
    .astype(pd.Int32Dtype())
)

# A median can land on x.5, which cannot be stored in an integer column,
# so round it before using it as the fill value.
int_medians = int_features.median().round().astype(pd.Int32Dtype())

# Assign the filled result back. Calling fillna(..., inplace=True) on
# train_df[problem_columns_int] only modifies a temporary copy, so the missing
# values would silently stay in train_df.
train_df[problem_columns_int] = int_features.fillna(int_medians)

In [None]:
# Inspect two of the converted integer columns.
train_df[["Num_of_Loan", "Num_of_Delayed_Payment"]]

Prepare separate variables for different column types:

In [None]:
# The label column is kept apart from the feature groups.
target_column = "Credit_Score"

# Group the columns by dtype: numeric ones versus string ("object") ones.
number_columns = train_df.select_dtypes(include="number").columns
object_columns = train_df.select_dtypes(include="object").columns
categorical_columns = object_columns.drop(target_column)

# All feature names: the categorical and numeric groups combined.
feature_columns = categorical_columns.union(number_columns)

number_columns, categorical_columns

Finding nulls in data:

In [None]:
# Number of missing values per column.
train_df.isnull().sum()

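The `Name` column carries no useful information for the model, and neither do the identifier columns (`ID`, `Customer_ID`, `SSN`), so we can drop them. Let's create an auxiliary function that deletes columns both from the DataFrame and from the column-index variables: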

In [None]:
def delete_col(cols):
    """Drop columns from ``train_df`` and stop tracking them in the column indexes.

    Rebinds the notebook-level ``train_df``, ``categorical_columns`` and
    ``number_columns`` variables.

    Args:
        cols: list of column labels to remove; each must be a column of
            ``train_df``.

    Raises:
        KeyError: if a label is not a column of ``train_df`` (raised by
            ``DataFrame.drop`` before any index variable is touched).
    """
    global train_df, categorical_columns, number_columns
    train_df = train_df.drop(columns=cols)

    for col in cols:
        if col in categorical_columns:
            categorical_columns = categorical_columns.drop(col)
        elif col in number_columns:
            number_columns = number_columns.drop(col)
        # A column tracked in neither index (e.g. a boolean one) is only
        # removed from the frame instead of raising KeyError here.


# Remove the name and identifier columns from the frame and the column indexes.
delete_col(["Name", "ID", "Customer_ID", "SSN"])

Although usage of `global` is not recommended, in this case it is okay.

In [None]:
# Frequency of each distinct Credit_History_Age value.
train_df["Credit_History_Age"].value_counts()

## `Credit_History_Age`

In [None]:
# Look at the raw column before it is parsed.
train_df["Credit_History_Age"]

It is better to convert this column to a single number (the total number of months), so that one-hot encoding does not create an excessive number of columns.

In [None]:
# Capture the year count (group 0) and the month count (group 1) from each string.
split_credit_history = train_df["Credit_History_Age"].str.extract(
    r"(\d+)\sYears\sand\s(\d+)\sMonths"
)

# Nullable integers let rows without a match stay missing instead of failing.
years_part = split_credit_history[0].astype(pd.Int32Dtype())
months_part = split_credit_history[1].astype(pd.Int32Dtype())
total_months = years_part * 12 + months_part

# Replace the text column with the month count.
train_df["Credit_History_Age"] = total_months
total_months

## `Type_of_Loan`

In [None]:
# The 20 most frequent raw Type_of_Loan values.
train_df["Type_of_Loan"].value_counts().head(20)

In [None]:
# Loan names that each get their own boolean indicator column.
loan_types = [
    "Not Specified",
    "Credit-Builder Loan",
    "Personal Loan",
    "Debt Consolidation Loan",
    "Student Loan",
    "Payday Loan",
    "Mortgage Loan",
    "Auto Loan",
    "Home Equity Loan",
]


def _loan_tokens(raw):
    """Return the set of loan names listed in one raw ``Type_of_Loan`` string."""
    tokens = set()
    for token in raw.split(", "):
        # NOTE(review): the last entry of a multi-loan value presumably reads
        # "and <loan>"; verify against the value_counts() output. Without this
        # strip, such an entry would never equal a name in loan_types.
        if token.startswith("and "):
            token = token[len("and "):]
        tokens.add(token)
    return tokens


# Work on a filled copy of the column. A chained
# train_df["Type_of_Loan"].fillna("", inplace=True) is not guaranteed to reach
# train_df (it is a no-op under pandas Copy-on-Write), which would leave NaN
# values that have no .split().
loans_per_row = train_df["Type_of_Loan"].fillna("").apply(_loan_tokens)

for suffix in loan_types:
    # True where the row lists this loan type at least once.
    train_df["Type_of_Loan_" + suffix] = loans_per_row.apply(
        lambda loans, wanted=suffix: wanted in loans
    )

# The raw text column is fully represented by the indicator columns now.
delete_col(["Type_of_Loan"])

In [None]:
# Show the frame with the new Type_of_Loan_* columns.
train_df

In [None]:
# Credit_History_Age was replaced by a month count, so it moves from the
# categorical group to the numeric group.
reclassified_column = "Credit_History_Age"

categorical_columns = categorical_columns.drop(reclassified_column)
number_columns = number_columns.append(pd.Index([reclassified_column]))

number_columns, categorical_columns

In [None]:
# Columns to expand into indicator columns (the Credit_Score label is among them).
one_hot_columns = [
    "Month",
    "Occupation",
    "Credit_Mix",
    "Payment_of_Min_Amount",
    "Payment_Behaviour",
    "Credit_Score",
]

# drop_first=True leaves out the first level of every encoded column.
train_df = pd.get_dummies(train_df, columns=one_hot_columns, drop_first=True)
train_df

In [None]:
# Re-check dtypes and non-null counts after encoding.
train_df.info()

In [None]:
from sklearn.impute import KNNImputer

# Impute the remaining missing values from the 2 nearest neighbours of each row.
knn_imputer = KNNImputer(n_neighbors=2)

# The imputed data is stored under a separate name; train_df is not reassigned here.
train_df_knn_imputed = knn_imputer.fit_transform(train_df)

In [None]:
# Summary statistics of the columns in train_df.
train_df.describe()

In [None]:
import seaborn as sns

# Baseline

In [None]:
from sklearn.dummy import DummyClassifier

# dummy_classifier = DummyClassifier(random_state=42)

# dummy_classifier.fit-