In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Global plot styling. The seaborn call runs after the matplotlib one, so
# where both touch the same settings the seaborn "dark" values presumably win.
plt.style.use("ggplot")
sns.set_style("dark")

In [None]:
# !mkdir -p data
# %pip install kaggle
# !kaggle datasets download -d parisrohan/credit-score-classification -p data/

In [None]:
# Extract the downloaded archive into data/. With -u, unzip only writes files
# that are missing or older than the archived copy, so re-running is cheap.
!unzip -u data/credit-score-classification.zip -d data

In [None]:
# Load the raw CSV files. Only a fixed, seeded random subset of the training
# rows is kept so the notebook stays usable on slow hardware.
train_df = (
    pd.read_csv("data/train.csv", decimal=".", engine="python")
    .sample(n=1000, random_state=42)
)
test_df = pd.read_csv("data/test.csv", decimal=".", engine="python")
train_df

# EDA
Objective: Start by getting a basic understanding of the dataset. This includes checking the data types of columns, identifying missing values (NaNs), and understanding the overall structure of the data.

In [None]:
# Column dtypes and non-null counts of the sampled training frame.
train_df.info()

How many unique clients do we have?

In [None]:
# "ID" is treated in this notebook as a unique identifier, so counting its
# values counts records rather than clients. "Customer_ID" presumably
# identifies the client; nunique() returns the number of distinct clients.
train_df["Customer_ID"].nunique()

Identifying columns that are incorrectly typed (e.g., numerical data stored as strings) and casting them to the correct data types. This is important for accurate analysis and modeling.

In [None]:
# Columns that should hold real numbers; they are converted with
# pd.to_numeric in the following cell. Displayed here before conversion.
problem_columns_float = [
    "Annual_Income",
    "Changed_Credit_Limit",
    "Outstanding_Debt",
    "Total_EMI_per_month",
    "Amount_invested_monthly",
    "Monthly_Balance",
]
train_df[problem_columns_float]

In [None]:
# Convert the numeric-looking text columns to float64. Values that cannot be
# parsed become NaN because of errors="coerce".
train_df[problem_columns_float] = train_df[problem_columns_float].apply(
    pd.to_numeric, errors="coerce"
)
train_df[problem_columns_float] = train_df[problem_columns_float].astype("float64")

# Fill the gaps with each column's median and assign the result back.
# Calling fillna(inplace=True) on `train_df.loc[:, cols]` would only modify
# the temporary copy returned by .loc and leave train_df unchanged.
float_medians = train_df[problem_columns_float].median()
train_df[problem_columns_float] = train_df[problem_columns_float].fillna(
    float_medians
)
train_df[problem_columns_float]

In [None]:
# Columns that should hold whole numbers; they are converted to a nullable
# integer dtype in the following cell. Displayed here before conversion.
problem_columns_int = ["Num_of_Loan", "Num_of_Delayed_Payment", "Age"]
train_df[problem_columns_int]

In [None]:
# Convert the count-like text columns to nullable Int32. Values that cannot
# be parsed become missing because of errors="coerce".
train_df[problem_columns_int] = train_df[problem_columns_int].apply(
    pd.to_numeric, errors="coerce"
)
train_df[problem_columns_int] = train_df[problem_columns_int].astype(pd.Int32Dtype())

# Fill the gaps with each column's median and assign the result back
# (fillna(inplace=True) on a `.loc[:, cols]` selection only changes a
# temporary copy). A median can be fractional, which an integer column cannot
# store, so it is rounded to a whole number first. Columns with no valid
# value have no median and are left untouched.
int_medians = (
    train_df[problem_columns_int].median().dropna().round().astype("int64")
)
train_df[problem_columns_int] = train_df[problem_columns_int].fillna(int_medians)
train_df[problem_columns_int]

Prepare separate variables for different column types:

In [None]:
# Partition the columns by dtype. The label column "Credit_Score" is removed
# from the categorical features and kept separately as `target`.
number_columns = train_df.select_dtypes(include="number").columns
object_columns = train_df.select_dtypes(include="object").columns
categorical_columns = object_columns.drop("Credit_Score")
feature_columns = categorical_columns.union(number_columns)
target = train_df["Credit_Score"]
number_columns, categorical_columns

In [None]:
# Summary statistics of the numeric columns after the type conversions above.
train_df.describe()

In [None]:
# Re-check dtypes and non-null counts now that the columns have been cast.
train_df.info()

## Some outliers

Identify and address outliers in the dataset, such as negative values where they don't make sense (e.g., age cannot be negative), or unrealistic values like an age of 8698.

In [None]:
# Flag rows holding impossible values: an age outside 0..100, or a negative
# count / delay. Rows where a comparison is undetermined (missing value) are
# not selected by the boolean indexing and therefore stay in the frame.
implausible_age = (train_df["Age"] < 0) | (train_df["Age"] > 100)
negative_value = (
    (train_df["Num_Bank_Accounts"] < 0)
    | (train_df["Num_of_Loan"] < 0)
    | (train_df["Num_of_Delayed_Payment"] < 0)
    | (train_df["Delay_from_due_date"] < 0)
)
outlier_index = train_df[implausible_age | negative_value].index
train_df = train_df.drop(index=outlier_index)
train_df

## Various plots
Using visualizations like box plots and pie charts to get insights into the distribution of data, detect outliers, and understand categorical distributions.

### Box plots

In [None]:
# One horizontal box plot per numeric column, laid out row by row on a grid
# with `nrows` rows.
box_data = train_df[number_columns]

nrows = 4
# Round the column count up so every feature gets a panel even when the
# number of features is not a multiple of nrows (floor division would make
# the grid too small and raise IndexError inside the loop).
ncols = max(1, -(-len(box_data.columns) // nrows))

# squeeze=False keeps `axes` two-dimensional even for a single row or column.
fig, axes = plt.subplots(nrows, ncols, figsize=(15, 10), squeeze=False)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, box_data.columns):
    # Missing values are dropped and the data cast to plain float64 so that
    # nullable-integer columns are passed to matplotlib as ordinary numbers.
    non_nan_data = box_data[column].dropna().astype("float64")
    ax.boxplot(non_nan_data, vert=False)
    ax.set_xlabel(column)

# Hide any panels left over at the end of the grid.
for unused_ax in flat_axes[len(box_data.columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

These box plots show that many columns contain outliers (points outside the whiskers), indicating possible data issues or extreme variability.

### Pie charts

In [None]:
# Frequency of every observed combination of the categorical columns
# (the label column included).
combination_columns = [
    "Month",
    "Occupation",
    "Type_of_Loan",
    "Credit_Mix",
    "Payment_of_Min_Amount",
    "Payment_Behaviour",
    "Credit_Score",
]
train_df[combination_columns].value_counts()

In [None]:
# One pie chart per categorical column, drawn on a 2 x 3 grid.
pie_data = train_df[
    [
        "Month",
        "Occupation",
        "Credit_Mix",
        "Payment_of_Min_Amount",
        "Payment_Behaviour",
        "Credit_Score",
    ]
]

nrows = 2
ncols = 3
fig, axes = plt.subplots(nrows, ncols, figsize=(15, 8))

for position, column in enumerate(pie_data.columns):
    # divmod yields the (row, col) grid coordinates for this panel.
    ax = axes[divmod(position, ncols)]

    # Number of rows per category, ordered by category value.
    category_counts = pie_data.groupby(column).size()

    ax.pie(
        category_counts.to_numpy(),
        labels=list(category_counts.index),
        autopct="%1.1f%%",
        startangle=90,
    )
    ax.set_xlabel(column)

plt.tight_layout()
plt.show()

These pie charts show the distribution of categories within each categorical column. They reveal that some categories dominate the dataset, while others are underrepresented.

#### Donut charts (nested pie charts)

In [None]:
# Columns used for the nested pie ("donut") charts. "Credit_Score" feeds the
# inner ring; each of the other columns gets its own panel as the outer ring.
pie_data = train_df[
    [
        "Month",
        "Occupation",
        "Credit_Mix",
        "Payment_of_Min_Amount",
        "Payment_Behaviour",
        "Credit_Score",
    ]
]

# Group by 'Credit_Score' for the inner layer
inner_group = pie_data.groupby('Credit_Score').size().reset_index(name='counts')
inner_labels = inner_group['Credit_Score']
inner_sizes = inner_group['counts']

# Define the outer layers (columns other than 'Credit_Score')
outer_columns = ["Month", "Occupation", "Credit_Mix", "Payment_of_Min_Amount", "Payment_Behaviour"]

nrows = 2
ncols = 3

fig, axes = plt.subplots(nrows, ncols, figsize=(25, 10))

for i, column in enumerate(outer_columns):
    # Grouping by the label first orders the outer wedges by credit score,
    # the same order as the inner ring, so each outer segment sits (roughly)
    # over the score class it belongs to.
    outer_group = pie_data.groupby(['Credit_Score', column]).size().reset_index(name='counts')

    row = i // ncols
    col = i % ncols

    # Create the outer labels and sizes based on the groups
    outer_labels = outer_group[column]
    outer_sizes = outer_group['counts']

    # Create the donut chart: inner ring = credit score, outer ring = column
    axes[row, col].pie(inner_sizes, labels=inner_labels, radius=1, wedgeprops=dict(width=0.3, edgecolor='w'), autopct="%1.1f%%", startangle=90)
    axes[row, col].pie(outer_sizes, labels=outer_labels, radius=1.3, wedgeprops=dict(width=0.3, edgecolor='w'), autopct="%1.1f%%", startangle=90)

    # Add title for the outer layer
    axes[row, col].set_title(column)

# The grid has more panels than there are outer columns; hide the leftovers
# so no empty axes frame is rendered.
for unused_ax in axes.ravel()[len(outer_columns):]:
    unused_ax.set_visible(False)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

## Dealing with Missing Values & Feature Extraction
**Objective**: Identify columns with missing values and decide how to handle them (e.g., drop, fill, or impute). Also, extract new features from existing data to enhance the dataset.

Finding nulls in data:

In [None]:
# Number of missing values per column.
train_df.isnull().sum()

The 'Name' column is not crucial for our analysis since we have 'ID' as a unique identifier. We can safely drop it, together with the other personal-identifier columns 'Customer_ID' and 'SSN'.

In [None]:
def delete_col(cols):
    """
    Drop columns from ``train_df`` and from the column-name indexes tracking them.

    Parameters
    ----------
    cols : str or list of str
        Name(s) of the columns to remove. A single name may be given as a
        plain string.

    Notes
    -----
    Works on the notebook globals: ``train_df`` is modified in place, and
    ``categorical_columns`` / ``number_columns`` are rebound without the
    removed names. A column listed in neither index (for example a boolean
    indicator column) is only removed from ``train_df``.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``train_df``.
    """
    global train_df, categorical_columns, number_columns

    # Iterating a bare string would visit its characters, so wrap it first.
    if isinstance(cols, str):
        cols = [cols]

    train_df.drop(columns=cols, inplace=True)

    for col in cols:
        if col in categorical_columns:
            categorical_columns = categorical_columns.drop(col)
        elif col in number_columns:
            # Guarded: Index.drop raises KeyError for a label it does not hold.
            number_columns = number_columns.drop(col)


# Personal-identifier columns that are removed before modelling.
identifier_columns = ["Name", "Customer_ID", "SSN"]
delete_col(identifier_columns)

Although usage of `global` is not recommended, in this case it is okay.

In [None]:
# Frequency of each distinct raw Credit_History_Age value.
train_df["Credit_History_Age"].value_counts()

### Transforming `Credit_History_Age`

In [None]:
# Raw Credit_History_Age column, shown before it is converted below.
train_df["Credit_History_Age"]

Convert the `Credit_History_Age` column to an integer format (total number of months) to facilitate analysis and avoid creating too many columns during one-hot encoding.

In [None]:
# Pull the two numbers out of strings shaped like "<Y> Years and <M> Months".
# Rows that do not match the pattern yield missing values in both columns.
split_credit_history = train_df["Credit_History_Age"].str.extract(
    r"(\d+)\sYears\sand\s(\d+)\sMonths"
)

# Nullable Int32 keeps the missing rows as <NA> instead of failing the cast.
history_years = split_credit_history[0].astype(pd.Int32Dtype())
history_months = split_credit_history[1].astype(pd.Int32Dtype())
total_months = history_years * 12 + history_months

train_df["Credit_History_Age"] = total_months
total_months

### Handling `Type_of_Loan`
By splitting 'Type_of_Loan' into one boolean indicator column per loan type, we can handle cases where a single individual has multiple loans, making the data more granular and accurate for analysis.

In [None]:
# The 20 most frequent raw Type_of_Loan strings.
train_df["Type_of_Loan"].value_counts().head(20)

In [None]:
# Loan types that each get their own boolean indicator column.
loan_types = [
    "Not Specified",
    "Credit-Builder Loan",
    "Personal Loan",
    "Debt Consolidation Loan",
    "Student Loan",
    "Payday Loan",
    "Mortgage Loan",
    "Auto Loan",
    "Home Equity Loan",
]

# Tokenise the column once. Missing values are replaced on the derived
# Series: fillna(inplace=True) on a column selection is chained assignment,
# which pandas may apply to a temporary copy, and a surviving NaN would then
# break the split.
# ", and " before the last item is normalised to ", " so that the final loan
# in a list written as "A, B, and C" is recognised as well.
# NOTE(review): this assumes that list format - confirm against the
# value_counts output of the previous cell.
loan_tokens = (
    train_df["Type_of_Loan"]
    .fillna("")
    .str.replace(", and ", ", ", regex=False)
    .str.split(", ")
)

for suffix in loan_types:
    # `loan_type=suffix` binds the current value for this lambda.
    train_df["Type_of_Loan_" + suffix] = loan_tokens.apply(
        lambda tokens, loan_type=suffix: loan_type in tokens
    )

delete_col(["Type_of_Loan"])

In [None]:
# Inspect the frame after the loan-type indicator columns were added.
train_df

In [None]:
# "Credit_History_Age" now holds integer month counts, so move its name from
# the categorical index to the end of the numeric one.
converted_column = "Credit_History_Age"
categorical_columns = categorical_columns.drop(converted_column)
number_columns = number_columns.append(pd.Index([converted_column]))
number_columns, categorical_columns

In [None]:
# One-hot encode the categorical features, dropping the first level of each
# so the remaining dummy columns are not redundant.
feature_dummy_columns = [
    "Month",
    "Occupation",
    "Credit_Mix",
    "Payment_of_Min_Amount",
    "Payment_Behaviour",
]
train_df = pd.get_dummies(train_df, columns=feature_dummy_columns, drop_first=True)

# The label is encoded separately and keeps all of its levels.
train_df = pd.get_dummies(train_df, columns=["Credit_Score"])
train_df

In [None]:
# Dtypes and non-null counts after one-hot encoding.
train_df.info()

In [None]:
from sklearn.impute import KNNImputer, SimpleImputer

# Keep the row identifier aside; everything else is passed to the imputer.
id_column = train_df["ID"]
features = train_df.drop(columns="ID")

# n_neighbors=1 is used because the cell otherwise ran for about 7 minutes.
knn_imputer = KNNImputer(n_neighbors=1)
# knn_imputer = SimpleImputer(strategy='median')

# fit_transform returns a plain array; it is wrapped in a frame in the next cell.
X_knn_imputed = knn_imputer.fit_transform(features)
X_knn_imputed

In [None]:
# Wrap the imputer's array output in a frame with the original row and
# column labels, then restore each column's original dtype.
original_dtypes = features.dtypes.to_dict()
train_df_knn_imputed = pd.DataFrame(
    X_knn_imputed,
    columns=features.columns,
    index=features.index,
).astype(original_dtypes)
result_df = train_df_knn_imputed.copy()
# result_df["ID"] = id_column.reset_index(drop=True)
result_df

In [None]:
# The identifier column that was set aside before imputation.
id_column

In [None]:
# Dtypes and non-null counts of the imputed frame.
train_df_knn_imputed.info()

# Baseline
Creating a simple baseline model before diving into complex modeling. This helps to understand the minimum performance we can expect.

In [None]:
# Remove the one-hot encoded label columns so that only features remain.
label_dummy_columns = [
    "Credit_Score_Good",
    "Credit_Score_Standard",
    "Credit_Score_Poor",
]
preprocessed_df = train_df_knn_imputed.drop(columns=label_dummy_columns)
preprocessed_df

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

# `target` was captured before rows were dropped, so select only the labels
# of the rows that are still present in the feature frame.
baseline_labels = target[preprocessed_df.index]

X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_df,
    baseline_labels,
    test_size=0.33,
    random_state=42,
)

# Baseline model fitted on the training split, then used to predict the test split.
dummy_classifier = DummyClassifier(random_state=42)
dummy_classifier.fit(X_train, y_train)
dummy_classifier.predict(X_test)

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Balanced accuracy of the baseline predictions on the held-out split.
baseline_predictions = dummy_classifier.predict(X_test)
balanced_accuracy_score(y_test, baseline_predictions)

# Saving preprocessed data
Jupyter provides a %store magic command to pass variables between notebooks.

In [None]:
# Persist the split so another notebook can load it back with `%store -r`.
%store X_train X_test y_train y_test