# Credit Risk Classification

## Initialization

In [3]:
import pandas as pd

# Lift pandas' display limits for this session: assigning None removes the
# cap on rendered rows, rendered columns, and per-cell column width.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

## Data Preparation

In [5]:
import pandas as pd

def load_data(fname: str) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame and report its shape on stdout.

    :param fname: Path (or buffer) of the CSV file; forwarded unchanged to
        ``pd.read_csv``.
    :type fname: str
    :return: The parsed contents of the file.
    :rtype: pandas.DataFrame
    """
    frame = pd.read_csv(fname)
    # The shape tuple is printed inside square brackets, e.g. "[(2, 3)]".
    shape = frame.shape
    print(f"Data Shape: [{shape}]")
    return frame

In [6]:
# Relative path of the raw credit-risk CSV.
FNAME = "./data/raw/credit_risk_dataset.csv"
# load_data also prints the loaded frame's shape as a side effect.
data = load_data(fname=FNAME)
# Trailing expression: the notebook renders the first rows for a quick look.
data.head()

Data Shape: [(32581, 12)]


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


### Split Feature and Target Dataset

In [16]:
def split_feature_target(
        data: pd.DataFrame,
        target_col: str = "loan_status"
    ) -> tuple[pd.DataFrame, pd.Series]:
    """
    Split a DataFrame into features (X) and target (y).

    The input frame is not modified: ``X`` is a new frame without the target
    column and ``y`` is the target column selected from ``data``. The shapes
    of the input and both outputs are printed to stdout.

    :param data: Input DataFrame containing the target column.
    :type data: pd.DataFrame
    :param target_col: Target column name, defaults to "loan_status".
    :type target_col: str, optional
    :raises KeyError: If ``target_col`` is not a column of ``data``.
    :return: Feature set (X) and target series (y), in that order.
    :rtype: tuple[pd.DataFrame, pd.Series]
    """
    # `columns=` states the intent directly instead of relying on axis=1.
    X = data.drop(columns=target_col)
    y = data[target_col]
    print(f"Original data shape: {data.shape}")
    print(f"X data shape: {X.shape}")
    print(f"y data shape: {y.shape}")
    return X, y

In [17]:
# Name of the label column that is separated from the features.
TARGET_COL = "loan_status"
# split_feature_target prints the shapes of the input, X and y.
X, y = split_feature_target(data=data, target_col=TARGET_COL)
# Trailing expression: the notebook renders the first rows of the features.
X.head()

Original data shape: (32581, 12)
X data shape: (32581, 11)
y data shape: (32581,)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4


### Split Train and Test Dataset

In [22]:
from sklearn.model_selection import train_test_split

def split_train_test(
    X: pd.DataFrame,
    y: pd.Series,
    test_size: float,
    random_state: int | None = None
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Split the dataset into X_train, X_test, y_train, y_test.

    The split is stratified on ``y``. The shape of each of the four parts is
    printed to stdout.

    :param X: A feature dataset.
    :type X: pd.DataFrame
    :param y: A target dataset, also used as the stratification labels.
    :type y: pd.Series
    :param test_size: Fraction of the samples to place in the test part
        (e.g. 0.2 for 20%); forwarded to ``train_test_split``.
    :type test_size: float
    :param random_state: Controls the shuffling applied to the data, defaults to None.
    :type random_state: int, optional
    :return: X_train, X_test, y_train, y_test. In that order.
    :rtype: tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )
    print(f"X train shape: {X_train.shape}")
    print(f"X test shape: {X_test.shape}")
    # This line reports the training target; it was previously mislabeled
    # as "y test shape".
    print(f"y train shape: {y_train.shape}")
    print(f"y test shape: {y_test.shape}\n")
    return X_train, X_test, y_train, y_test

In [23]:
# Two-stage split: first hold out 20% of the rows, then cut that hold-out in
# half to obtain the validation and test sets (roughly 80/10/10 overall).
# The same random_state is used in both stages.
# NOTE(review): the hold-out names are inconsistent (X_non_train vs
# y_not_train); they are left unchanged here in case later cells use them.
X_train, X_non_train, y_train, y_not_train = split_train_test(X, y, 0.2, random_state=42)
X_valid, X_test, y_valid, y_test = split_train_test(X_non_train, y_not_train, 0.5, random_state=42)

X train shape: (26064, 11)
X test shape: (6517, 11)
y test shape: (26064,)
y test shape: (6517,)

X train shape: (3258, 11)
X test shape: (3259, 11)
y test shape: (3258,)
y test shape: (3259,)



### Serialize Data

In [24]:
import joblib

def serialize_data(data: pd.DataFrame | pd.Series, path: str):
    """
    Write a pandas object to disk using ``joblib.dump``.

    The value returned by ``joblib.dump`` is discarded, so this function
    returns ``None``.

    :param data: Object to be written.
    :type data: pd.DataFrame | pd.Series
    :param path: Destination file path.
    :type path: str
    """
    # The second positional argument of joblib.dump is the target filename.
    joblib.dump(data, path)

In [25]:
# Directory that receives the six pickled splits.
# NOTE(review): this is the same folder that holds the raw CSV, so derived
# artefacts end up next to the raw data; confirm that this is intended.
DATA_PATH = "./data/raw"
# One file per split, named after the variable it stores.
serialize_data(X_train, f"{DATA_PATH}/X_train.pkl")
serialize_data(y_train, f"{DATA_PATH}/y_train.pkl")
serialize_data(X_test, f"{DATA_PATH}/X_test.pkl")
serialize_data(y_test, f"{DATA_PATH}/y_test.pkl")
serialize_data(X_valid, f"{DATA_PATH}/X_valid.pkl")
serialize_data(y_valid, f"{DATA_PATH}/y_valid.pkl")

### Deserialize Data

In [26]:
def deserialize_data(path: str) -> pd.DataFrame | pd.Series:
    """
    Load a file written with joblib and return it as a pandas object.

    The type check runs only after the object has been loaded, so it does
    not protect against malicious files. ``joblib.load`` is pickle-based;
    only load files from a trusted source.

    :param path: File path.
    :type path: str
    :raises TypeError: If the loaded object is neither a DataFrame nor a Series.
    :return: The restored pandas object.
    :rtype: pd.DataFrame | pd.Series
    """
    restored = joblib.load(path)

    # Success path first; anything that is not a pandas container is rejected.
    if isinstance(restored, (pd.DataFrame, pd.Series)):
        return restored

    raise TypeError(f"Expected DataFrame/Series, got {type(restored)}")