In [26]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# Silence every warning for the rest of the session
warnings.filterwarnings("ignore")

# Mount Google Drive (Colab) and read the two source tables from it
from google.colab import drive
drive.mount("/content/drive")
data = pd.read_csv("/content/drive/MyDrive/application_record.csv", encoding="utf-8")
record = pd.read_csv("/content/drive/MyDrive/credit_record.csv", encoding="utf-8")

# Readable replacements for the raw application column names
column_mapping = {
    "CODE_GENDER": "Gender",
    "FLAG_OWN_CAR": "Own_Car",
    "FLAG_OWN_REALTY": "Own_Realty",
    "CNT_CHILDREN": "Children_Count",
    "AMT_INCOME_TOTAL": "Income",
    "NAME_EDUCATION_TYPE": "Education",
    "NAME_FAMILY_STATUS": "Family_Status",
    "NAME_HOUSING_TYPE": "Housing_Type",
    "DAYS_BIRTH": "Birthday",
    "DAYS_EMPLOYED": "Employment_Date",
    "FLAG_MOBIL": "Own_Mobile",
    "FLAG_WORK_PHONE": "Own_Work_Phone",
    "FLAG_PHONE": "Own_Phone",
    "FLAG_EMAIL": "Own_Email",
    "CNT_FAM_MEMBERS": "Family_Member_Count",
    "NAME_INCOME_TYPE": "Income_Type",
}

# Preprocessing: keep the last row for each ID, discard OCCUPATION_TYPE,
# then apply the readable column names.
data = (
    data
    .drop_duplicates("ID", keep="last")
    .drop(columns="OCCUPATION_TYPE")
    .rename(columns=column_mapping)
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Smallest MONTHS_BALANCE recorded for each ID, exposed as the "begin_month" feature.
# .min() is called directly instead of agg(min): handing the Python builtin to agg
# raises a FutureWarning in pandas 2.x, while the grouped .min() gives the same values.
open_month = record.groupby("ID")["MONTHS_BALANCE"].min().rename("begin_month")

# Attach begin_month to the application data. The left join keeps every
# application row; IDs absent from the credit record get NaN for begin_month.
customer_data = data.merge(open_month, how="left", on="ID")

# Recode raw category labels: flag-like fields become 0/1, the remaining fields
# are collapsed into coarser groups. Series.map turns any label that is missing
# from its mapping into NaN.
yes_no_codes = {"N": 0, "Y": 1}
categorical_mappings = {
    "Gender": {"F": 0, "M": 1},
    "Own_Car": dict(yes_no_codes),
    "Own_Realty": dict(yes_no_codes),
    "Income_Type": {
        "Working": 1,
        "Commercial associate": 1,
        "State servant": 1,
        "Pensioner": 0,
        "Student": 0,
    },
    "Family_Status": {
        "Single / not married": "Single",
        "Separated": "Single",
        "Widow": "Single",
        "Civil marriage": "Married",
        "Married": "Married",
    },
    "Housing_Type": {
        'House / apartment': 'House / apartment',
        'With parents': 'With parents',
        'Municipal apartment': 'House / apartment',
        'Rented apartment': 'House / apartment',
        'Office apartment': 'House / apartment',
        'Co-op apartment': 'House / apartment',
    },
    "Education": {
        "Secondary / secondary special": "secondary",
        "Lower secondary": "secondary",
        "Higher education": "Higher education",
        "Incomplete higher": "Higher education",
        "Academic degree": "Academic degree",
    },
}

# Apply each mapping to its column, in the order the mappings are declared
for column in categorical_mappings:
    mapping = categorical_mappings[column]
    customer_data[column] = customer_data[column].map(mapping)

# Household size: the children plus two adults when Family_Status is "Married",
# otherwise one adult.
adult_count = customer_data["Family_Status"].eq("Married").astype(int) + 1
customer_data["Household_Size"] = customer_data["Children_Count"] + adult_count

# Birthday is divided by 365 and sign-flipped, then rounded to whole years.
customer_data["Age"] = (customer_data["Birthday"] / 365 * -1).round()

# Experience in whole years (truncated). Only negative Employment_Date values
# count as employment; every other value becomes 0.
employment_years = customer_data["Employment_Date"] / 365
customer_data["Experience"] = (
    (employment_years * -1).where(employment_years < 0, 0).astype(int)
)

# The raw inputs of the derived features are no longer needed
customer_data = customer_data.drop(columns=['Employment_Date', 'Birthday', 'Children_Count'])

# One-hot encode the remaining categorical columns into indicator columns
columns_to_encode = ['Income_Type', 'Education', 'Family_Status', 'Housing_Type']
customer_data = pd.get_dummies(customer_data, columns=columns_to_encode)

# Calculate z-scores for outlier identification
def calculate_z_scores(df, cols):
    """Add a ``<col>_z_score`` column to ``df`` for every name in ``cols``.

    Each z-score is ``(value - column mean) / column standard deviation``,
    using the pandas defaults for ``mean`` and ``std``.

    A column whose standard deviation is zero or undefined (constant values,
    or a single row) has no spread, so none of its values can be an outlier.
    Its z-scores are set to 0.0 instead of the NaN a plain division would
    give, which would make a later ``abs() <= threshold`` filter reject every
    row. Missing values in the source column stay NaN.

    Args:
        df: DataFrame holding the columns named in ``cols``. It is modified
            in place.
        cols: Iterable of column names to score.

    Returns:
        The same ``df`` object, with the new z-score columns added.
    """
    for col in cols:
        centered = df[col] - df[col].mean()
        spread = df[col].std()
        if pd.isna(spread) or spread == 0:
            # Multiplying by 0.0 keeps the index and any NaN entries intact.
            df[col + "_z_score"] = centered * 0.0
        else:
            df[col + "_z_score"] = centered / spread
    return df

# Score the numeric features that are screened for outliers
z_score_columns = ["Income", "Experience", "Household_Size"]
customer_data = calculate_z_scores(df=customer_data, cols=z_score_columns)

# Keep only rows whose z-score is within +/-3.5 for every screened feature.
# A NaN z-score fails the comparison, so such rows are dropped as well.
within_bounds = customer_data[[f"{col}_z_score" for col in z_score_columns]].abs().le(3.5)
outlier_filters = within_bounds.all(axis=1)
customer_data = customer_data[outlier_filters]

# Build the per-customer credit history: one row per ID, one column per
# MONTHS_BALANCE value, each cell holding that month's STATUS code.
grouped = record.groupby('ID')
monthly_status = record.pivot(index='ID', columns='MONTHS_BALANCE', values='STATUS')

# pivot_tb starts as a copy of the month columns and gains summary columns below.
# The status counts are taken from monthly_status rather than a fixed
# iloc[:, 0:61] slice of pivot_tb, so they cover every month present in the data
# and can never pick up the summary columns.
pivot_tb = monthly_status.copy()
pivot_tb['open_month'] = grouped['MONTHS_BALANCE'].min()
pivot_tb['end_month'] = grouped['MONTHS_BALANCE'].max()
pivot_tb['window'] = pivot_tb['end_month'] - pivot_tb['open_month'] + 1

# Per customer, count the months carrying each status code '0'..'5'
# (stored as pastdue_<code>) and the months carrying 'X' (stored as no_loan).
past_due_statuses = ['0', '1', '2', '3', '4', '5']
for status in past_due_statuses:
    pivot_tb[f'pastdue_{status}'] = monthly_status.eq(status).sum(axis=1)
pivot_tb['no_loan'] = monthly_status.eq('X').sum(axis=1)

# Credit-history summary columns that are joined onto the customer data
history_columns = ['open_month'] + [f'pastdue_{code}' for code in range(6)] + ['no_loan']
target = pivot_tb[history_columns]

# The inner join keeps only customers that also have a credit history
customer_apps = customer_data.merge(target, how='inner', on='ID')

# Binary label: 1 when any of pastdue_2 .. pastdue_5 is above zero, else 0
flagged_columns = ['pastdue_2', 'pastdue_3', 'pastdue_4', 'pastdue_5']
customer_apps['target'] = customer_apps[flagged_columns].gt(0).any(axis=1).astype(int)

# Model inputs
selected_features = [
    "begin_month",
    "Income",
    "Experience",
    "Family_Status_Single",
    "Family_Member_Count",
    "Own_Car",
    "Age",
    "Gender",
    "Own_Realty",
    "Household_Size",
    "Education_Higher education",
    "Education_secondary",
]

# Feature matrix and label vector used by the train/test split
X = customer_apps[selected_features]
y = customer_apps["target"]


In [29]:
# One seed for every stochastic step (split, SMOTE, forest) so that a fresh
# Restart & Run All reproduces the same numbers.
SEED = 100
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED, test_size=0.3)

# Fixing class imbalance using SMOTE (Synthetic Minority Over-sampling Technique).
# Only the training split is resampled; the test split keeps its original
# class distribution.
smote = SMOTE(random_state=SEED)
X_balance, Y_balance = smote.fit_resample(X_train, y_train)
X_balance = pd.DataFrame(X_balance, columns=X_train.columns)
Y_balance = pd.DataFrame(Y_balance, columns=["target"])

# Standardize features: the scaler is fitted on the balanced training data and
# the same fitted scaler is reused for the test data below.
scaler = StandardScaler()
scaler.fit(X_balance)
X_train = pd.DataFrame(scaler.transform(X_balance), columns=X_balance.columns)

# Random Forest classifier
rf_classifier = RandomForestClassifier(
    n_estimators=250, max_depth=12, min_samples_leaf=16, random_state=SEED
)

# Train on a 1-D label vector; passing the (n, 1) DataFrame makes scikit-learn
# emit a DataConversionWarning about a column-vector y.
rf_classifier.fit(X_train, Y_balance["target"])

# Make predictions on the test data
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
y_predict = rf_classifier.predict(X_test)

# Print results of predictions
# NOTE(review): average='weighted' weights each class by its support, so on an
# imbalanced test set these figures mostly describe the majority class.
# Consider also reporting recall / ROC AUC for the positive class.
print("Accuracy:", accuracy_score(y_test, y_predict))
print("Precision:", precision_score(y_test, y_predict, average='weighted'))
print("F1 Score:", f1_score(y_test, y_predict, average='weighted'))

Accuracy: 0.9406320119670905
Precision: 0.9717698208558625
F1 Score: 0.9554062516108087
