In [None]:
# Bucket the debt-to-income ratio into five ordinal levels (0-4).
# pd.cut uses right-inclusive bins by default: (-inf, 0.2], (0.2, 0.4], ..., (0.8, inf).
dti_bin_edges = [-np.inf, 0.2, 0.4, 0.6, 0.8, np.inf]
dti_level_codes = [0, 1, 2, 3, 4]
dti_levels = pd.cut(df['DTIRatio'], bins=dti_bin_edges, labels=dti_level_codes)
# Cast the categorical labels to plain integers.
# NOTE: a missing DTIRatio produces a NaN label, which this integer cast cannot represent.
df['DTI_Level'] = dti_levels.astype(int)

In [None]:
# Income relative to loan size. The small epsilon keeps the denominator non-zero
# when LoanAmount is exactly 0.
loan_amount_safe = df['LoanAmount'] + 1e-5
df['IncomeToLoanRatio'] = df['Income'].div(loan_amount_safe)

In [None]:
# LoanPurpose code -> group code. Every code currently maps to itself, so
# LoanPurposeGroup ends up holding the same values as LoanPurpose.
# NOTE(review): the category names in the trailing comments are carried over as-is;
# confirm them against whatever encoded LoanPurpose.
loan_purpose_groups = {
    0: 0,  # "Auto"
    1: 1,  # "Business"
    2: 2,  # "Education"
    3: 3,  # "Home"
    4: 4,  # "Other"
}
df['LoanPurposeGroup'] = df['LoanPurpose'].replace(loan_purpose_groups)

In [None]:
# Pairwise interaction features: element-wise product of two existing columns.
df['Age_Income'] = df['Age'].mul(df['Income'])
df['CreditScore_Interest'] = df['CreditScore'].mul(df['InterestRate'])

In [None]:
from sklearn.model_selection import cross_val_score
# Optional sanity check: 5-fold cross-validated F1 of a random forest trained on
# every column except the target. Kept behind an explicit flag (off by default)
# rather than inside a string literal, so the notebook does not echo the code as
# cell output and the code stays syntax-checked.
RUN_FEATURE_CV = False

if RUN_FEATURE_CV:
    # Imported here because the classifier is only needed when the check is enabled.
    from sklearn.ensemble import RandomForestClassifier

    # NOTE(review): no random_state is set, so the score varies from run to run.
    model = RandomForestClassifier()
    scores = cross_val_score(model, df.drop("Default", axis=1), df["Default"], cv=5, scoring='f1')
    print("F1 Score with engineered features:", scores.mean())

In [None]:
import matplotlib.ticker as mtick

In [None]:
# Class balance of the target: number of rows for each value of Default.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=df, x='Default', palette='Set2', ax=ax)
ax.set_title("Default Distribution")
# Replace the raw 0/1 tick labels with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Non-Default (0)', 'Default (1)'])
ax.set_ylabel("Count")
plt.show()

In [None]:
# Overall default rate: the mean of the Default column, printed as a percentage.
# (Presumably Default is a 0/1 flag, so the mean is the share of defaulted rows — verify.)
default_flags = df['Default']
default_rate = default_flags.mean()
print(f" Overall Default Rate: {default_rate:.2%}")

In [None]:
# Numeric columns whose distributions are compared between the two Default classes.
num_features = [
    'Age',
    'Income',
    'LoanAmount',
    'CreditScore',
    'InterestRate',
    'MonthsEmployed',
    'DTIRatio',
]

# One KDE figure per feature. common_norm=False normalises each Default class on its own,
# so the two curves stay comparable in shape even when the classes differ in size.
for feature in num_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.kdeplot(
        data=df,
        x=feature,
        hue='Default',
        fill=True,
        common_norm=False,
        palette='coolwarm',
        ax=ax,
    )
    ax.set_title(f'Distribution of {feature} by Default Status')
    plt.show()

In [None]:
# Box plots of selected numeric features split by Default, to eyeball outliers
# and shifts in spread between the two classes.
boxplot_features = ['Income', 'LoanAmount', 'CreditScore', 'InterestRate']

for feature in boxplot_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df, x='Default', y=feature, palette='Set2', ax=ax)
    ax.set_title(f'{feature} vs Default')
    plt.show()

In [None]:
# Categorical columns for which the per-category default rate is plotted.
cat_features = [
    'Education',
    'EmploymentType',
    'MaritalStatus',
    'HasMortgage',
    'HasDependents',
    'LoanPurpose',
    'HasCoSigner',
]

for col in cat_features:
    # Mean of Default within each category of `col`, as a two-column frame.
    default_rates = df.groupby(col)['Default'].mean().reset_index()

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(x=col, y='Default', data=default_rates, palette='Blues_d', ax=ax)
    ax.set_title(f'Default Rate by {col}')
    plt.setp(ax.get_xticklabels(), rotation=45)
    # The rates are fractions of 1, so xmax=1.0 renders e.g. 0.12 as "12%".
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    plt.show()

In [None]:
# Correlation Matrix
# Pairwise Pearson correlations (the DataFrame.corr default) shown as an annotated heatmap.
# Only numeric/bool columns are passed in: with pandas >= 2.0, DataFrame.corr() raises on
# any non-numeric column, whereas older versions silently dropped such columns.
# Selecting the columns explicitly behaves the same on every version.
numeric_df = df.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(12, 10))
corr_matrix = numeric_df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='magma', fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Compute the correlation matrix over numeric/bool columns only.
# With pandas >= 2.0, DataFrame.corr() raises on any non-numeric column; selecting the
# columns explicitly gives the same result on every pandas version.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Focus on correlation with 'Default'
default_corr = correlation_matrix['Default'].drop('Default')  # remove self-correlation

# Top 15 most correlated features, ranked by absolute value so that strong negative
# correlations rank alongside strong positive ones.
top_15_features = default_corr.abs().sort_values(ascending=False).head(15)

# Look the same features up again in the signed series to show the direction.
top_15_signed = default_corr.loc[top_15_features.index]

print("Top 15 Features Most Correlated with Default:\n")
print(top_15_signed)