In [None]:
pip install pandas numpy seaborn matplotlib scikit-learn imbalanced-learn


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE


In [None]:
# Load the loan dataset; the path is resolved relative to the working directory.
df = pd.read_csv("loan_default_data.csv")

# Preview the first rows.
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()


In [None]:
# Name of the label column (values are Yes/No)
target = "Loan Default"

# Partition the columns by dtype: 64-bit numeric columns on one side,
# object-typed (text) columns on the other.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.to_list()
categorical_cols = df.select_dtypes(include='object').columns.to_list()

# Report both groups so the split can be checked by eye
print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)


In [None]:
# Impute missing values column by column. Each result is assigned back to the
# frame: `df[col].fillna(..., inplace=True)` acts on an intermediate Series,
# which raises a chained-assignment warning in pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is active.

# Fill numerical columns with median
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical columns with mode
# NOTE(review): if the target is an object column it is in categorical_cols
# and gets imputed here too — confirm that imputing labels is intended.
for col in categorical_cols:
    modes = df[col].mode()
    # mode() is empty when every value in the column is missing; there is
    # nothing to impute with, so such a column is left untouched.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])


In [None]:
# One encoder object is refit for each column in turn; after this cell it is
# left fitted on the target column.
label_encoder = LabelEncoder()

# Two-valued categorical features (the target excluded) become integer codes
binary_cols = []
for col in categorical_cols:
    if df[col].nunique() == 2 and col != target:
        binary_cols.append(col)

for col in binary_cols:
    encoded = label_encoder.fit_transform(df[col])
    df[col] = encoded

# Every remaining categorical feature is one-hot encoded; the first level of
# each is dropped (drop_first=True).
already_handled = binary_cols + [target]
multi_class_cols = [col for col in categorical_cols if col not in already_handled]
df = pd.get_dummies(df, columns=multi_class_cols, drop_first=True)

# Finally encode the label itself.
# LabelEncoder assigns codes in sorted class order, so "No" → 0 and "Yes" → 1
# (assumes the column holds exactly those two strings — TODO confirm).
df[target] = label_encoder.fit_transform(df[target])


In [None]:
# Label vector and predictor matrix (every column except the label)
y = df[target]
X = df.drop(columns=[target])

# Univariate selection: keep the 8 features scored highest by the ANOVA F-test
selector = SelectKBest(f_classif, k=8)
X_kbest = selector.fit_transform(X, y)

print("Shape after SelectKBest:", X_kbest.shape)


In [None]:
# Recursive feature elimination with a seeded decision tree as the ranking
# estimator, reducing the feature set to 8 columns.
rfe_model = DecisionTreeClassifier(random_state=42)
rfe_selector = RFE(rfe_model, n_features_to_select=8).fit(X, y)

# The boolean support mask picks out the names of the retained columns
selected_features = X.columns[rfe_selector.support_]
print("RFE Selected Features:", selected_features.tolist())


In [None]:
# Standardize the features before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project onto the first two principal components. random_state is fixed so
# the projection is reproducible when scikit-learn picks its randomized SVD
# solver; 42 matches the seed used by the other estimators in this notebook.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# PCA Visualization: one point per row, colored by the encoded target
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='coolwarm')
plt.title("PCA: Loan Default Data")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.show()


In [None]:
# Supervised projection of the standardized features onto a single
# discriminant axis.
lda = LDA(n_components=1)
X_lda = lda.fit_transform(X_scaled, y)

# LDA Visualization: distribution of the discriminant scores per class.
# The scores must be passed as `x=`: given positionally they are treated as
# wide-form `data`, and seaborn raises ValueError when `hue` is combined with
# wide-form input.
plt.figure(figsize=(8, 4))
sns.histplot(x=X_lda.ravel(), hue=y, bins=30, kde=True, element='step', palette='coolwarm')
plt.title("LDA: Loan Default Separation")
plt.xlabel("LDA Component")
plt.show()


In [None]:
# Class distribution prior to oversampling
counts_before = y.value_counts()
print("Before SMOTE:\n", counts_before)

# Oversample with SMOTE on the standardized features, using a fixed seed
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Class distribution after oversampling
counts_after = pd.Series(y_resampled).value_counts()
print("After SMOTE:\n", counts_after)
