In [None]:
pip install pandas numpy seaborn scikit-learn imbalanced-learn matplotlib


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE


In [None]:
# Read the loan data CSV into a DataFrame
df = pd.read_csv("loan_data.csv")

# Preview the first five rows
print(df.head(n=5))



In [None]:
# Encode the target variable ("Loan Status") as integer class codes.
# NOTE: LabelEncoder numbers classes in sorted order of their labels, so the
# codes follow the alphabetical order of the status strings (for the labels
# "Approved"/"Rejected" that means Approved → 0, Rejected → 1), not a
# hand-picked positive class. The fitted encoder is kept so the codes can be
# translated back with label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
df["Loan Status"] = label_encoder.fit_transform(df["Loan Status"])

# Show the mapping actually applied so plot legends and class counts
# further down can be read unambiguously.
print(
    "Loan Status encoding:",
    {label: code for code, label in enumerate(label_encoder.classes_)},
)

# One-hot encode Employment Type (drop_first=True to avoid dummy trap)
df = pd.get_dummies(df, columns=["Employment Type"], drop_first=True)


In [None]:
# Separate the target from the predictors; the applicant identifier is
# dropped together with the target column.
y = df["Loan Status"]
X = df.drop(["Loan Status", "Applicant ID"], axis=1)

# Keep the 3 features that score highest under f_classif against the target
selector = SelectKBest(f_classif, k=3)
X_new = selector.fit(X, y).transform(X)

# Report the dimensions of the reduced feature matrix
print("Shape after SelectKBest:", X_new.shape)


In [None]:
# Standardize the selected features before PCA. PCA picks the directions of
# largest variance, so on unscaled inputs the components are driven by the
# features measured on the largest numeric scale rather than by structure
# shared across features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new)

# Reduce dimensions to 2 using PCA on the standardized features
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

print("Shape after PCA:", X_pca.shape)
# Fraction of the (standardized) variance each component retains
print("Explained variance ratio:", pca.explained_variance_ratio_)


In [None]:
# Class balance prior to oversampling
counts_before = y.value_counts()
print("Before SMOTE:\n", counts_before)

# Oversample with SMOTE; the fixed random_state keeps the synthetic
# samples the same from run to run.
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X_pca, y)

# Class balance once SMOTE has been applied
counts_after = y_resampled.value_counts()
print("After SMOTE:\n", counts_after)


In [None]:
# Scatter the resampled points in principal-component space, colored by class
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=X_resampled[:, 0],
    y=X_resampled[:, 1],
    hue=y_resampled,
    palette="Set2",
    ax=ax,
)
ax.set_title("PCA of Loan Application Data (After SMOTE)")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
plt.show()
