In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# You may need to install this library: pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

# Where the raw data lives and which column holds the label to predict.
FILE_PATH = r'Loan_Default.csv'
TARGET_COLUMN = 'Status'

# Read the CSV and, in the same chained expression, discard the 'ID' and
# 'year' columns, which this notebook treats as identifiers rather than
# predictive features.
df = pd.read_csv(FILE_PATH).drop(columns=['ID', 'year'])

print("✅ Dataset loaded and identifier columns dropped.")
print(f"Shape of the dataset: {df.shape}")

✅ Dataset loaded and identifier columns dropped.
Shape of the dataset: (148670, 32)


Exploring the Data

In [None]:
# Exploring the Data
# A quick look at a few sample rows, the per-column dtypes and non-null
# counts, and how the rows are distributed across the target classes.

print("\nFirst 5 rows:")
print(df.head())

print("\n\nData types and missing values:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly rather than wrapped in print().
df.info()

print(f"\n\nClass distribution for '{TARGET_COLUMN}':")
print(df[TARGET_COLUMN].value_counts())

Features/Target and Split

In [5]:
# Pull the label column out as y; every remaining column is a predictor.
y = df[TARGET_COLUMN]
X = df.drop(columns=TARGET_COLUMN)

# Reserve 20% of the rows for testing. Stratifying on y keeps the class
# proportions equal across the two partitions, and the fixed random_state
# makes the split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(
    f"Training set shape: {X_train.shape}",
    f"Testing set shape: {X_test.shape}",
    sep="\n",
)


Training set shape: (118936, 31)
Testing set shape: (29734, 31)


Preprocessing Pipeline

In [6]:
# Partition the feature columns by dtype: anything numeric goes down the
# numeric branch, everything else is handled as categorical.
numeric_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(exclude=np.number).columns)

print(f"\nFound {len(numeric_features)} numerical features.")
print(f"Found {len(categorical_features)} categorical features.")

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' means a category that was not
# seen during fit does not raise at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch; the outputs are
# concatenated in the order listed here (numeric first, then categorical).
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

print("\n✅ Preprocessing pipeline created successfully.")


Found 10 numerical features.
Found 21 categorical features.

✅ Preprocessing pipeline created successfully.


Preprocessing the Data

In [7]:
# Learn the imputation values, scaling statistics and category vocabularies
# from the training split only, encoding it in the same pass.
X_train_processed = preprocessor.fit_transform(X_train)

# The held-out split is encoded with those already-fitted statistics and is
# never used for fitting, so nothing about the test rows leaks into training.
X_test_processed = preprocessor.transform(X_test)

print(
    "\nData preprocessing complete.",
    f"Shape of processed training data: {X_train_processed.shape}",
    sep="\n",
)


Data preprocessing complete.
Shape of processed training data: (118936, 69)


Addressing Class Imbalance

In [8]:
# Class counts in the training split before any resampling.
print(
    "\nClass distribution in TRAINING data BEFORE SMOTE:",
    y_train.value_counts(),
    sep="\n",
)

# Oversample the minority class on the training split only; the test split is
# left at its natural class balance. The seed keeps the synthetic rows
# reproducible.
# NOTE(review): X_train_processed includes one-hot encoded columns, and plain
# SMOTE interpolates between neighbours, so synthetic rows can carry
# fractional values in those columns. Consider whether SMOTENC (applied before
# one-hot encoding) is a better fit for this data -- confirm before modelling.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

# Class counts after resampling.
print(
    "\nClass distribution in TRAINING data AFTER SMOTE:",
    pd.Series(y_train_resampled).value_counts(),
    sep="\n",
)

print("\n✅ Your data is now fully preprocessed and ready for model training.")


Class distribution in TRAINING data BEFORE SMOTE:
Status
0    89625
1    29311
Name: count, dtype: int64

Class distribution in TRAINING data AFTER SMOTE:
Status
0    89625
1    89625
Name: count, dtype: int64

✅ Your data is now fully preprocessed and ready for model training.
