In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [None]:
# Load the raw loan dataset from the working directory and take a first look.
df = pd.read_csv('loan_data.csv')
print(df.head())
# DataFrame.info() writes its report (dtypes, non-null counts) to stdout
# itself and returns None, so it must not be wrapped in print() -- doing so
# appends a stray "None" line to the cell output.
df.info()


In [None]:
# Count the missing entries in every column (isna is the same check as isnull).
print(df.isna().sum())


In [None]:
# Numeric columns: fill missing values with the column mean.
# NOTE(review): the imputer is fit on the full frame, before the train/test
# split performed further down, so rows that end up in the test set
# contribute to the fill values. Confirm this is acceptable.
numeric_features = ['LoanAmount', 'Income', 'CreditScore']
numeric_imputer = SimpleImputer(strategy='mean')
numeric_imputer.fit(df[numeric_features])
df[numeric_features] = numeric_imputer.transform(df[numeric_features])


In [None]:
# Categorical columns: fill missing values with each column's most frequent
# value.
# NOTE(review): as with the numeric imputer, this is fit on the full frame
# before the train/test split performed further down.
categorical_features = ['EmploymentStatus', 'LoanPurpose']
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_imputer.fit(df[categorical_features])
df[categorical_features] = categorical_imputer.transform(df[categorical_features])


In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each feature, so k categories become k-1 indicator columns.
df = pd.get_dummies(
    data=df,
    columns=categorical_features,
    drop_first=True,
)


In [None]:
# Drop rows that fall outside the 1.5 * IQR fences, one numeric column at a
# time. The frame is narrowed inside the loop, so each column's quartiles are
# computed on the rows that survived the previous columns' filters.
for col in numeric_features:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # The mask is deliberately not called `filter`: at notebook top level that
    # name would shadow the builtin filter() for every later cell.
    # Series.between is inclusive on both ends by default, matching
    # `>= lower_bound` and `<= upper_bound`.
    in_range = df[col].between(lower_bound, upper_bound)
    df = df.loc[in_range]


In [None]:
# Separate the target column ('Default') from the predictor columns.
y = df['Default']
X = df.drop(columns='Default')


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print(f'Train size: {X_train.shape}, Test size: {X_test.shape}')


In [None]:
# Standardise the numeric features. The scaler is fit on the training split
# only and then applied unchanged to the test split, so no test-set statistics
# leak into the scaling parameters.
scaler = StandardScaler()
# Take explicit copies before assigning columns: the frames returned by
# train_test_split are selections of X, and writing into them directly can
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])


In [None]:
# Preview the first rows of the processed training features and their labels.
print(X_train.head(), y_train.head(), sep='\n')


In [None]:
# Re-attach the target to the processed training features (aligned on index)
# and write the result to CSV without the index column.
# NOTE(review): only the training split is written here; the test split is
# not saved by this cell.
train_data = X_train.assign(Default=y_train)
train_data.to_csv('loan_data_preprocessed_train.csv', index=False)
