In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
df=pd.read_csv("training_set.csv")
df['Loan_ID'] = df['Loan_ID'].str.replace("LP", "", regex=True).astype(int)
df_encoded = pd.get_dummies(df['property_Area'], prefix='property_Area')
df = pd.concat([df, df_encoded], axis=1)
df.rename(columns={'property_Area_Urban': 'Urban', 'property_Area_Rural': 'Rural', 'property_Area_Semiurban': 'Semiurban'}, inplace=True)
df_encoded_gender = pd.get_dummies(df['Gender'], prefix='Gender')
df = pd.concat([df, df_encoded_gender], axis=1)
df.rename(columns={'Gender_Male': 'Male', 'Gender_Female': 'Female'}, inplace=True)
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                       'property_Area', 'Loan_Status','Rural','Urban','Semiurban','Female','Male']

# Convert to numerical
for col in categorical_columns:
    df[col] = df[col].astype('category').cat.codes
df.replace(-1, np.nan, inplace=True)

for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)
df.isna().sum()

num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['Loan_Income_Ratio'] = df['LoanAmount'] / df['Total_Income']
df['EMI'] = df['LoanAmount'] / df['Loan_Amount_Term']
df['Balance_Income'] = df['Total_Income'] - df['EMI']

numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Total_Income', 'Loan_Income_Ratio', 'EMI', 'Balance_Income']


from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
vif_features = df[numerical_features].dropna()

# Standardize features for VIF
scaler = StandardScaler()
vif_scaled = scaler.fit_transform(vif_features)

# Calculating VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = vif_features.columns
vif_data["VIF"] = [variance_inflation_factor(vif_scaled, i) for i in range(vif_scaled.shape[1])]


df.drop(columns=['Total_Income', 'CoapplicantIncome', 'Balance_Income', 'EMI'], inplace=True)
df.drop(columns=['property_Area', 'Gender'], inplace=True)
corr2 = df.corr()

target_corr2 = corr2['Loan_Status'].dropna().sort_values(ascending=False)
target_corr2 = target_corr2[abs(target_corr2) > 0.03].sort_values(key=abs, ascending=False)

selected_features = [
    'Credit_History', 'Semiurban', 'Rural', 'Married',
    'Education', 'Loan_Income_Ratio', 'Urban', 'LoanAmount'
]

x = df[selected_features]
y = df['Loan_Status']

x
y

from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

x_train
x_val
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Scale the features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

# KNN classifier
knn = KNeighborsClassifier(n_neighbors=11)  # You can tune n_neighbors
knn.fit(x_train_scaled, y_train)

# Predictions
y_pred = knn.predict(x_val_scaled)

# Evaluation
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

dt = pd.read_csv("testing_set.csv")

dt['Loan_ID'] = dt['Loan_ID'].str.replace("LP", "", regex=True).astype(int)

dt_encoded = pd.get_dummies(dt['property_Area'], prefix='property_Area')
dt = pd.concat([dt, dt_encoded], axis=1)
dt.rename(columns={'property_Area_Urban': 'Urban', 'property_Area_Rural': 'Rural', 'property_Area_Semiurban': 'Semiurban'}, inplace=True)

dt_encoded_gender = pd.get_dummies(dt['Gender'], prefix='Gender')
dt = pd.concat([dt, dt_encoded_gender], axis=1)
dt.rename(columns={'Gender_Male': 'Male', 'Gender_Female': 'Female'}, inplace=True)

categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                       'property_Area','Rural','Urban','Semiurban','Female','Male']

# Convert to numerical
for col in categorical_columns:
    dt[col] = dt[col].astype('category').cat.codes
dt.replace(-1, np.nan, inplace=True)

for col in categorical_columns:
    dt[col].fillna(dt[col].mode()[0], inplace=True)

dt.isna().sum()

num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
for col in num_cols:
    dt[col].fillna(dt[col].median(), inplace=True)

dt['Credit_History'].fillna(dt['Credit_History'].mode()[0], inplace=True)

dt['Total_Income'] = dt['ApplicantIncome'] + dt['CoapplicantIncome']
dt['Loan_Income_Ratio'] = dt['LoanAmount'] / dt['Total_Income']
dt['EMI'] = dt['LoanAmount'] / dt['Loan_Amount_Term']
dt['Balance_Income'] = dt['Total_Income'] - dt['EMI']

numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Total_Income', 'Loan_Income_Ratio', 'EMI', 'Balance_Income']

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

vif_features = dt[numerical_features].dropna()

# Standardize features for VIF
scaler = StandardScaler()
vif_scaled = scaler.fit_transform(vif_features)

# Calculating VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = vif_features.columns
vif_data["VIF"] = [variance_inflation_factor(vif_scaled, i) for i in range(vif_scaled.shape[1])]

dt.drop(columns=['Total_Income', 'CoapplicantIncome', 'Balance_Income', 'EMI'], inplace=True)
dt.drop(columns=['property_Area', 'Gender'], inplace=True)



selected_features = [
    'Credit_History', 'Semiurban', 'Rural', 'Married',
    'Education', 'Loan_Income_Ratio', 'Urban', 'LoanAmount'
]

x_test = dt[selected_features]


x_test


scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)

# Train KNN
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(x_train_scaled, y_train)

# Scale test set using the same scaler
x_test_scaled = scaler.transform(x_test)

# Predict
y_pred_test = knn.predict(x_test_scaled)

y_pred_test


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

Accuracy: 0.7886178861788617

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123


Confusion Matrix:
 [[18 25]
 [ 1 79]]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dt[col].fillna(dt[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dt[col].fillna(dt[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,

In [2]:
df_full_encoded=df.copy()

In [3]:
# Filter ineligible customers
ineligible_customers = df_full_encoded[df['Loan_Status'] == 0]

# Features: All except target
X_amount = ineligible_customers.drop(['LoanAmount'], axis=1)
y_amount = ineligible_customers['LoanAmount']

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_amount, y_amount, test_size=0.2, random_state=42)

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Model
xgb_reg_amount = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_reg_amount.fit(X_train, y_train)

# Predict & Evaluate
y_pred_xgb = xgb_reg_amount.predict(X_val)

mae_xgb = mean_absolute_error(y_val, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_pred_xgb))

print("Task 2 - Loan Amount Prediction with XGBoost:")
print("MAE:", mae_xgb)
print("RMSE:", rmse_xgb)



Task 2 - Loan Amount Prediction with XGBoost:
MAE: 43.30047607421875
RMSE: 66.00922301615294


In [4]:
y_pred_xgb

array([120.826775, 241.00195 , 267.56683 , 181.64255 , 142.06465 ,
       136.71585 , 129.56877 , 186.77306 ,  81.25311 , 110.00911 ,
       128.78445 , 166.46582 , 158.37274 , 132.37035 , 152.5786  ,
        84.87027 ,  48.68183 , 226.66019 , 112.58412 , 145.75159 ,
       114.75331 , 164.9535  , 111.393585, 279.78995 , 141.37137 ,
       310.23553 , 165.04703 ,  82.1329  ,  48.30867 , 154.0256  ,
       142.0559  ,  92.8568  , 161.06966 ,  72.148735, 167.11578 ,
       144.2993  , 142.47104 , 136.60638 , 305.08145 ], dtype=float32)

In [5]:
test=dt.copy()
test['Loan_Status'] = y_pred_test
test

Unnamed: 0,Loan_ID,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Rural,Semiurban,Urban,Female,Male,Loan_Income_Ratio,Loan_Status
0,1015,1,0.0,0,0.0,5720,110.0,360.0,1.0,0,0,1,0,1,0.019231,1
1,1022,1,1.0,0,0.0,3076,126.0,360.0,1.0,0,0,1,0,1,0.027535,1
2,1031,1,2.0,0,0.0,5000,208.0,360.0,1.0,0,0,1,0,1,0.030588,1
3,1035,1,2.0,0,0.0,2340,100.0,360.0,1.0,0,0,1,0,1,0.020467,1
4,1051,0,0.0,1,0.0,3276,78.0,360.0,1.0,0,0,1,0,1,0.023810,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,2971,1,3.0,1,1.0,4009,113.0,360.0,1.0,0,0,1,0,1,0.019530,1
363,2975,1,0.0,0,0.0,4158,115.0,360.0,1.0,0,0,1,0,1,0.023629,1
364,2980,0,0.0,0,0.0,3250,126.0,360.0,1.0,0,1,0,0,1,0.024032,1
365,2986,1,0.0,0,0.0,5000,158.0,360.0,1.0,1,0,0,0,1,0.021372,1


In [6]:
# Select only ineligible customers (Loan_Status == 0) in test set
ineligible_test = test[test['Loan_Status'] == 0]

# Drop target column to get features only
test_x = ineligible_test.drop(['LoanAmount'], axis=1)

# OPTIONAL: If you have true LoanAmount in test set for evaluation
test_y = ineligible_test['LoanAmount']

# Reorder columns to match training
test_x = test_x.reindex(columns=X_train.columns, fill_value=0)


In [8]:
xgb_reg_amount = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_reg_amount.fit(X_train, y_train)

# Predict & Evaluate
y_pred_xgb_test = xgb_reg_amount.predict(test_x)

mae_xgb = mean_absolute_error(test_y, y_pred_xgb_test)
rmse_xgb = np.sqrt(mean_squared_error(test_y, y_pred_xgb_test))

print("Task 2 - Loan Amount Prediction with XGBoost:")
print("MAE:", mae_xgb)
print("RMSE:", rmse_xgb)

Task 2 - Loan Amount Prediction with XGBoost:
MAE: 35.309359804789224
RMSE: 46.03730053436203


In [9]:
y_pred_xgb_test

array([179.61354 , 260.48126 , 140.18076 , 157.2067  , 177.5336  ,
       186.62112 , 138.0173  , 180.7237  , 195.40643 , 173.08379 ,
       235.2408  , 235.01454 , 194.22237 ,  84.30351 , 142.00818 ,
       128.13545 , 138.15723 , 207.43579 ,  71.066895, 144.83199 ,
        85.345055, 278.67834 , 116.37639 , 147.66968 , 141.31396 ,
       201.41992 , 122.22697 , 105.725464, 132.99666 , 138.91396 ,
       130.30092 , 127.86961 , 105.49923 , 153.78502 , 133.87901 ,
       130.87659 , 142.71733 , 145.45018 , 161.72691 , 125.43298 ,
       127.9738  ,  32.15314 ,  93.458115, 118.68416 , 128.22604 ,
       155.27737 , 155.61592 , 116.335945, 156.83723 , 175.17606 ,
        97.17039 , 130.94266 , 104.95014 , 167.35785 ,  69.68412 ,
       116.13285 ,  78.93451 , 115.69979 , 136.55286 , 143.87473 ],
      dtype=float32)

In [10]:
# Filter customers who are ineligible AND have requested <= 20 years duration (i.e., <= 240 months)
ineligible_duration_customers = df_full_encoded[
    (df['Loan_Status'] == 0) & (df['Loan_Amount_Term'] <= 240)
]

# Target: Loan_Amount_Term (predict the minimum duration needed)
y_duration = ineligible_duration_customers['Loan_Amount_Term']

# Features: All except 'Loan_Amount_Term' (and ideally, also 'Loan_Status' since it’s your label)
X_duration = ineligible_duration_customers.drop(['Loan_Amount_Term'], axis=1)

# Train-test split
X_train_dur, X_val_dur, y_train_dur, y_val_dur = train_test_split(X_duration, y_duration, test_size=0.2, random_state=42)



In [11]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Fit regression model
xgb_reg_duration = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_reg_duration.fit(X_train_dur, y_train_dur)

# Predict
y_pred_dur_xgb = xgb_reg_duration.predict(X_val_dur)

# Evaluate
mae_dur_xgb = mean_absolute_error(y_val_dur, y_pred_dur_xgb)
rmse_dur_xgb = np.sqrt(mean_squared_error(y_val_dur, y_pred_dur_xgb))

print("\nTask 3 - Loan Duration Adjustment with XGBoost:")
print("MAE:", mae_dur_xgb)
print("RMSE:", rmse_dur_xgb)



Task 3 - Loan Duration Adjustment with XGBoost:
MAE: 68.16198921203613
RMSE: 73.06136576635082


In [12]:
ineligible_duration_customers.head()

Unnamed: 0,Loan_ID,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Rural,Semiurban,Urban,Female,Male,Loan_Income_Ratio
62,1207,1.0,0.0,1.0,1.0,2609.0,165.0,180.0,0.0,0,1,0,0,0,1,0.027237
66,1228,0.0,0.0,1.0,0.0,3200.0,126.0,180.0,0.0,0,0,0,1,0,1,0.023102
128,1451,1.0,1.0,0.0,1.0,10513.0,160.0,180.0,0.0,0,0,0,1,0,1,0.01114
172,1586,1.0,3.0,1.0,0.0,3522.0,81.0,180.0,1.0,0,1,0,0,0,1,0.022998
202,1682,1.0,3.0,1.0,0.0,3992.0,128.0,180.0,1.0,0,0,0,1,0,1,0.032064


In [14]:
# Filter test set with same condition: ineligible + requested duration <= 20 years
ineligible_duration_test = test[
    (test['Loan_Status'] == 0) & (test['Loan_Amount_Term'] <= 240)
]

test_x_dur = ineligible_duration_test.drop(['Loan_Amount_Term'], axis=1)
test_y_dur = ineligible_duration_test['Loan_Amount_Term']

# Match column order
test_x_dur = test_x_dur.reindex(columns=X_train_dur.columns, fill_value=0)

# Predict and evaluate
y_pred_dur_test = xgb_reg_duration.predict(test_x_dur)
mae_dur_test = mean_absolute_error(test_y_dur, y_pred_dur_test)
rmse_dur_test = np.sqrt(mean_squared_error(test_y_dur, y_pred_dur_test))
r2_dur_test = r2_score(test_y_dur, y_pred_dur_test)

print("\nTask 3 (Test) - Loan Duration Adjustment:")
print("MAE:", mae_dur_test)
print("RMSE:", rmse_dur_test)
# print("R² Score:", r2_dur_test)



Task 3 (Test) - Loan Duration Adjustment:
MAE: 74.27544555664062
RMSE: 81.12561118302827
