In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df=pd.read_csv("training_set.csv")
df['Loan_ID'] = df['Loan_ID'].str.replace("LP", "", regex=True).astype(int)
df_encoded = pd.get_dummies(df['property_Area'], prefix='property_Area')
df = pd.concat([df, df_encoded], axis=1)
df.rename(columns={'property_Area_Urban': 'Urban', 'property_Area_Rural': 'Rural', 'property_Area_Semiurban': 'Semiurban'}, inplace=True)
df_encoded_gender = pd.get_dummies(df['Gender'], prefix='Gender')
df = pd.concat([df, df_encoded_gender], axis=1)
df.rename(columns={'Gender_Male': 'Male', 'Gender_Female': 'Female'}, inplace=True)
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                       'property_Area', 'Loan_Status','Rural','Urban','Semiurban','Female','Male']

# Convert to numerical
for col in categorical_columns:
    df[col] = df[col].astype('category').cat.codes
df.replace(-1, np.nan, inplace=True)

for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)
df.isna().sum()

num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['Loan_Income_Ratio'] = df['LoanAmount'] / df['Total_Income']
df['EMI'] = df['LoanAmount'] / df['Loan_Amount_Term']
df['Balance_Income'] = df['Total_Income'] - df['EMI']

numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Total_Income', 'Loan_Income_Ratio', 'EMI', 'Balance_Income']


from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
vif_features = df[numerical_features].dropna()

# Standardize features for VIF
scaler = StandardScaler()
vif_scaled = scaler.fit_transform(vif_features)

# Calculating VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = vif_features.columns
vif_data["VIF"] = [variance_inflation_factor(vif_scaled, i) for i in range(vif_scaled.shape[1])]


df.drop(columns=['Total_Income', 'CoapplicantIncome', 'Balance_Income', 'EMI'], inplace=True)
df.drop(columns=['property_Area', 'Gender'], inplace=True)
corr2 = df.corr()

target_corr2 = corr2['Loan_Status'].dropna().sort_values(ascending=False)
target_corr2 = target_corr2[abs(target_corr2) > 0.03].sort_values(key=abs, ascending=False)

selected_features = [
    'Credit_History', 'Semiurban', 'Rural', 'Married',
    'Education', 'Loan_Income_Ratio', 'Urban', 'LoanAmount'
]

x = df[selected_features]
y = df['Loan_Status']

x
y


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int8

In [43]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)


In [44]:
x_train

Unnamed: 0,Credit_History,Semiurban,Rural,Married,Education,Loan_Income_Ratio,Urban,LoanAmount
83,1.0,1,0,1.0,0.0,0.032121,0,265.0
90,1.0,1,0,1.0,0.0,0.022363,0,131.0
227,1.0,1,0,1.0,0.0,0.026432,0,210.0
482,1.0,1,0,1.0,0.0,0.024460,0,128.0
464,0.0,1,0,0.0,0.0,0.023524,0,98.0
...,...,...,...,...,...,...,...,...
71,1.0,1,0,1.0,1.0,0.025867,0,97.0
106,1.0,0,0,1.0,0.0,0.017938,1,225.0
270,1.0,0,0,0.0,0.0,0.009268,1,30.0
435,1.0,1,0,1.0,0.0,0.012740,0,128.0


In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Scale the data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=90, random_state=42)
rf.fit(x_train_scaled, y_train)

# Predict
y_pred = rf.predict(x_val_scaled)

# Evaluation
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))


Accuracy: 0.7560975609756098

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.53      0.61        43
           1       0.78      0.88      0.82        80

    accuracy                           0.76       123
   macro avg       0.74      0.70      0.71       123
weighted avg       0.75      0.76      0.75       123


Confusion Matrix:
 [[23 20]
 [10 70]]


In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

dt = pd.read_csv("testing_set.csv")

dt['Loan_ID'] = dt['Loan_ID'].str.replace("LP", "", regex=True).astype(int)

dt_encoded = pd.get_dummies(dt['property_Area'], prefix='property_Area')
dt = pd.concat([dt, dt_encoded], axis=1)
dt.rename(columns={'property_Area_Urban': 'Urban', 'property_Area_Rural': 'Rural', 'property_Area_Semiurban': 'Semiurban'}, inplace=True)

dt_encoded_gender = pd.get_dummies(dt['Gender'], prefix='Gender')
dt = pd.concat([dt, dt_encoded_gender], axis=1)
dt.rename(columns={'Gender_Male': 'Male', 'Gender_Female': 'Female'}, inplace=True)

categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                       'property_Area','Rural','Urban','Semiurban','Female','Male']

# Convert to numerical
for col in categorical_columns:
    dt[col] = dt[col].astype('category').cat.codes
dt.replace(-1, np.nan, inplace=True)

for col in categorical_columns:
    dt[col].fillna(dt[col].mode()[0], inplace=True)

dt.isna().sum()

num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
for col in num_cols:
    dt[col].fillna(dt[col].median(), inplace=True)

dt['Credit_History'].fillna(dt['Credit_History'].mode()[0], inplace=True)

dt['Total_Income'] = dt['ApplicantIncome'] + dt['CoapplicantIncome']
dt['Loan_Income_Ratio'] = dt['LoanAmount'] / dt['Total_Income']
dt['EMI'] = dt['LoanAmount'] / dt['Loan_Amount_Term']
dt['Balance_Income'] = dt['Total_Income'] - dt['EMI']

numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Total_Income', 'Loan_Income_Ratio', 'EMI', 'Balance_Income']

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

vif_features = dt[numerical_features].dropna()

# Standardize features for VIF
scaler = StandardScaler()
vif_scaled = scaler.fit_transform(vif_features)

# Calculating VIF
vif_data = pd.DataFrame()
vif_data["Feature"] = vif_features.columns
vif_data["VIF"] = [variance_inflation_factor(vif_scaled, i) for i in range(vif_scaled.shape[1])]

dt.drop(columns=['Total_Income', 'CoapplicantIncome', 'Balance_Income', 'EMI'], inplace=True)
dt.drop(columns=['property_Area', 'Gender'], inplace=True)



selected_features = [
    'Credit_History', 'Semiurban', 'Rural', 'Married',
    'Education', 'Loan_Income_Ratio', 'Urban', 'LoanAmount'
]

x_test = dt[selected_features]


x_test



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dt[col].fillna(dt[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dt[col].fillna(dt[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

Unnamed: 0,Credit_History,Semiurban,Rural,Married,Education,Loan_Income_Ratio,Urban,LoanAmount
0,1.0,0,0,1,0,0.019231,1,110.0
1,1.0,0,0,1,0,0.027535,1,126.0
2,1.0,0,0,1,0,0.030588,1,208.0
3,1.0,0,0,1,0,0.020467,1,100.0
4,1.0,0,0,0,1,0.023810,1,78.0
...,...,...,...,...,...,...,...,...
362,1.0,0,0,1,1,0.019530,1,113.0
363,1.0,0,0,1,0,0.023629,1,115.0
364,1.0,1,0,0,0,0.024032,0,126.0
365,1.0,0,1,1,0,0.021372,0,158.0


In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Scale the data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train_scaled, y_train)

# Predict
y_pred_test = rf.predict(x_test_scaled)

y_pred_test

array([1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,