In [277]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import numpy as np

# Loan Eligibility Prediction

This notebook demonstrates the process of predicting loan eligibility based on customer details. The steps include data preprocessing, model training, and generating predictions for the test dataset.

## Step 1: Import Necessary Libraries
```python
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import numpy as np


Step 2: Load Datasets

In [278]:
# Load datasets
train_data = pd.read_csv('Training Dataset.csv')
test_data = pd.read_csv('Test Dataset.csv')
sample_submission = pd.read_csv('Sample_Submission.csv')

# Display the first few rows of each dataset
print("Training Data:")
print(train_data.head())

print("\nTest Data:")
print(test_data.head())

print("\nSample Submission:")
print(sample_submission.head())


Training Data:
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2    

Step 3: Handle Missing Values

In [279]:
# Handle missing values in training data
train_data['Gender'] = train_data['Gender'].fillna(train_data['Gender'].mode()[0])
train_data['Married'] = train_data['Married'].fillna(train_data['Married'].mode()[0])
train_data['Dependents'] = train_data['Dependents'].fillna(train_data['Dependents'].mode()[0])
train_data['Self_Employed'] = train_data['Self_Employed'].fillna(train_data['Self_Employed'].mode()[0])
train_data['Credit_History'] = train_data['Credit_History'].fillna(train_data['Credit_History'].mode()[0])
train_data['LoanAmount'] = train_data['LoanAmount'].fillna(train_data['LoanAmount'].mean())
train_data['Loan_Amount_Term'] = train_data['Loan_Amount_Term'].fillna(train_data['Loan_Amount_Term'].mean())

# Handle missing values in testing data
test_data['Gender'] = test_data['Gender'].fillna(test_data['Gender'].mode()[0])
test_data['Married'] = test_data['Married'].fillna(test_data['Married'].mode()[0])
test_data['Dependents'] = test_data['Dependents'].fillna(test_data['Dependents'].mode()[0])
test_data['Self_Employed'] = test_data['Self_Employed'].fillna(test_data['Self_Employed'].mode()[0])
test_data['Credit_History'] = test_data['Credit_History'].fillna(test_data['Credit_History'].mode()[0])
test_data['LoanAmount'] = test_data['LoanAmount'].fillna(test_data['LoanAmount'].mean())
test_data['Loan_Amount_Term'] = test_data['Loan_Amount_Term'].fillna(test_data['Loan_Amount_Term'].mean())


Step 4: Encode Categorical Variables

In [280]:
# One-hot encode categorical variables
train_data = pd.get_dummies(train_data, columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'], drop_first=True)


Step 5: Split Training Data into Training and Validation Sets

In [281]:
# Split the training data into training and validation sets
X = train_data.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = train_data['Loan_Status']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


Step 6: Scale Numerical Features

In [282]:
# Scale numerical features
scaler = StandardScaler()
X_train[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']] = scaler.fit_transform(X_train[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']])
X_val[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']] = scaler.transform(X_val[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']])
test_data[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']] = scaler.transform(test_data[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']])


Step 7: Train a Logistic Regression Model

In [283]:
# Train a Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict on the validation set
y_val_pred_log = log_reg.predict(X_val)

# Evaluate the Logistic Regression model
accuracy_log = accuracy_score(y_val, y_val_pred_log)
classification_rep_log = classification_report(y_val, y_val_pred_log)

print("Logistic Regression Model:")
print(f"Accuracy: {accuracy_log}")
print(f"Classification Report:\n{classification_rep_log}")


Logistic Regression Model:
Accuracy: 0.7886178861788617
Classification Report:
              precision    recall  f1-score   support

           N       0.95      0.42      0.58        43
           Y       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123



Step 8: Train a Random Forest Model

In [284]:
# Train a Random Forest model
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

# Predict on the validation set
y_val_pred_rf = rf_clf.predict(X_val)

# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
classification_rep_rf = classification_report(y_val, y_val_pred_rf)

print("\nRandom Forest Model:")
print(f"Accuracy: {accuracy_rf}")
print(f"Classification Report:\n{classification_rep_rf}")



Random Forest Model:
Accuracy: 0.7804878048780488
Classification Report:
              precision    recall  f1-score   support

           N       0.90      0.42      0.57        43
           Y       0.76      0.97      0.85        80

    accuracy                           0.78       123
   macro avg       0.83      0.70      0.71       123
weighted avg       0.81      0.78      0.75       123



First, ENCODE THE TARGET VARIABLES, AND make sure you're using the LabelEncoder consistently:

In [285]:
from sklearn.preprocessing import LabelEncoder

# Create the LabelEncoder
loan_status_le = LabelEncoder()

# Fit the encoder on the entire training set
loan_status_le.fit(train_data['Loan_Status'])

# Transform the training data
y = loan_status_le.transform(train_data['Loan_Status'])

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

When making predictions on the test set, you'll need to decode the predictions back to 'Y' and 'N':

In [286]:
# Predict on the test set
X_test = test_data.drop('Loan_ID', axis=1)
test_preds_lr = lin_reg.predict(X_test)
test_preds_lr_binary = np.round(test_preds_lr).astype(int)

# Decode predictions
test_preds_decoded = loan_status_le.inverse_transform(test_preds_lr_binary)

When training your models, use the transformed y values

In [287]:
# Train Random Forest model (or whichever model you choose as final_model)
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

FOR prediction and evaluation, use the encoded values and then decode when necessary

In [288]:
# # Predict on the validation set
# y_val_pred_rf = rf_clf.predict(X_val)

# # Evaluate using encoded values
# accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
# classification_rep_rf = classification_report(y_val, y_val_pred_rf)

# print("\nRandom Forest Model:")
# print(f"Accuracy: {accuracy_rf}")
# print(f"Classification Report:\n{classification_rep_rf}")

# # If you need to see the actual 'Y' and 'N' values:
# y_val_decoded = loan_status_le.inverse_transform(y_val)
# y_val_pred_rf_decoded = loan_status_le.inverse_transform(y_val_pred_rf)

FOR THE FINAL PREDICTION ON THE TEST SET

In [289]:
# Choose the better model for final prediction
final_model = rf_clf  # or whichever model you choose

# Predict on the test set
X_test = test_data.drop('Loan_ID', axis=1)
test_preds = final_model.predict(X_test)

# Decode predictions
test_preds_decoded = loan_status_le.inverse_transform(test_preds)

# Prepare the submission file
submission = pd.DataFrame({'Loan_ID': test_data['Loan_ID'], 'Loan_Status': test_preds_decoded})
# submission.to_csv('Final_Submission.csv', index=False)

print("\nFinal Submission sample as Random forest is :")
print(submission.head())


Final Submission sample as Random forest is :
    Loan_ID Loan_Status
0  LP001015           Y
1  LP001022           Y
2  LP001031           Y
3  LP001035           Y
4  LP001051           N


In [290]:
# Predict on the test set
X_test = test_data.drop('Loan_ID', axis=1)
test_preds_lr = lin_reg.predict(X_test)
test_preds_lr_binary = np.round(test_preds_lr).astype(int)

# Decode predictions
test_preds_decoded = loan_status_le.inverse_transform(test_preds_lr_binary)

In [291]:
# Train a Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [292]:
# Choose the better model for final prediction
final_model = log_reg  # or whichever model you choose

# Use the chosen model for final predictions
X_test = test_data.drop('Loan_ID', axis=1)
test_preds = final_model.predict(X_test)

# Decode predictions
test_preds_decoded = loan_status_le.inverse_transform(test_preds)

# Prepare the submission file
submission = pd.DataFrame({'Loan_ID': test_data['Loan_ID'], 'Loan_Status': test_preds_decoded})
# submission.to_csv('Final_Submission.csv', index=False)

print("\nFinal Submission:")
print(submission.head())


Final Submission:
    Loan_ID Loan_Status
0  LP001015           Y
1  LP001022           Y
2  LP001031           Y
3  LP001035           Y
4  LP001051           Y


Step 10: Choose the Better Model and Make Final Predictions

In [293]:
# Train and evaluate Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train, y_train)
y_val_pred_log = log_reg.predict(X_val)

accuracy_log = accuracy_score(y_val, y_val_pred_log)
classification_rep_log = classification_report(y_val, y_val_pred_log)

print("Logistic Regression Model:")
print(f"Accuracy: {accuracy_log}")
print(f"Classification Report:\n{classification_rep_log}")

# Train and evaluate Random Forest model
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
y_val_pred_rf = rf_clf.predict(X_val)

accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
classification_rep_rf = classification_report(y_val, y_val_pred_rf)

print("\nRandom Forest Model:")
print(f"Accuracy: {accuracy_rf}")
print(f"Classification Report:\n{classification_rep_rf}")

# Compare and choose the best model
if accuracy_log > accuracy_rf:
    print("\nLogistic Regression performs better.")
    final_model = log_reg
else:
    print("\nRandom Forest performs better.")
    final_model = rf_clf

# Use the chosen model for final predictions
X_test = test_data.drop('Loan_ID', axis=1)
test_preds = final_model.predict(X_test)

# Decode predictions
test_preds_decoded = loan_status_le.inverse_transform(test_preds)

# Prepare the submission file
submission = pd.DataFrame({'Loan_ID': test_data['Loan_ID'], 'Loan_Status': test_preds_decoded})
submission.to_csv('Final_Submission.csv', index=False)

print("\nFinal Submission:")
print(submission.head())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Model:
Accuracy: 0.7886178861788617
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123


Random Forest Model:
Accuracy: 0.7804878048780488
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.42      0.57        43
           1       0.76      0.97      0.85        80

    accuracy                           0.78       123
   macro avg       0.83      0.70      0.71       123
weighted avg       0.81      0.78      0.75       123


Logistic Regression performs better.

Final Submission:
    Loan_ID Loan_Status
0  LP001015           Y
1  LP001022           Y
2  LP001031           Y
3  LP001035           Y
4  LP001051     