**Reasoning**:
Import pandas, load the dataset, and display the first 5 rows and the info.



In [1]:
import pandas as pd

df = pd.read_csv('/content/loan_approval_dataset.csv')
display(df.head())
display(df.info())

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


None

In [5]:
categorical_cols = [' education', ' self_employed']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

X = df_encoded.drop(' loan_status', axis=1)
y = df_encoded[' loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (3415, 12)
Shape of X_test: (854, 12)
Shape of y_train: (3415,)
Shape of y_test: (854,)


In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model.fit(X_train, y_train_encoded)

y_pred_encoded = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9789227166276346


In [8]:
def calculate_max_loan(income_annum, residential_assets_value, commercial_assets_value, luxury_assets_value, bank_asset_value):
    """Calculates the maximum safe loan amount based on income and asset values."""
    total_assets = residential_assets_value + commercial_assets_value + luxury_assets_value + bank_asset_value
    # A simple rule: max loan is a multiple of income plus a fraction of total assets
    max_loan = (income_annum * 3) + (total_assets * 0.5)
    return max_loan

In [13]:
import shap

def predict_loan_status_with_explainability(applicant_details):
    """
    Predicts loan approval status, calculates max loan if approved, and explains
    rejections using SHAP values with more understandable context.

    Args:
        applicant_details (dict): A dictionary containing the applicant's details.

    Returns:
        str: A message indicating approval status, max loan, or rejection with
             top negative factors and their values.
    """
    # Create a DataFrame from applicant details, ensuring column order
    applicant_df = pd.DataFrame([applicant_details])
    applicant_df = applicant_df[X_train.columns] # Ensure column order matches training data

    # Predict loan status
    applicant_pred_encoded = xgb_model.predict(applicant_df)
    applicant_pred_label = le.inverse_transform(applicant_pred_encoded)[0]

    if applicant_pred_label == ' Approved':
        # Calculate maximum safe loan amount if approved
        max_loan = calculate_max_loan(
            applicant_details[' income_annum'],
            applicant_details[' residential_assets_value'],
            applicant_details[' commercial_assets_value'],
            applicant_details[' luxury_assets_value'],
            applicant_details[' bank_asset_value']
        )
        return f"Loan Approved! Maximum safe loan amount: ${max_loan:,.2f}"
    else:
        # Calculate SHAP values for rejection explanation
        explainer = shap.TreeExplainer(xgb_model)
        # SHAP values for the prediction of the rejected class
        shap_values = explainer.shap_values(applicant_df)[:, list(le.classes_).index(' Rejected')]

        # Identify the top 3 features with the highest SHAP values (contributing most to rejection)
        feature_importances = pd.Series(shap_values[0], index=applicant_df.columns)
        top_negative_factors = feature_importances.sort_values(ascending=False).head(3)

        rejection_message = f"Loan Rejected. Top 3 factors contributing to the rejection:\n"
        for feature, shap_value in top_negative_factors.items():
            applicant_value = applicant_details.get(feature, 'N/A') # Get applicant's value for the feature
            rejection_message += f"- {feature}: Applicant's value = {applicant_value}, SHAP value = {shap_value:.4f}\n"


        return rejection_message

In [14]:
# Sample applicant 1 (likely to be approved)
new_applicant_1 = {
    'loan_id': 9999,
    ' no_of_dependents': 2,
    ' income_annum': 10000000,
    ' loan_amount': 20000000,
    ' loan_term': 15,
    ' cibil_score': 750,
    ' residential_assets_value': 5000000,
    ' commercial_assets_value': 8000000,
    ' luxury_assets_value': 15000000,
    ' bank_asset_value': 6000000,
    ' education_ Not Graduate': False,  # Corrected key
    ' self_employed_ Yes': False       # Corrected key
}

result_1 = predict_loan_status_with_explainability(new_applicant_1)
print("Applicant 1 Result:")
print(result_1)
print("-" * 30)

# Sample applicant 2 (likely to be rejected)
new_applicant_2 = {
    'loan_id': 8888,
    ' no_of_dependents': 4,
    ' income_annum': 2000000,
    ' loan_amount': 15000000,
    ' loan_term': 20,
    ' cibil_score': 500,
    ' residential_assets_value': 1000000,
    ' commercial_assets_value': 500000,
    ' luxury_assets_value': 2000000,
    ' bank_asset_value': 100000,
    ' education_ Not Graduate': True,   # Corrected key
    ' self_employed_ Yes': True        # Corrected key
}

result_2 = predict_loan_status_with_explainability(new_applicant_2)
print("Applicant 2 Result:")
print(result_2)
print("-" * 30)

Applicant 1 Result:
Loan Approved! Maximum safe loan amount: $47,000,000.00
------------------------------
Applicant 2 Result:
Loan Rejected. Top 3 factors contributing to the rejection:
- loan_id: Applicant's value = 8888, SHAP value = -0.0677
-  no_of_dependents: Applicant's value = 4, SHAP value = -0.0677
-  income_annum: Applicant's value = 2000000, SHAP value = -0.0677

------------------------------
