In [1]:
# 1. INSTALLATION AND IMPORTS

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
import xgboost as xgb
import lightgbm as lgb
import mlflow
import mlflow.sklearn
import sqlite3
import joblib
import warnings
warnings.filterwarnings('ignore')

# Initialize MLflow
mlflow.set_experiment("CreditPathAI_Loan_Default_Prediction")


<Experiment: artifact_location='file:///C:/Users/keyur/Desktop/Infosys%20Internship/mlruns/585164079611077161', creation_time=1763211325602, experiment_id='585164079611077161', last_update_time=1763211325602, lifecycle_stage='active', name='CreditPathAI_Loan_Default_Prediction', tags={}>

In [2]:
# 2. DATA INGESTION
def load_data(file_path):
    """Load and prepare the loan default dataset from a CSV file"""
    try:
        df = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
        print(f"Shape: {df.shape}")
        print(df.head())
        return df
    except FileNotFoundError:
        print("File not found.")
    except Exception as e:
        print(f"Error loading CSV: {e}")

file_path = "Loan_default.csv"
df = load_data(file_path)

Dataset loaded successfully!
Shape: (255347, 18)
       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               4         15.23        36      0.44   Bachelor's   
1               1          4.81        60      0.68     Master's   
2               3         21.17        24      0.31     Master's   
3               3          7.07        24      0.23  High School   
4               4          6.51        48      0.73   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0      Full-time      Divorced         Yes 

In [3]:
# 3. EXPLORATORY DATA ANALYSIS
def perform_eda(df):
    """Quick EDA without visualizations"""
    print("=== EXPLORATORY DATA ANALYSIS ===")
    print("\nDataset Info:")
    print(df.info())
    print("\nStatistical Summary:")
    print(df.describe())
    print("\nMissing Values:")
    print(df.isnull().sum())
    print("\nTarget Distribution:")
    print(df['Default'].value_counts())

perform_eda(df)

=== EXPLORATORY DATA ANALYSIS ===

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   LoanID          255347 non-null  object 
 1   Age             255347 non-null  int64  
 2   Income          255347 non-null  int64  
 3   LoanAmount      255347 non-null  int64  
 4   CreditScore     255347 non-null  int64  
 5   MonthsEmployed  255347 non-null  int64  
 6   NumCreditLines  255347 non-null  int64  
 7   InterestRate    255347 non-null  float64
 8   LoanTerm        255347 non-null  int64  
 9   DTIRatio        255347 non-null  float64
 10  Education       255347 non-null  object 
 11  EmploymentType  255347 non-null  object 
 12  MaritalStatus   255347 non-null  object 
 13  HasMortgage     255347 non-null  object 
 14  HasDependents   255347 non-null  object 
 15  LoanPurpose     255347 non-null  object 
 16  HasCoSi

In [4]:
# 4. FEATURE ENGINEERING
def feature_engineering(df):
    """Feature engineering and preprocessing"""
    print("=== FEATURE ENGINEERING ===")
    
    df_processed = df.copy()
    df_processed['LoanToIncome'] = df_processed['LoanAmount'] / df_processed['Income']
    df_processed['MonthlyPayment'] = (
        (df_processed['LoanAmount'] *
         (df_processed['InterestRate']/100/12) *
         (1 + df_processed['InterestRate']/100/12)**df_processed['LoanTerm']) /
        ((1 + df_processed['InterestRate']/100/12)**df_processed['LoanTerm'] - 1)
    )
    df_processed['PaymentToIncome'] = df_processed['MonthlyPayment'] / (df_processed['Income']/12)
    df_processed['CreditUtilization'] = df_processed['LoanAmount'] / df_processed['CreditScore']
    
    categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 
                        'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
    
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df_processed[col + '_encoded'] = le.fit_transform(df_processed[col])
        label_encoders[col] = le
    
    feature_cols = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 
                    'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 
                    'LoanToIncome', 'MonthlyPayment', 'PaymentToIncome', 'CreditUtilization'] + \
                   [col + '_encoded' for col in categorical_cols]
    
    X = df_processed[feature_cols]
    y = df_processed['Default']
    
    print(f"Features: {X.shape[1]} | Records: {X.shape[0]}")
    return X, y, df_processed, label_encoders

X, y, df_processed, label_encoders = feature_engineering(df)

joblib.dump(label_encoders, "models/label_encoders.pkl")
print("Saved label_encoders.pkl")

joblib.dump(list(X.columns), "models/feature_columns.pkl")
print("Saved feature_columns.pkl")

=== FEATURE ENGINEERING ===
Features: 20 | Records: 255347
Saved label_encoders.pkl
Saved feature_columns.pkl


In [5]:
# 5. DATA PREPROCESSING
def preprocess_data(X, y):
    """Split and scale data"""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Data preprocessed successfully!")
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

X_train, X_test, y_train, y_test, scaler = preprocess_data(X, y)

joblib.dump(scaler, "models/scaler.pkl")
print("Scaler saved as scaler.pkl")

Data preprocessed successfully!
Scaler saved as scaler.pkl


In [6]:
# 6. MODEL TRAINING AND EVALUATION (NO ROC CURVES)
def evaluate_model(model, X_test, y_test, model_name):
    """Evaluate model performance"""
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    cm = confusion_matrix(y_test, y_pred)
    print(f"\n=== {model_name} RESULTS ===")
    print(f"AUC-ROC Score: {auc_roc:.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    return auc_roc

models = {}
results = {}

# Logistic Regression
with mlflow.start_run(run_name="Logistic_Regression"):
    lr_model = LogisticRegression(random_state=42, class_weight="balanced")
    lr_model.fit(X_train, y_train)
    models['Logistic Regression'] = lr_model
    auc = evaluate_model(lr_model, X_test, y_test, "Logistic Regression")
    results['Logistic Regression'] = auc
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_metric("auc_roc", auc)
    mlflow.sklearn.log_model(lr_model, name="logistic_regression_model")

    joblib.dump(lr_model, "models/logistic_regression_model.pkl")
    print("Saved logistic_regression_model.pkl")

# XGBoost
with mlflow.start_run(run_name="XGBoost"):
    xgb_model = xgb.XGBClassifier(random_state=42, n_estimators=100, max_depth=6, learning_rate=0.1, class_weight="balanced")
    xgb_model.fit(X_train, y_train)
    models['XGBoost'] = xgb_model
    auc = evaluate_model(xgb_model, X_test, y_test, "XGBoost")
    results['XGBoost'] = auc
    mlflow.log_param("model", "XGBoost")
    mlflow.log_metric("auc_roc", auc)
    mlflow.xgboost.log_model(xgb_model, name="xgboost_model")

    joblib.dump(xgb_model, "models/xgboost_model.pkl")
    print("Saved xgboost_model.pkl")

# LightGBM
with mlflow.start_run(run_name="LightGBM"):
    lgb_model = lgb.LGBMClassifier(random_state=42, n_estimators=100, max_depth=6, learning_rate=0.1, class_weight="balanced")
    lgb_model.fit(X_train, y_train)
    models['LightGBM'] = lgb_model
    auc = evaluate_model(lgb_model, X_test, y_test, "LightGBM")
    results['LightGBM'] = auc
    mlflow.log_param("model", "LightGBM")
    mlflow.log_metric("auc_roc", auc)
    mlflow.lightgbm.log_model(lgb_model, name="lightgbm_model")

    joblib.dump(lgb_model, "models/lightgbm_model.pkl")
    print("Saved lightgbm_model.pkl")



=== Logistic Regression RESULTS ===
AUC-ROC Score: 0.7594
Confusion Matrix:
[[46690 21019]
 [ 2731  6165]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.69      0.80     67709
           1       0.23      0.69      0.34      8896

    accuracy                           0.69     76605
   macro avg       0.59      0.69      0.57     76605
weighted avg       0.86      0.69      0.74     76605





Saved logistic_regression_model.pkl

=== XGBoost RESULTS ===
AUC-ROC Score: 0.7556
Confusion Matrix:
[[67273   436]
 [ 8263   633]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     67709
           1       0.59      0.07      0.13      8896

    accuracy                           0.89     76605
   macro avg       0.74      0.53      0.53     76605
weighted avg       0.86      0.89      0.84     76605





Saved xgboost_model.pkl
[LightGBM] [Info] Number of positive: 20757, number of negative: 157985
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006020 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2335
[LightGBM] [Info] Number of data points in the train set: 178742, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

=== LightGBM RESULTS ===
AUC-ROC Score: 0.7570
Confusion Matrix:
[[47586 20123]
 [ 2901  5995]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.70      0.81     67709
           1       0.23      0.67      0.34      8896

    accuracy                           0.70     76605
   macro avg       0.59      0.69      0.57     76605
weighted avg       0.86 



Saved lightgbm_model.pkl


In [7]:
# 7. MODEL COMPARISON (TEXT ONLY)
print("\n=== MODEL COMPARISON ===")
model_comparison = pd.DataFrame({'Model': results.keys(), 'AUC-ROC': results.values()}).sort_values('AUC-ROC', ascending=False)
print(model_comparison)


=== MODEL COMPARISON ===
                 Model   AUC-ROC
0  Logistic Regression  0.759395
2             LightGBM  0.757038
1              XGBoost  0.755599


In [8]:
# 9. RECOMMENDATIONS (INPUT BASED)
def generate_recommendations(model, scaler, sample_data):
    """Generate risk-based recovery recommendations"""
    scaled = scaler.transform(sample_data.reshape(1, -1))
    prob = model.predict_proba(scaled)[0, 1]
    if prob < 0.3:
        level, action = "Low Risk", "Monitor regularly"
    elif prob < 0.7:
        level, action = "Medium Risk", "Contact borrower, offer payment plan options"
    else:
        level, action = "High Risk", "Escalate to collections, consider restructuring"
    return prob, level, action

def get_recommendations_by_loanid(df, X, model, scaler):
    """Ask user for LoanID and generate recommendations"""
    loan_id = input("\nEnter LoanID to generate recommendations: ").strip()
    if loan_id not in df['LoanID'].astype(str).values:
        print("LoanID not found.")
        return
    idx = df[df['LoanID'].astype(str) == loan_id].index[0]
    data = X.iloc[idx].values
    prob, level, action = generate_recommendations(model, scaler, data)
    print(f"\n=== RECOMMENDATION FOR LoanID: {loan_id} ===")
    print(f"Default Probability: {prob:.2%}")
    print(f"Risk Level: {level}")
    print(f"Recommended Action: {action}")

best_model = models[model_comparison.iloc[0]['Model']]
get_recommendations_by_loanid(df, X, best_model, scaler)


Enter LoanID to generate recommendations:  42SRSHU039



=== RECOMMENDATION FOR LoanID: 42SRSHU039 ===
Default Probability: 89.24%
Risk Level: High Risk
Recommended Action: Escalate to collections, consider restructuring


In [9]:
import pandas as pd
import joblib
import numpy as np

# ---- Load model & preprocessing artifacts ----
model = joblib.load("models/logistic_regression_model.pkl")
scaler = joblib.load("models/scaler.pkl")
label_encoders = joblib.load("models/label_encoders.pkl")
feature_columns = joblib.load("models/feature_columns.pkl")

print("Loaded model, scaler, encoders, and feature list successfully!")

# ---- Collect ALL user inputs ----
data = {}

data["Age"] = int(input("Age: "))
data["Income"] = float(input("Annual Income: "))
data["LoanAmount"] = float(input("Loan Amount: "))
data["CreditScore"] = float(input("Credit Score: "))
data["MonthsEmployed"] = int(input("Months Employed: "))
data["NumCreditLines"] = int(input("Number of Credit Lines: "))
data["InterestRate"] = float(input("Interest Rate (%): "))
data["LoanTerm"] = int(input("Loan Term (months): "))
data["DTIRatio"] = float(input("Debt-to-Income Ratio: "))

# ---- Categorical Inputs ----
data["Education"] = input("Education (Bachelor's/Master's/etc): ")
data["EmploymentType"] = input("Employment Type (Full-time/Part-time/etc): ")
data["MaritalStatus"] = input("Marital Status (Married/Single/etc): ")
data["HasMortgage"] = input("Has Mortgage (Yes/No): ")
data["HasDependents"] = input("Has Dependents (Yes/No): ")
data["LoanPurpose"] = input("Loan Purpose (Home/Other/etc): ")
data["HasCoSigner"] = input("Has Co-Signer (Yes/No): ")

df_input = pd.DataFrame([data])

# ---- Feature Engineering (Same as Training) ----
df_input["LoanToIncome"] = df_input["LoanAmount"] / df_input["Income"]

df_input["MonthlyPayment"] = (
    (df_input["LoanAmount"] * 
     (df_input["InterestRate"] / 100 / 12) *
     (1 + df_input["InterestRate"] / 100 / 12) ** df_input["LoanTerm"]) /
    ((1 + df_input["InterestRate"] / 100 / 12) ** df_input["LoanTerm"] - 1)
)

df_input["PaymentToIncome"] = df_input["MonthlyPayment"] / (df_input["Income"] / 12)
df_input["CreditUtilization"] = df_input["LoanAmount"] / df_input["CreditScore"]

# ---- Apply Label Encoding ----
for col, encoder in label_encoders.items():
    df_input[col + "_encoded"] = encoder.transform(df_input[col])

# ---- Build full input frame with ALL features ----
final_input = pd.DataFrame(columns=feature_columns)
final_input.loc[0] = 0
final_input.update(df_input)

# ---- Scale input ----
scaled_input = scaler.transform(final_input)

# ---- Predict ----
pred = model.predict(scaled_input)[0]
prob = model.predict_proba(scaled_input)[0][1]

# ---- Result ----
print("\n==== FINAL PREDICTION ====")
print("Default Probability:", round(prob, 4))

if pred == 1:
    print("ðŸ”´ HIGH DEFAULT RISK")
else:
    print("ðŸŸ¢ LOW DEFAULT RISK")


Loaded model, scaler, encoders, and feature list successfully!


Age:  24
Annual Income:  120000
Loan Amount:  400000
Credit Score:  300
Months Employed:  10
Number of Credit Lines:  3
Interest Rate (%):  15
Loan Term (months):  60
Debt-to-Income Ratio:  0.8
Education (Bachelor's/Master's/etc):  Bachelor's
Employment Type (Full-time/Part-time/etc):  Full-time
Marital Status (Married/Single/etc):  Married
Has Mortgage (Yes/No):  Yes
Has Dependents (Yes/No):  Yes
Loan Purpose (Home/Other/etc):  Other
Has Co-Signer (Yes/No):  No



==== FINAL PREDICTION ====
Default Probability: 0.8135
ðŸ”´ HIGH DEFAULT RISK
