In [51]:
# 1. Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [52]:
# 2. Load dataset
data_file_path = pd.read_csv('/content/10000_loan_dataset_Apr24_v1.csv')
df = data_file_path.copy()


In [55]:
# 3. Drop irrelevant columns
df = df.drop(columns=['customer_id', 'loan_id'])

In [56]:
# 4. Identify columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = df.select_dtypes(include=['object', 'bool']).columns.tolist()

In [57]:
df

Unnamed: 0,age,salary_net_income,social_security_net_income,self_employed_net_income,salary_frequency,months_current_job,total_experience_months,current_bankruptcy_status,ever_bankrupt,loan_amount,...,past_to_be_collected,past_collection_amount,collectibility_score,refinance,extended_payment_plan,settlement_plans_with_waive_off,accepted_option,promised_to_pay,promised_to_pay_status,expected_collection_amount
0,64,1909,0,323,biweekly,63.0,121.0,False,False,1201,...,0,0,73.41,False,False,False,Refinance,,,1327.14
1,74,4212,0,0,biweekly,1.0,19.0,False,False,4016,...,0,0,41.64,False,False,False,Refinance,,,805.13
2,91,588,0,0,weekly,72.0,290.0,False,False,937,...,4471,467,36.39,True,True,True,Refinance,No,Partially kept,467.00
3,42,0,4611,0,,,,False,False,4794,...,3908,541,28.25,False,True,True,Settlement_plans_with_waive_off,Yes,Partially kept,541.00
4,43,4592,0,0,monthly,85.0,85.0,False,False,1976,...,3998,3885,71.97,True,False,True,Settlement_plans_with_waive_off,Yes,Partially kept,3885.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,63,3027,0,0,weekly,21.0,308.0,False,False,4644,...,0,0,66.97,False,False,False,Refinance,,,1065.21
9996,97,4143,0,0,biweekly,76.0,819.0,False,True,2787,...,8952,560,63.73,True,False,False,Refinance,Yes,Partially kept,560.00
9997,24,0,4341,0,,,,False,False,2214,...,0,0,59.11,False,False,False,Refinance,,,1561.20
9998,24,3941,0,0,weekly,2.0,10.0,False,False,4485,...,4208,0,34.06,True,True,True,Extended_payment_plan,No,not kept,0.00


In [58]:
# 5. Preprocess numerical columns
for col in numerical_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # force non-numeric to NaN
    df[col] = df[col].fillna(0.01)  # fill missing numericals with 0.01

In [59]:
# 6. Preprocess categorical columns
label_encoders = {}
for col in categorical_columns:
    df[col] = df[col].astype(str)  # Convert NaN to 'nan' string
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save the encoder for each column


In [60]:
# 7. Split into Features (X) and Target (Y)
X = df.drop(columns=['accepted_option'])
Y = df['accepted_option']


In [61]:
# 8. Final check for NaNs
X = X.apply(pd.to_numeric, errors='coerce')
X = X.fillna(0.01)
assert X.isna().sum().sum() == 0
assert Y.isna().sum() == 0

In [62]:
# 9. Train/Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [63]:
# 10. Train Logistic Regression
model = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [64]:
# 11. Evaluate the Model (Optional)
y_pred = model.predict(X_test)
print("Accuracy on Test Set:", accuracy_score(Y_test, y_pred))
print(classification_report(Y_test, y_pred, target_names=label_encoders['accepted_option'].classes_))

Accuracy on Test Set: 0.629
                                 precision    recall  f1-score   support

          Extended_payment_plan       0.39      0.18      0.24       377
                      Refinance       0.69      0.92      0.79      1235
Settlement_plans_with_waive_off       0.32      0.15      0.21       388

                       accuracy                           0.63      2000
                      macro avg       0.47      0.41      0.41      2000
                   weighted avg       0.56      0.63      0.57      2000



In [93]:
# Sample input test with one example

In [94]:
# INPUT

new_customer = {
    'age': 50,
    'salary_net_income': 2500,
    'social_security_net_income': 0,
    'self_employed_net_income': 0,
    'salary_frequency': 'monthly',  # From your dataset options
    'months_current_job': 36,
    'total_experience_months': 120,
    'current_bankruptcy_status': False,
    'ever_bankrupt': False,
    'loan_amount': 3000,
    'past_to_be_collected': 5000,
    'past_collection_amount': 0,
    'collectibility_score': 60.0,
    'refinance': True,
    'extended_payment_plan': False,
    'settlement_plans_with_waive_off': False,
    'promised_to_pay': True,
    'promised_to_pay_status': 'kept',
    'expected_collection_amount': 4500
}


In [95]:
# INPUT EVALUATION

def predict_customer(new_customer_input):

    new_customer_df = pd.DataFrame([new_customer_input])


    for col in numerical_columns:
        if col not in new_customer_df.columns:
            new_customer_df[col] = 0.01

    for col in categorical_columns:
        if col not in new_customer_df.columns:
            new_customer_df[col] = 'nan'


    new_customer_df[numerical_columns] = new_customer_df[numerical_columns].fillna(0.01)


    for col in categorical_columns:
        new_customer_df[col] = new_customer_df[col].astype(str)
        le = label_encoders[col]


        if 'nan' not in le.classes_:
            le.classes_ = np.append(le.classes_, 'nan')


        new_customer_df[col] = new_customer_df[col].apply(lambda x: x if x in le.classes_ else 'nan')
        new_customer_df[col] = le.transform(new_customer_df[col])


    new_customer_df = new_customer_df[X_train.columns]

    y_pred = model.predict(new_customer_df)

    final_label = label_encoders['accepted_option'].inverse_transform(y_pred)

    print(f"\n The best option for the customer is: {final_label[0]}")


In [96]:
# OUT PUT OF SAMPLE
predict_customer(new_customer)


 The best option for the customer is: Refinance


In [100]:

# GridSearchCV on Logistic, Decision Tree, Random Forest

In [81]:
from sklearn.metrics import classification_report

# Find out unique labels actually present in Y_test
unique_labels = np.unique(Y_test)

# Now match labels and target names properly
print(classification_report(
    Y_test,
    y_pred,
    labels=unique_labels,
    target_names=label_encoders['accepted_option'].inverse_transform(unique_labels)
))


                                 precision    recall  f1-score   support

          Extended_payment_plan       0.39      0.18      0.24       377
                      Refinance       0.69      0.92      0.79      1235
Settlement_plans_with_waive_off       0.32      0.15      0.21       388

                       accuracy                           0.63      2000
                      macro avg       0.47      0.41      0.41      2000
                   weighted avg       0.56      0.63      0.57      2000



In [82]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(Y_test, y_pred)
print(f" Model Accuracy: {accuracy:.4f}")

✅ Model Accuracy: 0.6290


In [83]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [84]:
# 1. Define models
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', max_iter=1000, solver='lbfgs'),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}


In [85]:
# 2. Define parameter grids
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs', 'newton-cg'],
        'max_iter': [500, 1000]
    },
    'Decision Tree': {
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10]
    },
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5]
    }
}

In [86]:
# 3. Train models using GridSearchCV
best_models = {}

for name, model in models.items():
    print(f"\n🔍 Running GridSearchCV for {name}...")
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=2
    )
    grid.fit(X_train, Y_train)

    print(f"✅ Best Cross-Validation Accuracy for {name}: {grid.best_score_:.4f}")
    print(f"✅ Best Hyperparameters: {grid.best_params_}")



🔍 Running GridSearchCV for Logistic Regression...
Fitting 5 folds for each of 16 candidates, totalling 80 fits




✅ Best Cross-Validation Accuracy for Logistic Regression: 0.8314
✅ Best Hyperparameters: {'C': 0.1, 'max_iter': 500, 'solver': 'newton-cg'}

🔍 Running GridSearchCV for Decision Tree...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
✅ Best Cross-Validation Accuracy for Decision Tree: 0.8291
✅ Best Hyperparameters: {'max_depth': 5, 'min_samples_split': 2}

🔍 Running GridSearchCV for Random Forest...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
✅ Best Cross-Validation Accuracy for Random Forest: 0.8340
✅ Best Hyperparameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}


In [89]:
    # Save the best model
    best_models[name] = grid.best_estimator_

In [91]:
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(Y_test, y_pred)
    print(f"\n✅ {name} Accuracy on Test Set: {acc:.4f}")



✅ Random Forest Accuracy on Test Set: 0.8180


In [92]:
    unique_labels = np.unique(Y_test)
    print(classification_report(
        Y_test,
        y_pred,
        labels=unique_labels,
        target_names=label_encoders['accepted_option'].inverse_transform(unique_labels)
    ))

                                 precision    recall  f1-score   support

          Extended_payment_plan       0.71      0.59      0.65       377
                      Refinance       0.86      0.93      0.90      1235
Settlement_plans_with_waive_off       0.74      0.67      0.70       388

                       accuracy                           0.82      2000
                      macro avg       0.77      0.73      0.75      2000
                   weighted avg       0.81      0.82      0.81      2000

