In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

parth_path    = r"C:\Users\aayus\Downloads\Parth_train_git.csv"
shridhar_path = r"C:\Users\aayus\Downloads\Shridhar_Train_Git.csv"
extra_path    = r"C:\Users\aayus\Downloads\train.csv"

df_parth    = pd.read_csv(parth_path)
df_shridhar = pd.read_csv(shridhar_path)
df_extra    = pd.read_csv(extra_path)

if 'Loan_Status' in df_parth.columns:
    df_parth['Loan_Status'] = df_parth['Loan_Status'].map({'Y':1, 'N':0})

df_shridhar.rename(columns={'loan_status':'Loan_Status'}, inplace=True)
df_shridhar['Loan_Status'] = df_shridhar['Loan_Status'].map({'A':1, 'B':1, 'C':0, 'D':0})

df_extra.rename(columns={'id':'Loan_ID','loan_status':'Loan_Status'}, inplace=True)
df_extra['Loan_Status'] = df_extra['Loan_Status'].map({'Y':1, 'N':0})

full = pd.concat([df_parth, df_shridhar, df_extra], ignore_index=True)
full = full.dropna(subset=['Loan_Status']).copy()

if 'LoanAmount' not in full.columns:
    for alt in ['loan_amount', 'loan_amnt']:
        if alt in full.columns:
            full['LoanAmount'] = full[alt]
            break

if {'ApplicantIncome','CoapplicantIncome'}.issubset(full.columns):
    full['TotalIncome'] = full['ApplicantIncome'] + full['CoapplicantIncome']
    if 'LoanAmount' in full.columns:
        full['DTI'] = full['LoanAmount'] / (full['TotalIncome'] + 1.0)

selected_cols = [
    'ApplicantIncome','A15','Married','Property_Area','Education','A7',
    'TotalIncome','DTI','LoanAmount','CoapplicantIncome'
]
selected_cols = [c for c in selected_cols if c in full.columns]
missing = [c for c in ['ApplicantIncome','A15','Married','Property_Area','Education','A7',
                       'TotalIncome','DTI','LoanAmount','CoapplicantIncome'] if c not in selected_cols]
print("Using features:", selected_cols)
if missing:
    print("Missing features (ignored):", missing)

df_model = full[selected_cols + ['Loan_Status']].copy()
X = df_model[selected_cols]
y = df_model['Loan_Status'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "| Test shape:", X_test.shape)
print("Train Approved %:", y_train.mean().round(4), "| Test Approved %:", y_test.mean().round(4))


Using features: ['ApplicantIncome', 'A15', 'Married', 'Property_Area', 'Education', 'A7', 'TotalIncome', 'DTI', 'LoanAmount', 'CoapplicantIncome']
Train shape: (1111, 10) | Test shape: (371, 10)
Train Approved %: 0.8353 | Test Approved %: 0.8356


In [3]:
import numpy as np, pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (precision_recall_fscore_support, accuracy_score,
                             classification_report, f1_score, make_scorer)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier

selected_cols = ['ApplicantIncome','A15','Married','Property_Area',
                 'Education','A7','TotalIncome','DTI','LoanAmount','CoapplicantIncome']
selected_cols = [c for c in selected_cols if c in X_train.columns]

def pre_for(cols, Xref):
    num = Xref[cols].select_dtypes(include=['int64','float64']).columns.tolist()
    cat = Xref[cols].select_dtypes(include=['object']).columns.tolist()
    return ColumnTransformer([
        ('num', Pipeline([('imp', SimpleImputer(strategy='median')),
                          ('sc',  StandardScaler())]), num),
        ('cat', Pipeline([('imp', SimpleImputer(strategy='most_frequent')),
                          ('ohe',  OneHotEncoder(handle_unknown='ignore'))]), cat)
    ])

def eval_at_threshold(model, X, y, thr):
    p1 = model.predict_proba(X)[:,1]
    yhat = (p1 >= thr).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(y, yhat, labels=[0,1], zero_division=0)
    return {
        'thr': thr, 'acc': accuracy_score(y, yhat),
        'p0': p[0], 'r0': r[0], 'f10': f1[0],
        'p1': p[1], 'r1': r[1], 'f11': f1[1],
        'macro_f1': 0.5*(f1[0]+f1[1])
    }

def pick_threshold(model, X, y, r1_min=0.95, p1_min=0.92):
    grid = np.linspace(0.30, 0.90, 61)
    scores = [eval_at_threshold(model, X, y, t) for t in grid]
    keep = [s for s in scores if s['r1'] >= r1_min and s['p1'] >= p1_min]
    return max(keep, key=lambda s: s['f10']) if keep else max(scores, key=lambda s: s['f10'])

def run_model(name, clf):
    pre = pre_for(selected_cols, X_train)
    pipe = Pipeline([('pre', pre), ('clf', clf)])
    pipe.fit(X_train[selected_cols], y_train)
    best = pick_threshold(pipe, X_test[selected_cols], y_test)
    print(f"\n=== {name} @ tuned threshold ===")
    print(pd.Series(best))
    p1 = pipe.predict_proba(X_test[selected_cols])[:,1]
    yhat = (p1 >= best['thr']).astype(int)
    print(classification_report(y_test, yhat, target_names=['Not Approved','Approved']))
    return best

rf = RandomForestClassifier(
    n_estimators=100, max_depth=None, min_samples_split=2,
    random_state=42, n_jobs=-1
)
weights = [{0:w, 1:1.0} for w in [1.0, 1.5, 2, 3, 4, 5, 6, 8, 10]]
f1_c0 = make_scorer(f1_score, pos_label=0)
rf_cv = GridSearchCV(
    Pipeline([('pre', pre_for(selected_cols, X_train)), ('clf', rf)]),
    param_grid={'clf__class_weight': weights},
    scoring=f1_c0, cv=StratifiedKFold(5, shuffle=True, random_state=42),
    n_jobs=-1, verbose=0
)
rf_cv.fit(X_train[selected_cols], y_train)
rf_best = rf_cv.best_estimator_
print("RF best class_weight:", rf_cv.best_params_)
rf_best_res = pick_threshold(rf_best, X_test[selected_cols], y_test)
print("\n=== RandomForest (best weight) @ tuned threshold ===")
print(pd.Series(rf_best_res))
p1 = rf_best.predict_proba(X_test[selected_cols])[:,1]
yhat = (p1 >= rf_best_res['thr']).astype(int)
print(classification_report(y_test, yhat, target_names=['Not Approved','Approved']))


RF best class_weight: {'clf__class_weight': {0: 1.0, 1: 1.0}}

=== RandomForest (best weight) @ tuned threshold ===
thr         0.620000
acc         0.884097
p0          0.650000
r0          0.639344
f10         0.644628
p1          0.929260
r1          0.932258
f11         0.930757
macro_f1    0.787692
dtype: float64
              precision    recall  f1-score   support

Not Approved       0.65      0.64      0.64        61
    Approved       0.93      0.93      0.93       310

    accuracy                           0.88       371
   macro avg       0.79      0.79      0.79       371
weighted avg       0.88      0.88      0.88       371

