<a href="https://colab.research.google.com/github/remussamoila/Loan-Approval-Prediction---New-York---2025/blob/main/Final_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# 📦 Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Import DecisionTreeClassifier from sklearn.tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE

# 📂 Load the datasets
train = pd.read_csv('train.csv', parse_dates=['ApprovalDate', 'DisbursementDate'])
test = pd.read_csv('test_nolabel.csv', parse_dates=['ApprovalDate', 'DisbursementDate'])
submission = pd.read_csv('sample_submission.csv')

# 💵 Convert currency columns to numeric
for col in ['DisbursementGross', 'BalanceGross']:
    train[col] = train[col].astype(str).str.replace(r'[$,]', '', regex=True).astype(float)
    test[col] = test[col].astype(str).str.replace(r'[$,]', '', regex=True).astype(float)

# 🧼 Handle missing values
for df in [train, test]:
    for col in df.columns:
        if df[col].dtype in [np.float64, np.int64]:
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna(df[col].mode()[0], inplace=True)

# 🚨 Clip outliers in numeric columns
for col in train.select_dtypes(include='number').columns:
    lower, upper = train[col].quantile(0.01), train[col].quantile(0.99)
    train[col] = np.clip(train[col], lower, upper)
    if col in test.columns:
        test[col] = np.clip(test[col], lower, upper)

# 🛠️ Feature engineering: extract date parts and compute job growth
for df in [train, test]:
    if 'ApprovalDate' in df:
        df['ApprovalYear'] = df['ApprovalDate'].dt.year
        df['ApprovalMonth'] = df['ApprovalDate'].dt.month
    if 'DisbursementDate' in df:
        df['DisbursementYear'] = df['DisbursementDate'].dt.year
        df['DisbursementMonth'] = df['DisbursementDate'].dt.month
    df['JobGrowth'] = df.get('CreateJob', 0) - df.get('RetainedJob', 0)
    df.drop(['ApprovalDate', 'DisbursementDate'], axis=1, inplace=True, errors='ignore')

# 🔤 Encode categorical variables
for col in train.select_dtypes(include='object').columns:
    le = LabelEncoder()
    combined = pd.concat([train[col].astype(str), test[col].astype(str)], axis=0)
    le.fit(combined)
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# 🧪 Prepare training and test sets
# These lines define X_train and X_test and were likely skipped in the previous execution.
exclude_cols = ['id', 'Accept', 'LoanNr_ChkDgt', 'Name', 'City', 'Bank']
feature_cols = [col for col in train.columns if col not in exclude_cols]
X_train = train[feature_cols]
y_train = train['Accept'].astype(int)
X_test = test[feature_cols]


# 📏 Scale numeric features
scaler = StandardScaler()

# Determine numeric columns from the training features (excluding the target)
numeric_cols_train = X_train.select_dtypes(include='number').columns
X_train[numeric_cols_train] = scaler.fit_transform(X_train[numeric_cols_train])

# Determine numeric columns from the test features
numeric_cols_test = X_test.select_dtypes(include='number').columns
# Apply the same scaler fitted on the training data to the test features
X_test[numeric_cols_test] = scaler.transform(X_test[numeric_cols_test])

# ⚖️ Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# 🤖 Define and train models
# DecisionTreeClassifier, RandomForestClassifier, and GradientBoostingClassifier were already imported
dt = DecisionTreeClassifier(max_depth=10, class_weight='balanced')
rf = RandomForestClassifier(n_estimators=150, max_depth=10, class_weight='balanced')
gb = GradientBoostingClassifier(n_estimators=150, learning_rate=0.05)

# 🧠 Ensemble with VotingClassifier
ensemble = VotingClassifier(estimators=[('dt', dt), ('rf', rf), ('gb', gb)], voting='soft')
ensemble.fit(X_res, y_res)

# 📤 Generate predictions and save submission
submission['Accept'] = ensemble.predict(X_test).astype(int)
submission.to_csv('final_submission.csv', index=False)

  train = pd.read_csv('train.csv', parse_dates=['ApprovalDate', 'DisbursementDate'])
  train = pd.read_csv('train.csv', parse_dates=['ApprovalDate', 'DisbursementDate'])
  train = pd.read_csv('train.csv', parse_dates=['ApprovalDate', 'DisbursementDate'])
  test = pd.read_csv('test_nolabel.csv', parse_dates=['ApprovalDate', 'DisbursementDate'])
  test = pd.read_csv('test_nolabel.csv', parse_dates=['ApprovalDate', 'DisbursementDate'])
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting