# ðŸš€ Setup Notebook: Clone GitHub Repository

This notebook clones your team's Kaggle competition repository and sets the working directory for use in Colab.

In [None]:
# ðŸ‘¤ Author: Shared team setup
# ðŸ“¦ Clone the GitHub repository

!git clone https://github.com/remussamoila/Loan-Approval-Prediction---New-York---2025.git
%cd Loan-Approval-Prediction---New-York---2025


In [None]:
# âœ… List folders to confirm structure
import os
os.listdir()


# Loan Approval Prediction Project

## Introduction
In this project, we aim to build a machine learning model to predict loan approvals based on a variety of features from SME enterprises. We will follow a standard machine learning pipeline including data loading, EDA, cleaning, feature engineering, modeling, ensembling, and submission creation.

## Step 1: Load Data

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv('/content/Loan-Approval-Prediction---New-York---2025/data/train.csv', low_memory=False, parse_dates=['ApprovalDate', 'DisbursementDate'])
test = pd.read_csv('/content/Loan-Approval-Prediction---New-York---2025/data/test_nolabel.csv', low_memory=False, parse_dates=['ApprovalDate', 'DisbursementDate'])
submission = pd.read_csv('/content/Loan-Approval-Prediction---New-York---2025/data/sample_submission.csv')

print('Train shape:', train.shape)
print('Test shape:', test.shape)
print('Submission shape:', submission.shape)

  train = pd.read_csv('/content/Loan-Approval-Prediction---New-York---2025/data/train.csv', low_memory=False, parse_dates=['ApprovalDate', 'DisbursementDate'])
  train = pd.read_csv('/content/Loan-Approval-Prediction---New-York---2025/data/train.csv', low_memory=False, parse_dates=['ApprovalDate', 'DisbursementDate'])
  test = pd.read_csv('/content/Loan-Approval-Prediction---New-York---2025/data/test_nolabel.csv', low_memory=False, parse_dates=['ApprovalDate', 'DisbursementDate'])


Train shape: (40385, 21)
Test shape: (7050, 20)
Submission shape: (7050, 2)


  test = pd.read_csv('/content/Loan-Approval-Prediction---New-York---2025/data/test_nolabel.csv', low_memory=False, parse_dates=['ApprovalDate', 'DisbursementDate'])


## Step 2: Handle Missing Values


# Fill missing values


In [10]:
for col in train.columns:
    if train[col].isnull().sum() > 0:
        if train[col].dtype == 'object':
            train[col].fillna(train[col].mode()[0], inplace=True)
            if col in test.columns: test[col].fillna(train[col].mode()[0], inplace=True)
        else:
            train[col].fillna(train[col].median(), inplace=True)
            if col in test.columns: test[col].fillna(train[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  if col in test.columns: test[col].fillna(train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate 

## Step 3: Feature Engineering


In [11]:
for df in [train, test]:
    df['ApprovalYear'] = df['ApprovalDate'].dt.year
    df['ApprovalMonth'] = df['ApprovalDate'].dt.month
    df['ApprovalDay'] = df['ApprovalDate'].dt.day
    df['DisbursementYear'] = df['DisbursementDate'].dt.year
    df['DisbursementMonth'] = df['DisbursementDate'].dt.month
    df['DisbursementDay'] = df['DisbursementDate'].dt.day
    df['JobGrowth'] = df['CreateJob'] - df['RetainedJob']

train.drop(columns=['ApprovalDate', 'DisbursementDate'], inplace=True)
test.drop(columns=['ApprovalDate', 'DisbursementDate'], inplace=True)

## Step 4: Encode Categorical Variables


In [12]:
from sklearn.preprocessing import LabelEncoder
cat_cols = [col for col in train.columns if train[col].dtype == 'object']
for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([train[col], test[col]]).astype(str)
    le.fit(combined)
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))


## Step 5: Model Training and Evaluation


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score

feature_cols = [col for col in train.columns if col not in ['id', 'Accept']]
X = train[feature_cols]
y = train['Accept'].astype(int)
X_test = test[feature_cols]

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=50),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=50)
}

scores = {}
for name, model in models.items():
    score = cross_val_score(model, X, y, cv=3, scoring='f1').mean()
    scores[name] = score
    print(f'{name} mean F1: {score:.3f}')

# Ensemble using VotingClassifer
estimators = [(name, model) for name, model in models.items()]
voting_clf = VotingClassifier(estimators=estimators, voting='soft')
voting_score = cross_val_score(voting_clf, X, y, cv=3, scoring='f1').mean()
print(f'Ensemble Voting Classifier mean F1: {voting_score:.3f}')

voting_clf.fit(X, y)
predictions = voting_clf.predict(X_test)

Logistic Regression mean F1: 0.887
Decision Tree mean F1: 0.853
Random Forest mean F1: 0.907
Gradient Boosting mean F1: 0.905
Ensemble Voting Classifier mean F1: 0.900


## Step 6: Generate Submission


In [14]:
submission['Accept'] = predictions.astype(int)
submission.to_csv('final_submission_ensemble.csv', index=False)