In [1]:
# Mount Google Drive at /content/drive; the dataset and output paths used by the
# training cell all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install jupyter


Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting jupyterlab (from jupyter)
  Downloading jupyterlab-4.5.0-py3-none-any.whl.metadata (16 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->jupyter)
  Downloading async_lru-2.0.5-py3-none-any.whl.metadata (4.5 kB)
Collecting jupyter-lsp>=2.0.0 (from jupyterlab->jupyter)
  Downloading jupyter_lsp-2.3.0-py3-none-any.whl.metadata (1.8 kB)
Collecting jupyterlab-server<3,>=2.28.0 (from jupyterlab->jupyter)
  Downloading jupyterlab_server-2.28.0-py3-none-any.whl.metadata (5.9 kB)
Collecting jedi>=0.16 (from ipython>=7.23.1->ipykernel->jupyter)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting json5>=0.9.0 (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter)
  Downloading json5-0.12.1-py3-none-any.whl.metadata (36 kB)
Downloading jupyter-1.1.1-py2.py3-none-any.whl (2.7 kB)
Downloading jupyterlab-4.5.0-py3-none-any.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
pip install optuna


Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [8]:
# random_forest_simple.py

import os
import json
import time
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


# =========================================================
# CONFIG
# =========================================================

# Single seed for the whole run: applied to NumPy's and Python's global RNGs here,
# and passed explicitly as random_state to the splits and the forest below.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# ---- CHANGE THIS FOR GOOGLE COLAB ----
# Root folder for every artefact; each mode writes into its own sub-folder of it.
# The folder is created immediately, as a side effect of running this cell.
BASE_OUTPUT_DIR = "/content/drive/MyDrive/ML_Project/DecisionTrees/RandomForest"
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
# --------------------------------------

# Input CSV locations on the mounted Drive.
train_path = r"/content/drive/MyDrive/ML_Project/Dataset/train_updated.csv"
test_path  = r"/content/drive/MyDrive/ML_Project/Dataset/test_updated.csv"

# Both files are read as soon as the cell runs; the functions below access
# train_df / test_df as module-level globals rather than receiving them as arguments.
train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)

# TARGET is the label column of train_df; ID_COL is the identifier column that is
# dropped before modelling and written back into the submission file.
# NOTE(review): test_df is presumably unlabeled (only ID_COL is dropped from it) — confirm.
TARGET = "RiskFlag"
ID_COL = "ProfileID"


# Columns routed to StandardScaler by make_preprocessor().
numeric_features = [
    "ApplicantYears","AnnualEarnings","RequestedSum","TrustMetric","WorkDuration",
    "ActiveAccounts","OfferRate","RepayPeriod","DebtFactor"
]

# Columns routed to OneHotEncoder by make_preprocessor().
categorical_features = [
    "QualificationLevel","WorkCategory","RelationshipStatus","FamilyObligation",
    "OwnsProperty","FundUseCase","JointApplicant"
]

# =========================================================
# PREPROCESSOR
# =========================================================
def make_preprocessor():
    """Build a new, unfitted ColumnTransformer for the feature columns.

    The columns in ``numeric_features`` are standardised with ``StandardScaler``;
    the columns in ``categorical_features`` are one-hot encoded into a dense
    array, with categories unseen during fitting ignored at transform time.

    Returns:
        ColumnTransformer: an unfitted transformer; the caller is responsible
        for fitting it on the training frame.
    """
    scaler = StandardScaler()
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

    column_steps = [
        ("num", scaler, numeric_features),
        ("cat", encoder, categorical_features),
    ]
    return ColumnTransformer(transformers=column_steps)


# =========================================================
# SAVE OUTPUTS
# =========================================================
def save_outputs(out_dir, model, preproc,
                 X_val_df, y_val, X_test_int_df, y_test_int, X_final_df):
    """Score a fitted model and write its reports and submission file to ``out_dir``.

    Args:
        out_dir: Destination folder; created if it does not exist.
        model: Fitted classifier exposing ``predict``.
        preproc: Fitted preprocessor exposing ``transform``.
        X_val_df, y_val: Validation features (raw frame) and labels.
        X_test_int_df, y_test_int: Internal hold-out features (raw frame) and labels.
        X_final_df: Raw features to predict for the submission file.

    Files written into ``out_dir``:
        classification_validation.txt, classification_test.txt,
        accuracy_summary.txt and ``<basename(out_dir)>_RandomForest.csv``.

    Note:
        The submission's ID column is read from the module-level ``test_df``,
        so ``X_final_df`` is expected to hold the rows of ``test_df`` in the same order.
    """
    os.makedirs(out_dir, exist_ok=True)

    def _predict(raw_frame):
        # Push a raw feature frame through the fitted preprocessor, then the model.
        return model.predict(preproc.transform(raw_frame))

    # Every prediction is produced before the first file is opened.
    val_pred = _predict(X_val_df)
    test_pred = _predict(X_test_int_df)
    final_pred = _predict(X_final_df)

    val_acc = accuracy_score(y_val, val_pred)
    test_acc = accuracy_score(y_test_int, test_pred)

    # Validation report
    validation_report_path = os.path.join(out_dir, "classification_validation.txt")
    with open(validation_report_path, "w") as report_file:
        report_file.write(classification_report(y_val, val_pred))
        report_file.write(f"\nValidation Accuracy: {val_acc}\n")

    # Test report
    test_report_path = os.path.join(out_dir, "classification_test.txt")
    with open(test_report_path, "w") as report_file:
        report_file.write(classification_report(y_test_int, test_pred))
        report_file.write(f"\nTest Accuracy: {test_acc}\n")

    # Summary
    summary_path = os.path.join(out_dir, "accuracy_summary.txt")
    with open(summary_path, "w") as summary_file:
        summary_file.write(f"Validation Accuracy: {val_acc}\n")
        summary_file.write(f"Test Accuracy: {test_acc}\n")

    # Submission CSV, named after the mode folder.
    submission_path = os.path.join(
        out_dir, f"{os.path.basename(out_dir)}_RandomForest.csv"
    )
    submission_columns = {
        "ProfileID": test_df[ID_COL],
        "RiskFlag": final_pred.astype(int),
    }
    pd.DataFrame(submission_columns).to_csv(submission_path, index=False)


# =========================================================
# RUN SIMPLE RF
# =========================================================
def run_rf(out_dir, X_train_df, y_train, X_val_df, y_val,
           X_test_int_df, y_test_int, X_final_df, class_weight):
    """Fit one Random Forest on the given training split and persist its results.

    A fresh preprocessor is fitted on the training frame only, the forest is
    trained on the transformed matrix, and ``save_outputs`` then writes the
    reports and the submission file.

    Args:
        out_dir: Folder that receives this run's reports and submission file.
        X_train_df, y_train: Training features (raw frame) and labels.
        X_val_df, y_val: Validation features and labels.
        X_test_int_df, y_test_int: Internal hold-out features and labels.
        X_final_df: Raw features to predict for the submission.
        class_weight: Forwarded unchanged to ``RandomForestClassifier``.
    """
    preproc = make_preprocessor()
    train_matrix = preproc.fit_transform(X_train_df)

    # Fixed hyperparameters for the "simple" forest; only class_weight varies per mode.
    forest_params = dict(
        n_estimators=400,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight=class_weight,
        random_state=SEED,
        n_jobs=-1,
    )
    model = RandomForestClassifier(**forest_params)

    print(f"Training Random Forest → {out_dir}")
    model.fit(train_matrix, y_train)

    save_outputs(
        out_dir, model, preproc,
        X_val_df=X_val_df, y_val=y_val,
        X_test_int_df=X_test_int_df, y_test_int=y_test_int,
        X_final_df=X_final_df,
    )

    print(f"Completed: {out_dir}")


# =========================================================
# MAIN SCRIPT
# =========================================================
def main():
    """Train and evaluate the simple Random Forest in all four modes.

    The labelled data in ``train_df`` is split once into an internal hold-out
    test set (10%) and a development pool (90%). Two training regimes are then
    derived from the development pool only:

    * "80": the pool split into roughly 80% train / 10% validation of the full data.
    * "20": a stratified subset sized at 20% of the full data, itself split
      into train / validation with the same ratio.

    Each regime is run twice: with ``class_weight=None`` ("skewed") and with
    ``class_weight="balanced"`` ("nonskewed"). Results for every mode are
    written to ``BASE_OUTPUT_DIR/<mode_name>`` by ``run_rf``.
    """
    print("\nRunning SIMPLE Random Forest for all 4 modes...\n")

    # -----------------------------------------------------
    # SPLITS
    # -----------------------------------------------------
    train_full, test_internal = train_test_split(
        train_df, test_size=0.10, stratify=train_df[TARGET], random_state=SEED
    )

    # 0.1111 of the 90% pool is about 10% of the full data.
    train_80, val_10 = train_test_split(
        train_full, test_size=0.1111,
        stratify=train_full[TARGET], random_state=SEED
    )

    # 20%: sampled from train_full (never from train_df) so that no row of the
    # internal hold-out set can end up in the training or validation data.
    # An absolute row count keeps the subset at 20% of the full labelled data.
    n_rows_20 = int(0.20 * len(train_df))
    train_20_raw = train_test_split(
        train_full, train_size=n_rows_20,
        stratify=train_full[TARGET], random_state=SEED
    )[0]

    train_20, val_20 = train_test_split(
        train_20_raw, test_size=0.1111,
        stratify=train_20_raw[TARGET], random_state=SEED
    )

    X_test_int_df = test_internal.drop([TARGET, ID_COL], axis=1)
    y_test_int = test_internal[TARGET].values

    # Only the ID column is dropped from the file used for the submission.
    X_final_df = test_df.drop(ID_COL, axis=1)

    # -----------------------------------------------------
    # MODES
    # -----------------------------------------------------
    # (mode name, training split, validation split, imbalance handling)
    modes = [
        ("80_skewed",    train_80, val_10, None),
        ("80_nonskewed", train_80, val_10, "balanced"),
        ("20_skewed",    train_20, val_20, None),
        ("20_nonskewed", train_20, val_20, "balanced"),
    ]

    for mode_name, train_split, val_split, balance in modes:

        print(f"\n==============================")
        print(f"   Running Mode: {mode_name}")
        print(f"==============================\n")

        X_train_df = train_split.drop([TARGET, ID_COL], axis=1)
        y_train = train_split[TARGET].values

        X_val_df = val_split.drop([TARGET, ID_COL], axis=1)
        y_val = val_split[TARGET].values

        # Imbalance handling: "nonskewed" modes use the forest's built-in
        # "balanced" class weighting; "skewed" modes train unweighted.
        class_weight = "balanced" if balance == "balanced" else None

        out_dir = os.path.join(BASE_OUTPUT_DIR, mode_name)

        run_rf(out_dir, X_train_df, y_train, X_val_df, y_val,
               X_test_int_df, y_test_int, X_final_df, class_weight)

    print("\nAll Random Forest Modes Completed Successfully!\n")


# Run everything: launch all four training modes when this cell/script is
# executed as the main module (no-op when the code is imported).
if __name__ == "__main__":
    main()



Running SIMPLE Random Forest for all 4 modes...


   Running Mode: 80_skewed

Training Random Forest → /content/drive/MyDrive/ML_Project/DecisionTrees/RandomForest/80_skewed
Completed: /content/drive/MyDrive/ML_Project/DecisionTrees/RandomForest/80_skewed

   Running Mode: 80_nonskewed

Training Random Forest → /content/drive/MyDrive/ML_Project/DecisionTrees/RandomForest/80_nonskewed
Completed: /content/drive/MyDrive/ML_Project/DecisionTrees/RandomForest/80_nonskewed

   Running Mode: 20_skewed

Training Random Forest → /content/drive/MyDrive/ML_Project/DecisionTrees/RandomForest/20_skewed
Completed: /content/drive/MyDrive/ML_Project/DecisionTrees/RandomForest/20_skewed

   Running Mode: 20_nonskewed

Training Random Forest → /content/drive/MyDrive/ML_Project/DecisionTrees/RandomForest/20_nonskewed
Completed: /content/drive/MyDrive/ML_Project/DecisionTrees/RandomForest/20_nonskewed

All Random Forest Modes Completed Successfully!

