In [2]:
# Mount Google Drive at /content/drive; the dataset and output paths used
# later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
pip install numpy pandas scikit-learn scikit-optimize xgboost scipy

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.7.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.7.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.7.0 scikit-optimize-0.10.2


In [11]:
# xgboost_simple_splits.py

import os
import json
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# ----------------------------------------------------------
# Configuration
# ----------------------------------------------------------
# Single seed: applied to NumPy's global RNG here and passed as `random_state`
# to the splits and the model further down.
SEED = 42
np.random.seed(SEED)

# Column roles in the CSV files.
TARGET = "RiskFlag"
ID_COL = "ProfileID"

# Feature columns handed to the numeric branch of the preprocessor.
numeric_features = [
    "ApplicantYears",
    "AnnualEarnings",
    "RequestedSum",
    "TrustMetric",
    "WorkDuration",
    "ActiveAccounts",
    "OfferRate",
    "RepayPeriod",
    "DebtFactor",
]

# Feature columns handed to the categorical branch of the preprocessor.
categorical_features = [
    "QualificationLevel",
    "WorkCategory",
    "RelationshipStatus",
    "FamilyObligation",
    "OwnsProperty",
    "FundUseCase",
    "JointApplicant",
]

# Google Drive locations -- edit these three paths for a different Colab setup.
BASE_OUTPUT_DIR = "/content/drive/MyDrive/ML_Project/DecisionTrees/XGBoost"
train_path = r"/content/drive/MyDrive/ML_Project/Dataset/train_updated.csv"
test_path = r"/content/drive/MyDrive/ML_Project/Dataset/test_updated.csv"

# Make sure the output root exists before anything is written into it.
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)

# ----------------------------------------------------------
# Load data
# ----------------------------------------------------------
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)


# ==========================================================
# PREPROCESSOR
# ==========================================================
def make_preprocessor():
    """Build a new, unfitted ColumnTransformer for the model inputs.

    The transformer has two branches:
      * "num": ``StandardScaler`` applied to the columns in ``numeric_features``.
      * "cat": ``OneHotEncoder`` applied to the columns in
        ``categorical_features``, producing dense output
        (``sparse_output=False``) and configured with
        ``handle_unknown="ignore"`` so categories not seen during fitting do
        not raise at transform time.

    No ``remainder`` is given, so the ColumnTransformer default applies to any
    column not listed in either feature list.

    Returns:
        ColumnTransformer: a fresh instance; the caller is responsible for
        fitting it.
    """
    numeric_step = ("num", StandardScaler(), numeric_features)

    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    categorical_step = ("cat", encoder, categorical_features)

    return ColumnTransformer(transformers=[numeric_step, categorical_step])


# ==========================================================
# SAVE REPORTS & SUBMISSION
# ==========================================================
def save_outputs(model, preproc, out_dir,
                 X_val, y_val, X_test_int, y_test_int, X_final,
                 final_ids=None):
    """Score a fitted model on the held-out splits and write all artefacts.

    Files written into ``out_dir`` (created if it does not exist):
      * ``classification_validation.txt`` - classification report followed by
        the accuracy on the validation split.
      * ``classification_test.txt`` - the same for the internal test split.
      * ``accuracy_summary.txt`` - both accuracies, one per line.
      * ``<basename of out_dir>_XGBoost.csv`` - submission file with the ID
        column and the predicted label cast to ``int``.

    Args:
        model: Fitted estimator exposing ``predict``.
        preproc: Fitted transformer exposing ``transform``; applied to every
            feature frame before prediction.
        out_dir: Directory that receives the files.
        X_val, y_val: Validation features and labels.
        X_test_int, y_test_int: Internal test features and labels.
        X_final: Features of the unlabeled set to predict for the submission.
        final_ids: Optional IDs for the submission rows, in the same order as
            ``X_final``. Defaults to ``test_df[ID_COL]`` (the module-level
            test frame), which is what the original implementation used.

    Returns:
        None. All results are written to disk.
    """
    os.makedirs(out_dir, exist_ok=True)

    # Fall back to the module-level test frame so existing callers that do not
    # pass IDs keep working unchanged.
    if final_ids is None:
        final_ids = test_df[ID_COL]

    # Transform data
    X_val_p = preproc.transform(X_val)
    X_test_p = preproc.transform(X_test_int)
    X_final_p = preproc.transform(X_final)

    # Predictions
    val_pred = model.predict(X_val_p)
    test_pred = model.predict(X_test_p)
    final_pred = model.predict(X_final_p)

    # Accuracies
    val_acc = accuracy_score(y_val, val_pred)
    test_acc = accuracy_score(y_test_int, test_pred)

    def _write_report(filename, label, y_true, y_pred, acc):
        # One report file: per-class metrics followed by the overall accuracy.
        with open(os.path.join(out_dir, filename), "w") as f:
            f.write(classification_report(y_true, y_pred))
            f.write(f"\n{label} Accuracy: {acc}\n")

    _write_report("classification_validation.txt", "Validation",
                  y_val, val_pred, val_acc)
    _write_report("classification_test.txt", "Test",
                  y_test_int, test_pred, test_acc)

    # Accuracy Summary
    with open(os.path.join(out_dir, "accuracy_summary.txt"), "w") as f:
        f.write(f"Validation Accuracy: {val_acc}\n")
        f.write(f"Test Accuracy: {test_acc}\n")

    # Submission CSV. Column headers come from the config constants so they
    # cannot drift away from the column the IDs are read from.
    submission = pd.DataFrame({
        ID_COL: final_ids,
        TARGET: final_pred.astype(int),
    })

    # normpath strips a trailing separator; without it basename() would return
    # "" for "some/dir/" and the file would be named "_XGBoost.csv".
    run_name = os.path.basename(os.path.normpath(out_dir))
    submission.to_csv(os.path.join(out_dir, f"{run_name}_XGBoost.csv"), index=False)


# ==========================================================
# SIMPLE XGBOOST TRAINING
# ==========================================================
def run_simple(out_dir, X_train, y_train, X_val, y_val,
               X_test_int, y_test_int, X_final, scale_pos_wt):
    """Fit the preprocessor and a fixed-hyperparameter XGBoost model, then save results.

    The preprocessor from ``make_preprocessor`` is fitted on ``X_train`` only.
    The classifier is trained on the transformed training matrix, and
    ``save_outputs`` then writes the reports and the submission file into
    ``out_dir`` using that same fitted preprocessor.

    Args:
        out_dir: Directory for this run's outputs (also echoed in the progress
            messages).
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        X_test_int, y_test_int: Internal test features and labels.
        X_final: Features of the unlabeled set used for the submission.
        scale_pos_wt: Value forwarded to XGBoost's ``scale_pos_weight``.

    Returns:
        None.
    """
    transformer = make_preprocessor()
    train_matrix = transformer.fit_transform(X_train)

    # Hyperparameters are fixed for this "simple" baseline; only the class
    # weighting varies between runs.
    xgb_params = {
        "n_estimators": 400,
        "learning_rate": 0.1,
        "max_depth": 6,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "min_child_weight": 1,
        "gamma": 0,
        "random_state": SEED,
        "eval_metric": "logloss",
        "scale_pos_weight": scale_pos_wt,
    }
    classifier = XGBClassifier(**xgb_params)

    print(f"Training simple XGBoost → {out_dir}")
    classifier.fit(train_matrix, y_train)

    save_outputs(
        classifier, transformer, out_dir,
        X_val, y_val, X_test_int, y_test_int, X_final,
    )

    print(f"Completed {out_dir}\n")


# ==========================================================
# MAIN SCRIPT
# ==========================================================
def main():
    """Train and evaluate the simple XGBoost model on four data configurations.

    The labelled data in ``train_df`` is split once into a 10% internal test
    set and a 90% remainder. Two training regimes are then derived from the
    remainder only:

      * "80": the remainder split into a training part and a validation part.
      * "20": a stratified subsample (20% of the full row count) split the
        same way.

    Each regime is run twice: "skewed" (``scale_pos_weight`` = 1) and
    "nonskewed" (``scale_pos_weight`` = negatives / positives of the training
    labels). Every run writes its outputs to ``BASE_OUTPUT_DIR/<mode_name>``.

    Reads the module-level ``train_df`` and ``test_df``. Returns None.
    """

    # -----------------------------
    # Dataset Splits
    # -----------------------------
    # 10% of the labelled data is held out as an internal test set.
    train_full, test_internal = train_test_split(
        train_df, test_size=0.10, stratify=train_df[TARGET], random_state=SEED
    )

    # 0.1111 of the remaining 90% is roughly 10% of the full data.
    train_80, val_10 = train_test_split(
        train_full, test_size=0.1111,
        stratify=train_full[TARGET], random_state=SEED
    )

    # The small subset must be drawn from train_full, NOT from train_df:
    # train_df still contains the rows of test_internal, so sampling from it
    # would let the "20" models train on rows they are later scored on.
    # An absolute row count keeps the subset as large as 20% of the full data,
    # matching the size the float train_size=0.20 on train_df produced.
    n_small = int(0.20 * len(train_df))
    train_20_raw = train_test_split(
        train_full, train_size=n_small,
        stratify=train_full[TARGET], random_state=SEED
    )[0]

    train_20, val_20 = train_test_split(
        train_20_raw, test_size=0.1111,
        stratify=train_20_raw[TARGET], random_state=SEED
    )

    # Prepare test data
    X_test_int = test_internal.drop([TARGET, ID_COL], axis=1)
    y_test_int = test_internal[TARGET].values
    X_final = test_df.drop(ID_COL, axis=1)

    # -----------------------------
    # Modes
    # -----------------------------
    # (output sub-directory, training split, validation split, balancing flag)
    modes = [
        ("80_skewed",      train_80, val_10, None),
        ("80_nonskewed",   train_80, val_10, "balanced"),
        ("20_skewed",      train_20, val_20, None),
        ("20_nonskewed",   train_20, val_20, "balanced")
    ]

    for mode_name, train_split, val_split, balance in modes:

        print("\n==============================")
        print(f"   Running: {mode_name}")
        print("==============================\n")

        X_train = train_split.drop([TARGET, ID_COL], axis=1)
        y_train = train_split[TARGET].values

        X_val = val_split.drop([TARGET, ID_COL], axis=1)
        y_val = val_split[TARGET].values

        # Imbalance handling
        if balance == "balanced":
            # Vectorised counts; the builtin sum() would iterate the array
            # element by element in Python.
            pos = int(np.sum(y_train == 1))
            neg = int(np.sum(y_train == 0))
            # With no positive rows the ratio is undefined; fall back to the
            # unweighted setting instead of dividing by zero.
            spw = neg / pos if pos > 0 else 1
        else:
            spw = 1

        # Output directory INSIDE Google Drive
        out_dir = os.path.join(BASE_OUTPUT_DIR, mode_name)

        run_simple(out_dir, X_train, y_train, X_val, y_val,
                   X_test_int, y_test_int, X_final, spw)

    print("\nAll SIMPLE XGBoost Modes Completed Successfully!\n")


# Entry point: run every training mode defined in main(). Inside a notebook
# cell __name__ is "__main__", so main() executes as soon as this cell runs;
# the guard only keeps it from running if this code is imported as a module.
if __name__ == "__main__":
    main()



   Running: 80_skewed

Training simple XGBoost → /content/drive/MyDrive/ML_Project/DecisionTrees/XGBoost/80_skewed
Completed /content/drive/MyDrive/ML_Project/DecisionTrees/XGBoost/80_skewed


   Running: 80_nonskewed

Training simple XGBoost → /content/drive/MyDrive/ML_Project/DecisionTrees/XGBoost/80_nonskewed
Completed /content/drive/MyDrive/ML_Project/DecisionTrees/XGBoost/80_nonskewed


   Running: 20_skewed

Training simple XGBoost → /content/drive/MyDrive/ML_Project/DecisionTrees/XGBoost/20_skewed
Completed /content/drive/MyDrive/ML_Project/DecisionTrees/XGBoost/20_skewed


   Running: 20_nonskewed

Training simple XGBoost → /content/drive/MyDrive/ML_Project/DecisionTrees/XGBoost/20_nonskewed
Completed /content/drive/MyDrive/ML_Project/DecisionTrees/XGBoost/20_nonskewed


All SIMPLE XGBoost Modes Completed Successfully!

