<a href="https://colab.research.google.com/github/safwanahmadsaffi/Shell.ai-Hackathon-2025/blob/main/ml-model-training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# Feature engineering function
def add_extra_features(df):
    """Return a copy of ``df`` with row-wise summary statistics appended.

    The statistics are computed over every column except ``ID``, columns whose
    name starts with ``BlendProperty`` and the three derived columns themselves.
    Excluding the derived columns makes the function idempotent: re-running the
    cell (or calling it on an already-augmented frame) yields the same values
    instead of folding the previous sum/mean/std into the new ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame with string column labels.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with three extra columns:
        ``feature_sum``, ``feature_mean`` and ``feature_std``.
    """
    derived = ("feature_sum", "feature_mean", "feature_std")
    features = [
        col for col in df.columns
        if col != "ID" and not col.startswith("BlendProperty") and col not in derived
    ]
    out = df.copy()          # never mutate the caller's frame
    base = out[features]     # list indexing copies, so later assignments cannot leak in
    out["feature_sum"] = base.sum(axis=1)
    out["feature_mean"] = base.mean(axis=1)
    out["feature_std"] = base.std(axis=1)
    return out

# Load the datasets.
# Both files are read relative to the working directory so the cell works
# wherever train.csv is found (in Colab the working directory is /content).
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Train columns: {list(train_df.columns)}")
print(f"Test columns: {list(test_df.columns)}")

# Separate features (X) and target variables (y)
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]
y_train = train_df[target_cols]  # raises KeyError early if a target column is missing
# 'ID' is an identifier, not a feature: drop it from train as well when present,
# so train and test end up with the same feature columns.
X_train = train_df.drop(columns=target_cols + ["ID"], errors="ignore")

# For the test set, drop the 'ID' column
X_test = test_df.drop(columns=["ID"])

# Add extra statistical features to improve model
X_train = add_extra_features(X_train)
X_test = add_extra_features(X_test)

# Initialize models for ensemble
gb_model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)
# NOTE(review): KNN is distance based and the features are not scaled here
# (StandardScaler is imported but unused in this cell) — consider scaling.
knn_model = KNeighborsRegressor(n_neighbors=7)

# Prepare container for predictions: one column per target
predictions = np.zeros((X_test.shape[0], y_train.shape[1]))

# Cross-validation setup (plain shuffled K-fold)
gkf = KFold(n_splits=5, shuffle=True, random_state=42)

for i, col in enumerate(y_train.columns):
    print(f"Training and validating model for {col}")
    cv_scores = []
    for train_idx, val_idx in gkf.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train[col].iloc[train_idx], y_train[col].iloc[val_idx]
        # Score the same GB+KNN average that is submitted below, so the printed
        # CV MAPE describes the model that actually produces the predictions.
        gb_model.fit(X_tr, y_tr)
        knn_model.fit(X_tr, y_tr)
        preds = (gb_model.predict(X_val) + knn_model.predict(X_val)) / 2
        cv_scores.append(mean_absolute_percentage_error(y_val, preds))
    print(f"{col} CV MAPE: {np.mean(cv_scores):.5f}")
    # Train on full data (fit() discards the state left over from the last fold)
    gb_model.fit(X_train, y_train[col])
    knn_model.fit(X_train, y_train[col])
    preds_gb = gb_model.predict(X_test)
    preds_knn = knn_model.predict(X_test)
    predictions[:, i] = (preds_gb + preds_knn) / 2

# Create and save submission.
# The ID column is put first, matching the other submission files written by
# this notebook; without it the predictions cannot be matched back to test rows.
submission_df = pd.DataFrame(predictions, columns=[f"BlendProperty{i}" for i in range(1, 11)])
# to_numpy() sidesteps index alignment: predictions are in test_df row order.
submission_df.insert(0, "ID", test_df["ID"].to_numpy())
submission_df.to_csv("submission.csv", index=False)
print("Ensembled model trained and predictions saved to submission.csv")



In [4]:
# STEP 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error

# STEP 2: Load the datasets
# Relative paths, like the other cells of this notebook (in Colab the working
# directory is /content, so this reads the same files as before).
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# STEP 3: Separate features and targets
# Drop the BlendProperty targets from the train features; 'ID' is an identifier,
# so it is removed from both feature sets (train may not have one).
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]
y_train = train[target_cols]
X_train = train.drop(columns=target_cols + ["ID"], errors="ignore")
X_test = test.drop(columns=["ID"])


# STEP 4: Scale the features (statistics come from the training features only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optional: convert back to DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# STEP 5: Split train data for validation
X_tr, X_val, y_tr, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# STEP 6: Train Random Forest Model (one forest per target via MultiOutputRegressor)
rf_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
rf_model.fit(X_tr, y_tr)

# STEP 7: Validate on validation set
y_val_pred = rf_model.predict(X_val)
mape = mean_absolute_percentage_error(y_val, y_val_pred)
print(f"Validation MAPE: {mape:.4f}")

# STEP 8: Refit on ALL training rows, then predict on test set.
# The validation model above has only seen 80% of the data; the submitted
# predictions should come from a model trained on every labelled row.
rf_model.fit(X_train_scaled, y_train)
y_test_pred = rf_model.predict(X_test_scaled)

# STEP 9: Create submission DataFrame
submission = pd.DataFrame(y_test_pred, columns=target_cols)
submission.insert(0, 'ID', test['ID'])  # Ensure 'ID' is the first column

# STEP 10: Save to CSV
submission.to_csv("random_forest_submission.csv", index=False)
print("✅ Submission file saved as 'random_forest_submission.csv'")

Validation MAPE: 4.2471
✅ Submission file saved as 'random_forest_submission.csv'


In [None]:
# 1. Libraries
import pandas as pd, numpy as np, hashlib, optuna, lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler

# 2. Load
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# 3. Basic split
# Select columns by NAME rather than by position. The test file carries an extra
# 'ID' column, so the positional slice test.iloc[:, :55] kept 'ID' and cut off the
# last feature column, which was then silently replaced by zeros further down.
target_cols  = [c for c in train.columns if c.startswith("BlendProperty")]
feature_cols = [c for c in train.columns if c not in target_cols and c != "ID"]

X_base_train = train[feature_cols].copy()
y_train      = train[target_cols].copy()
# Keep ID for now (it is dropped right before feature engineering); a feature
# missing from test now raises a KeyError here instead of being zero-filled.
X_base_test  = test[["ID"] + feature_cols].copy()


# 4. === Feature Engineering =================================================
def add_features(df):
    """Return a copy of ``df`` extended with fraction-based interaction columns.

    Two families of columns are appended, in this order:

    * ``W_Component{c}_Property{p}`` for c in 1..5 and p in 1..10: the component
      fraction multiplied by that component's property. A component whose
      fraction column is absent contributes no columns at all; a missing
      property column yields a constant-zero column.
    * ``Frac_{i}_{j}`` for every pair i < j in 1..5: the product of the two
      fraction columns, or a constant-zero column when either one is absent.

    The input frame is left untouched.
    """
    out = df.copy()
    present = set(df.columns)

    # --- fraction-weighted component properties
    for comp in range(1, 6):
        frac_name = f"Component{comp}_fraction"
        if frac_name not in present:
            continue  # no fraction -> nothing to weight for this component
        for prop in range(1, 11):
            source = f"Component{comp}_Property{prop}"
            if source in present:
                out[f"W_{source}"] = df[frac_name] * df[source]
            else:
                out[f"W_{source}"] = 0

    # --- pairwise fraction products
    pairs = [(a, b) for a in range(1, 6) for b in range(a + 1, 6)]
    for a, b in pairs:
        left, right = f"Component{a}_fraction", f"Component{b}_fraction"
        both_present = left in present and right in present
        out[f"Frac_{a}_{b}"] = df[left] * df[right] if both_present else 0

    return out

# Drop 'ID' from test features before adding features
X_train = add_features(X_base_train)
X_test  = add_features(X_base_test.drop("ID", axis=1))


# Give X_test exactly the training columns, in training order.
# Columns absent from test are filled with 0, but that is now reported loudly:
# a zero-filled feature means the model predicts on data it never saw in that form.
missing_cols_in_test = set(X_train.columns) - set(X_test.columns)
if missing_cols_in_test:
    print(f"WARNING: {len(missing_cols_in_test)} training column(s) missing from test, "
          f"filled with 0: {sorted(missing_cols_in_test)}")
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


# 5. Scaling numeric columns (LightGBM doesn't need it, but interactions benefit)
# The scaler is fitted on the training features only and reused for test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# Re‑wrap as DataFrame for convenience
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test  = pd.DataFrame(X_test_scaled,  columns=X_test.columns)

# 6. GroupKFold for robust CV  (group by rounded fractions hash)
def hash_frac(row, precision=2):
    """Map a row's five component fractions to an integer group id.

    Only the ``Component1_fraction`` .. ``Component5_fraction`` entries of
    ``row`` are read. They are rounded to ``precision`` decimals, the rounded
    tuple is rendered as text and MD5-hashed, and the digest is reduced modulo
    10,000,000. Rows whose fractions agree after rounding get the same id.
    """
    fraction_names = [f"Component{idx}_fraction" for idx in range(1, 6)]
    rounded = np.round(row[fraction_names], precision)
    digest = hashlib.md5(str(tuple(rounded)).encode()).hexdigest()
    return int(digest, 16) % 10_000_000

# Apply hash_frac to the original X_base_train to get groups
# (one integer id per training row, computed from the unscaled fraction columns;
# the scaled X_train is not used here).
groups = X_base_train.apply(hash_frac, axis=1)


# 5-fold group-aware splitter; it is passed `groups` at split time inside the tuner.
gkf = GroupKFold(n_splits=5)

# 7. LightGBM + Optuna tuner for ONE target; wrap in a function
def tune_and_train(X, y, target_name):
    """Tune LightGBM for one target with Optuna, then fit a final model on all rows.

    Two defects of the earlier version are fixed here:

    * Optuna stores trial parameters under the short names given to
      ``trial.suggest_*`` ("lr", "leaves", ...). Those are not LightGBM parameter
      names, so passing ``study.best_trial.params`` straight to ``lgb.train``
      trained the final model with LightGBM defaults. The short names are now
      translated by ``to_lgb_params``, which is used for both tuning and the
      final fit.
    * The final fit read a ``best_iteration`` trial attribute that was never
      written, so it always used 500 rounds. Each trial now records the mean
      early-stopped iteration over its folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features (indexed positionally with ``.iloc``).
    y : pandas.Series
        Target values aligned with ``X``.
    target_name : str
        Name of the target; accepted for the caller's convenience, not used.

    Returns
    -------
    (lightgbm.Booster, float)
        The model fitted on all of ``X``/``y`` and the best mean CV MAPE found.

    Notes
    -----
    Relies on the module-level ``gkf`` splitter and ``groups`` series.
    """
    fixed_params = {
        "objective": "rmse",
        "metric": "mae",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "bagging_freq": 1,
    }

    def to_lgb_params(suggested):
        """Translate Optuna's short suggestion names into LightGBM parameters."""
        return {
            **fixed_params,
            "learning_rate": suggested["lr"],
            "num_leaves": suggested["leaves"],
            "feature_fraction": suggested["feat_frac"],
            "bagging_fraction": suggested["bag_frac"],
            "min_data_in_leaf": suggested["min_leaf"],
            "lambda_l1": suggested["l1"],
            "lambda_l2": suggested["l2"],
        }

    def objective(trial):
        # Suggestion names and ranges are unchanged, so trial logs look the same.
        suggested = {
            "lr": trial.suggest_float("lr", 0.01, 0.2, log=True),
            "leaves": trial.suggest_int("leaves", 31, 1023, log=True),
            "feat_frac": trial.suggest_float("feat_frac", 0.5, 1.0),
            "bag_frac": trial.suggest_float("bag_frac", 0.5, 1.0),
            "min_leaf": trial.suggest_int("min_leaf", 20, 200),
            "l1": trial.suggest_float("l1", 0.0, 5.0),
            "l2": trial.suggest_float("l2", 0.0, 5.0),
        }
        params = to_lgb_params(suggested)
        mape_scores, best_iterations = [], []
        for train_idx, val_idx in gkf.split(X, y, groups):
            lgb_train = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
            lgb_val   = lgb.Dataset(X.iloc[val_idx],  y.iloc[val_idx])
            model = lgb.train(params, lgb_train,
                              valid_sets=[lgb_val],
                              num_boost_round=500,
                              callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
            # Fall back to the number of trained rounds if no best iteration is set.
            best_iterations.append(model.best_iteration or model.current_iteration())
            preds = model.predict(X.iloc[val_idx])
            mape_scores.append(mean_absolute_percentage_error(y.iloc[val_idx], preds))
        # Remember how many rounds worked for this trial; the final fit reuses it.
        trial.set_user_attr("best_iteration", max(1, int(round(np.mean(best_iterations)))))
        return np.mean(mape_scores)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20, show_progress_bar=False)

    best_trial = study.best_trial
    # No early stopping here (there is no held-out set when training on all
    # rows); use the round count recorded by the best trial, 500 as a fallback.
    final_model = lgb.train(to_lgb_params(best_trial.params), lgb.Dataset(X, y),
                            num_boost_round=best_trial.user_attrs.get("best_iteration", 500))
    return final_model, study.best_value

# 8. Train one model per target
models = {}
val_mapes = {}
for target in y_train.columns:
    model, best_mape = tune_and_train(X_train, y_train[target], target)
    models[target], val_mapes[target] = model, best_mape
    print(f"{target}: CV‑MAPE {best_mape:.4f}")

print(f"\nMean CV‑MAPE over all targets: {np.mean(list(val_mapes.values())):.4f}")

# 9. Predict on test: one column per target, in the order the models were trained
test_preds = pd.DataFrame(
    {name: booster.predict(X_test) for name, booster in models.items()}
)

# 10. Build submission: prediction columns preceded by the test IDs
submission = test_preds.copy()
submission.insert(0, "ID", test["ID"])
submission.to_csv("lightgbm_optuna_submission.csv", index=False)
print("\n✅  Submission saved as 'lightgbm_optuna_submission.csv'")

In [None]:
# Installs Optuna into the kernel's environment. The LightGBM/Optuna cells in this
# notebook import optuna, so this cell has to be run before them on a fresh kernel.
%pip install optuna

In [None]:
#!/usr/bin/env python
# blend_prediction.py
# Shell.ai Hackathon – Fuel‑Blend Properties Prediction
# Author: Safwan (stock‑gpt) – July 2025

import os, hashlib, warnings, argparse
import numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_percentage_error
import lightgbm as lgb
from catboost import CatBoostRegressor
import optuna, joblib, json, random
import sys

warnings.filterwarnings("ignore")
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# ------------------------------------------------------------------------------
# 1. CLI args
# ------------------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument("--train", default="train.csv")
parser.add_argument("--test",  default="test.csv")
parser.add_argument("--out",   default="submission.csv")
parser.add_argument("--trials", type=int, default=60, help="Optuna trials per target")
parser.add_argument("--lgb_weight", type=float, default=0.6, help="Blend weight for LightGBM")

# Inside a notebook kernel sys.argv holds the kernel launcher's own arguments, not
# ours. Some front ends pass them with a "--" prefix, which the plain "--" check
# would hand to parse_args() and abort the cell with "unrecognized arguments".
# So: in a kernel, or when no "--" option was given, use the parser defaults.
# parse_args([]) takes them from the parser itself, so the defaults are declared
# in exactly one place.
cli_args = sys.argv[1:]
in_notebook_kernel = "ipykernel" in sys.modules
if in_notebook_kernel or not any(arg.startswith('--') for arg in cli_args):
    args = parser.parse_args([])
else:
    args = parser.parse_args()


# ------------------------------------------------------------------------------
# 2. Load
# ------------------------------------------------------------------------------
train = pd.read_csv(args.train)
test  = pd.read_csv(args.test)

# Select columns by NAME rather than by position. The test file carries an 'ID'
# column (it is read back for the submission), so the positional slice
# test.iloc[:, :55] pulled 'ID' in as a feature and cut off the last real feature,
# which the column alignment further down then replaced with zeros.
target_cols  = [c for c in train.columns if c.startswith("BlendProperty")]
feature_cols = [c for c in train.columns if c not in target_cols and c != "ID"]

X_base_train = train[feature_cols].copy()
y_train      = train[target_cols].copy()
# Raises KeyError if test lacks a training feature, instead of zero-filling it.
X_base_test  = test[feature_cols].copy()

# ------------------------------------------------------------------------------
# 3. Feature engineering
# ------------------------------------------------------------------------------
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with fraction-weighted and pairwise-fraction columns.

    * ``W_Comp{c}_Prop{p}`` (c in 1..5, p in 1..10): the component fraction times
      the component's property. Components without a fraction column are skipped;
      a missing property column is treated as 0.
    * ``Frac_{i}_{j}`` (i < j in 1..5): product of the two fraction columns, with
      a missing fraction treated as 0.

    The input frame is not modified.
    """
    out = df.copy()

    # fraction-weighted component properties
    for comp in range(1, 6):
        fraction_name = f"Component{comp}_fraction"
        if fraction_name in df:
            fraction = df[fraction_name]
            for prop in range(1, 11):
                prop_values = df.get(f"Component{comp}_Property{prop}", 0)
                out[f"W_Comp{comp}_Prop{prop}"] = fraction * prop_values

    # products of every unordered pair of fractions
    for i, j in ((a, b) for a in range(1, 6) for b in range(a + 1, 6)):
        first = df.get(f"Component{i}_fraction", 0)
        second = df.get(f"Component{j}_fraction", 0)
        out[f"Frac_{i}_{j}"] = first * second

    return out

X_train = add_features(X_base_train)
X_test  = add_features(X_base_test)

# ensure same columns, in the same order, as the training frame.
# Columns absent from test are still filled with 0, but this is now reported:
# a zero-filled feature means the model predicts on data it never saw in that form.
missing = set(X_train.columns) - set(X_test.columns)
if missing:
    print(f"WARNING: {len(missing)} training column(s) missing from test, "
          f"filled with 0: {sorted(missing)}")
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# ------------------------------------------------------------------------------
# 4. Scaling
# ------------------------------------------------------------------------------
# The scaler is fitted on the training features only and reused for test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test  = pd.DataFrame(X_test_scaled,  columns=X_test.columns)

# ------------------------------------------------------------------------------
# 5. GroupKFold groups (hash of fractions)
# ------------------------------------------------------------------------------
def hash_frac(row, prec=2):
    """Map a row's five component fractions to an integer group id.

    The ``Component1_fraction`` .. ``Component5_fraction`` entries of ``row``
    are rounded to ``prec`` decimals; the rounded tuple is rendered as text,
    MD5-hashed and reduced modulo 10,000,000. Rows whose fractions agree after
    rounding receive the same id.
    """
    fraction_names = [f"Component{idx}_fraction" for idx in range(1, 6)]
    rounded = np.round(row[fraction_names], prec)
    digest = hashlib.md5(str(tuple(rounded)).encode()).hexdigest()
    return int(digest, 16) % 10_000_000

# One integer group id per training row, from the unscaled fraction columns.
groups = X_base_train.apply(hash_frac, axis=1)
gkf = GroupKFold(n_splits=5)

# ------------------------------------------------------------------------------
# 6. LightGBM + Optuna tuning (bag 5 folds)
# ------------------------------------------------------------------------------
# GPU is considered only when CUDA_VISIBLE_DEVICES is set to a non-empty value.
# NOTE(review): get_param("task_type") is queried on a default-constructed
# CatBoostRegressor; that presumably returns None when task_type was never set,
# which would make this check always False — confirm before relying on it.
has_gpu = bool(os.getenv("CUDA_VISIBLE_DEVICES")) and CatBoostRegressor().get_param("task_type") == "GPU"
lgb_device = "cpu" if not has_gpu else "gpu"
print(f"🔧 Using LightGBM on {lgb_device.upper()}")


# Per-target containers: a list of fold models and the mean CV MAPE.
lgb_models = {target_name: [] for target_name in y_train.columns}
val_mapes  = dict()
# Single in-memory Optuna storage shared by all per-target studies.
study_db   = optuna.storages.InMemoryStorage()

def objective_factory(X, y):
    """Build the Optuna objective for one target.

    The returned callable samples LightGBM hyper-parameters, trains one
    early-stopped model per ``gkf`` fold (grouped by the module-level ``groups``)
    and returns the mean validation MAPE. It also stores the mean early-stopped
    round count on the trial as the ``best_iteration`` user attribute; the tuning
    loop reads that attribute, and without it every fold model silently fell back
    to a fixed number of rounds.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features (indexed positionally with ``.iloc``).
    y : pandas.Series
        Target values aligned with ``X``.

    Returns
    -------
    callable
        ``obj(trial) -> float`` suitable for ``study.optimize``.
    """
    def obj(trial):
        params = {
            "objective": "rmse",
            "metric": "mae",
            "verbosity": -1,
            "boosting_type": "gbdt",
            "device": lgb_device, # Use the determined device
            "learning_rate": trial.suggest_float("lr", 0.01, 0.2, log=True),
            "num_leaves": trial.suggest_int("leaves", 31, 1023, log=True),
            "feature_fraction": trial.suggest_float("feat_frac", 0.5, 1.0),
            "bagging_fraction": trial.suggest_float("bag_frac", 0.5, 1.0),
            "bagging_freq": 1,
            "min_data_in_leaf": trial.suggest_int("min_leaf", 20, 200),
            "lambda_l1": trial.suggest_float("l1", 0.0, 5.0),
            "lambda_l2": trial.suggest_float("l2", 0.0, 5.0),
            "seed": SEED,
        }
        mape_scores, best_iterations = [], []
        for tr, vl in gkf.split(X, y, groups):
            m = lgb.train(params, lgb.Dataset(X.iloc[tr], y.iloc[tr]),
                          num_boost_round=2000,
                          valid_sets=[lgb.Dataset(X.iloc[vl], y.iloc[vl])],
                          callbacks=[lgb.early_stopping(100, verbose=False)])
            # Fall back to the number of trained rounds if no best iteration is set.
            best_iterations.append(m.best_iteration or m.current_iteration())
            pr = m.predict(X.iloc[vl])
            mape_scores.append(mean_absolute_percentage_error(y.iloc[vl], pr))
        # Recorded so the caller can retrain with a round count that suits this trial.
        trial.set_user_attr("best_iteration", max(1, int(round(np.mean(best_iterations)))))
        return np.mean(mape_scores)
    return obj

# Optuna stores trial parameters under the short names passed to trial.suggest_*.
# LightGBM does not know those names, so they must be translated; otherwise the
# tuned values are ignored and the fold models train with LightGBM defaults.
OPTUNA_TO_LGB = {
    "lr": "learning_rate",
    "leaves": "num_leaves",
    "feat_frac": "feature_fraction",
    "bag_frac": "bagging_fraction",
    "min_leaf": "min_data_in_leaf",
    "l1": "lambda_l1",
    "l2": "lambda_l2",
}

for tgt in y_train.columns:
    print(f"\n🔎 Tuning LightGBM for {tgt} …")
    study = optuna.create_study(direction="minimize", storage=study_db, sampler=optuna.samplers.TPESampler(seed=SEED))
    study.optimize(objective_factory(X_train, y_train[tgt]), n_trials=args.trials, show_progress_bar=False)
    best = {OPTUNA_TO_LGB.get(name, name): value for name, value in study.best_trial.params.items()}
    # Re-add the fixed settings used during tuning (they are not trial parameters).
    best.update({"objective":"rmse","metric":"mae","verbosity":-1,"boosting_type":"gbdt",
                 "bagging_freq":1,"device":lgb_device,"seed":SEED}) # Use the determined device
    fold_mapes=[]
    for fold,(tr,vl) in enumerate(gkf.split(X_train, y_train[tgt], groups)):
        # Round count comes from the best trial when it recorded one, else 1000.
        mdl = lgb.train(best, lgb.Dataset(X_train.iloc[tr], y_train[tgt].iloc[tr]),
                        num_boost_round=study.best_trial.user_attrs.get("best_iteration", 1000))
        lgb_models[tgt].append(mdl)
        pr = mdl.predict(X_train.iloc[vl])
        fold_mapes.append(mean_absolute_percentage_error(y_train[tgt].iloc[vl], pr))
    val_mapes[tgt] = np.mean(fold_mapes)
    print(f"📊 {tgt} CV‑MAPE {val_mapes[tgt]:.4f}")

print(f"\n📈 Mean CV‑MAPE: {np.mean(list(val_mapes.values())):.4f}")

# ------------------------------------------------------------------------------
# 7. CatBoost quick model (no tuning, GPU if available)
# ------------------------------------------------------------------------------
# One untuned CatBoost model per target, fitted on the full training set.
cat_models = dict()
# NOTE(review): get_param("task_type") on a default-constructed regressor
# presumably returns None, which would make has_gpu always False — confirm.
has_gpu = bool(os.getenv("CUDA_VISIBLE_DEVICES")) and CatBoostRegressor().get_param("task_type") == "GPU"
cat_params = {
    "iterations": 1200,
    "depth": 8,
    "learning_rate": 0.05,
    "loss_function": "MAE",
    "task_type": "GPU" if has_gpu else "CPU",
    "verbose": False,
    "random_seed": SEED,
}

print(f"\n🚂 Training CatBoost ({'GPU' if has_gpu else 'CPU'}) …")
for tgt in y_train.columns:
    cat = CatBoostRegressor(**cat_params)
    cat.fit(X_train, y_train[tgt])
    cat_models[tgt] = cat

# ------------------------------------------------------------------------------
# 8. Predict & blend
# ------------------------------------------------------------------------------
# LightGBM: average the test predictions of the fold models of each target.
preds_lgb = {
    tgt: np.mean([booster.predict(X_test) for booster in fold_models], axis=0)
    for tgt, fold_models in lgb_models.items()
}

# CatBoost: a single model per target.
preds_cat = {}
for tgt, mdl in cat_models.items():
    preds_cat[tgt] = mdl.predict(X_test)

# Weighted blend: alpha for LightGBM, (1 - alpha) for CatBoost.
alpha = args.lgb_weight
blend_preds = {}
for tgt in y_train.columns:
    blend_preds[tgt] = alpha*preds_lgb[tgt] + (1-alpha)*preds_cat[tgt]

# Submission: test IDs first, then one column per target.
submission = pd.DataFrame(blend_preds)
submission.insert(0, "ID", test["ID"])
submission.to_csv(args.out, index=False)

print(f"\n✅ Submission saved to “{args.out}”")
print("    You can now upload it to the leaderboard.\n")

[I 2025-07-07 07:33:35,683] A new study created in memory with name: no-name-370ac956-1970-44ab-ba21-8b2ad71baf96


🔧 Using LightGBM on CPU

🔎 Tuning LightGBM for BlendProperty1 …


[I 2025-07-07 07:34:04,545] Trial 0 finished with value: 1.9369180722543067 and parameters: {'lr': 0.030710573677773714, 'leaves': 861, 'feat_frac': 0.8659969709057025, 'bag_frac': 0.7993292420985183, 'min_leaf': 48, 'l1': 0.7799726016810132, 'l2': 0.2904180608409973}. Best is trial 0 with value: 1.9369180722543067.
[I 2025-07-07 07:34:08,232] Trial 1 finished with value: 8.452601442474162 and parameters: {'lr': 0.13394334706750485, 'leaves': 252, 'feat_frac': 0.8540362888980227, 'bag_frac': 0.5102922471479012, 'min_leaf': 195, 'l1': 4.162213204002109, 'l2': 1.0616955533913808}. Best is trial 0 with value: 1.9369180722543067.
[I 2025-07-07 07:34:34,978] Trial 2 finished with value: 1.5734134665240262 and parameters: {'lr': 0.017240892195821537, 'leaves': 58, 'feat_frac': 0.6521211214797689, 'bag_frac': 0.762378215816119, 'min_leaf': 98, 'l1': 1.4561457009902097, 'l2': 3.0592644736118975}. Best is trial 2 with value: 1.5734134665240262.
[I 2025-07-07 07:34:53,123] Trial 3 finished with 

In [17]:
# Installs CatBoost into the kernel's environment. The blending cell imports
# catboost, so this cell has to be run before it on a fresh kernel.
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
