In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb

# Read the training and test tables from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Columns whose name starts with "BlendProperty" are the regression targets;
# everything else (minus an optional ID column) is used as input features.
blend_cols = [name for name in train.columns if name.startswith("BlendProperty")]
y = train[blend_cols]
X = train.drop(columns=blend_cols + ['ID'], errors='ignore')

# Keep the test identifiers for the submission file. When the test table has
# no ID column, a positional 0..n-1 range is used instead.
test_has_id = 'ID' in test.columns
test_ids = test['ID'] if test_has_id else np.arange(len(test))
X_test = test.drop(columns=['ID']) if test_has_id else test.copy()

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base models. Each single-output learner is wrapped in MultiOutputRegressor
# so that all target columns in y_train can be fitted through one object.
rf = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
)
lgbm = MultiOutputRegressor(
    lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        random_state=42,
        n_jobs=-1,
    )
)
xgbm = MultiOutputRegressor(
    xgb.XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )
)

# Fit all base models first (in definition order) ...
for model in (rf, lgbm, xgbm):
    model.fit(X_train, y_train)

# ... then report the RMSE of each one on the hold-out split, computed over
# all targets at once.
labelled_models = {"RandomForest": rf, "LightGBM": lgbm, "XGBoost": xgbm}
for name, model in labelled_models.items():
    val_preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f"{name} Validation RMSE: {rmse:.4f}")

# Stacking ensemble. StackingRegressor works with single-output models, so the
# complete stack is wrapped in MultiOutputRegressor.
base_learners = [
    (
        "rf",
        RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
    ),
    (
        "lgb",
        lgb.LGBMRegressor(
            n_estimators=500,
            learning_rate=0.05,
            random_state=42,
            n_jobs=-1,
        ),
    ),
    (
        "xgb",
        xgb.XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
        ),
    ),
]

# Meta-learner that combines the base learners' predictions.
final_estimator = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
)

stack = MultiOutputRegressor(
    StackingRegressor(
        estimators=base_learners,
        final_estimator=final_estimator,
        n_jobs=-1,
    )
)

# Fit on the training split and score on the hold-out split.
stack.fit(X_train, y_train)
val_preds = stack.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f"Stacking Ensemble Validation RMSE: {rmse:.4f}")

# Refit the stacking ensemble on the full training data (train + validation
# rows) before predicting the test set.
stack.fit(X, y)

# Predict on test
preds = stack.predict(X_test)

# Save submission.
# The prediction columns come out in the order of y's columns, so the
# submission is labelled from y.columns rather than from a hard-coded
# "BlendProperty1".."BlendProperty10" list. That keeps the labels correct if
# the number or order of target columns in train.csv ever differs.
submission = pd.DataFrame(preds, columns=list(y.columns))
submission.insert(0, "ID", test_ids)
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001574 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001394 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.004643
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001507 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start t

In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV

# ======================
# 1. Load Data
# ======================
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Targets: every column whose name contains "BlendProperty".
is_target = train.columns.str.contains("BlendProperty", regex=False)
target_cols = train.columns[is_target].tolist()
y = train[target_cols]

# Features: everything except the targets and the bookkeeping columns.
# errors="ignore" skips "ID" / "Selection" when a table does not have them.
drop_cols = target_cols + ["ID", "Selection"]  # add any extra if needed
X = train.drop(columns=drop_cols, errors="ignore")
X_test = test.drop(columns=["ID", "Selection"], errors="ignore")

# ==========================
# 2. Feature Engineering
# ==========================
# Interaction features: every fraction column multiplied by every component
# property column, named "<fraction>_x_<property>".
fractions = [c for c in X.columns if "fraction" in c.lower()]
comp_props = [c for c in X.columns if "Component" in c and "fraction" not in c.lower()]


def _fraction_property_products(frame):
    """Return the fraction x property product columns computed from `frame`.

    Uses the module-level `fractions` and `comp_props` column lists. The
    result shares `frame`'s index and has one column per (fraction, property)
    pair; it is empty (no columns) when either list is empty.
    """
    products = {
        f"{frac}_x_{prop}": frame[frac] * frame[prop]
        for frac in fractions
        for prop in comp_props
    }
    return pd.DataFrame(products, index=frame.index)


# Build all new columns first and attach them with a single concat. Inserting
# them one at a time (X[new_col] = ...) fragments the DataFrame and makes
# pandas warn on the assignments once many columns have been added.
X = pd.concat([X, _fraction_property_products(X)], axis=1)
X_test = pd.concat([X_test, _fraction_property_products(X_test)], axis=1)

# Align columns (safety check): same features, same order as the training frame.
X_test = X_test[X.columns]

# ==========================
# 3. Cross-Validation Setup
# ==========================
# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Base learners, keyed by the name they carry inside the stack.
models = dict(
    rf=RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1),
    gbr=GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        random_state=42,
    ),
)

# Stack the base learners under a RidgeCV meta-learner; the (name, estimator)
# pairs are taken from `models` in insertion order: rf, then gbr.
stack_model = StackingRegressor(
    estimators=list(models.items()),
    final_estimator=RidgeCV(),
    n_jobs=-1,
)

# ==========================
# 4. Training & Validation
# ==========================
# oof   : out-of-fold predictions, shaped like y (rows x targets).
# preds : test-set predictions averaged over the folds (test rows x targets).
oof = np.zeros(y.shape)
preds = np.zeros((X_test.shape[0], y.shape[1]))

# One stacked model is trained per target column, each with K-fold CV.
for i, target in enumerate(target_cols):
    print(f"\nTraining for target: {target}")
    y_target = y[target].values
    oof_target = np.zeros(len(X))
    preds_target = np.zeros(len(X_test))

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y_target[train_idx], y_target[val_idx]

        # The single stack_model object is refitted for every fold/target.
        model = stack_model.fit(X_train, y_train)
        val_pred = model.predict(X_val)
        oof_target[val_idx] = val_pred
        # Accumulate this fold's share of the fold-averaged test prediction.
        preds_target += model.predict(X_test) / kf.n_splits

        # RMSE as sqrt(MSE): the `squared=False` keyword of
        # mean_squared_error is deprecated and absent from newer
        # scikit-learn releases, while this form works on every version.
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        r2 = r2_score(y_val, val_pred)
        print(f"  Fold RMSE: {rmse:.4f}, R2: {r2:.4f}")

    oof[:, i] = oof_target
    preds[:, i] = preds_target

    # Score the complete out-of-fold vector for this target.
    overall_rmse = np.sqrt(mean_squared_error(y_target, oof_target))
    overall_r2 = r2_score(y_target, oof_target)
    print(f"Overall {target} RMSE: {overall_rmse:.4f}, R2: {overall_r2:.4f}")

# ==========================
# 5. Save Predictions
# ==========================
sub = pd.DataFrame(preds, columns=target_cols)

# Keep test IDs for submission. Feature selection earlier in this cell treats
# the "ID" column as optional, so do the same here: fall back to a positional
# 0..n-1 range instead of failing with a KeyError after training has finished.
sub_ids = test["ID"] if "ID" in test.columns else np.arange(len(test))
sub.insert(0, "ID", sub_ids)
sub.to_csv("submission.csv", index=False)

print("\n✅ Training complete. Predictions saved to submission.csv")




  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[prop]
  X_test[new_col] = X_test[frac] * X_test[prop]
  X[new_col] = X[frac] * X[p


Training for target: BlendProperty1




  Fold RMSE: 0.1787, R2: 0.9653




  Fold RMSE: 0.1539, R2: 0.9746




  Fold RMSE: 0.1837, R2: 0.9632




  Fold RMSE: 0.1763, R2: 0.9716




  Fold RMSE: 0.1698, R2: 0.9728
Overall BlendProperty1 RMSE: 0.1728, R2: 0.9698

Training for target: BlendProperty2




  Fold RMSE: 0.1851, R2: 0.9627




  Fold RMSE: 0.1669, R2: 0.9727




  Fold RMSE: 0.1645, R2: 0.9697




  Fold RMSE: 0.1770, R2: 0.9718




  Fold RMSE: 0.1761, R2: 0.9714
Overall BlendProperty2 RMSE: 0.1741, R2: 0.9700

Training for target: BlendProperty3




  Fold RMSE: 0.1837, R2: 0.9648




  Fold RMSE: 0.1803, R2: 0.9690




  Fold RMSE: 0.1911, R2: 0.9628




  Fold RMSE: 0.2011, R2: 0.9603




  Fold RMSE: 0.1923, R2: 0.9620
Overall BlendProperty3 RMSE: 0.1898, R2: 0.9639

Training for target: BlendProperty4




  Fold RMSE: 0.1973, R2: 0.9605




  Fold RMSE: 0.1727, R2: 0.9708




  Fold RMSE: 0.1561, R2: 0.9742




  Fold RMSE: 0.1675, R2: 0.9725




  Fold RMSE: 0.1833, R2: 0.9696
Overall BlendProperty4 RMSE: 0.1759, R2: 0.9696

Training for target: BlendProperty5




  Fold RMSE: 0.0876, R2: 0.9928




  Fold RMSE: 0.0608, R2: 0.9960




  Fold RMSE: 0.0643, R2: 0.9960




  Fold RMSE: 0.0706, R2: 0.9950




  Fold RMSE: 0.1140, R2: 0.9844
Overall BlendProperty5 RMSE: 0.0818, R2: 0.9931

Training for target: BlendProperty6




  Fold RMSE: 0.1986, R2: 0.9601




  Fold RMSE: 0.1892, R2: 0.9659




  Fold RMSE: 0.1922, R2: 0.9604




  Fold RMSE: 0.1918, R2: 0.9670




  Fold RMSE: 0.1981, R2: 0.9602
Overall BlendProperty6 RMSE: 0.1940, R2: 0.9630

Training for target: BlendProperty7




  Fold RMSE: 0.1851, R2: 0.9642




  Fold RMSE: 0.1862, R2: 0.9662




  Fold RMSE: 0.1949, R2: 0.9609




  Fold RMSE: 0.2138, R2: 0.9572




  Fold RMSE: 0.1973, R2: 0.9598
Overall BlendProperty7 RMSE: 0.1957, R2: 0.9617

Training for target: BlendProperty8




  Fold RMSE: 0.2290, R2: 0.9456




  Fold RMSE: 0.2181, R2: 0.9531




  Fold RMSE: 0.2202, R2: 0.9480




  Fold RMSE: 0.2010, R2: 0.9618




  Fold RMSE: 0.2069, R2: 0.9575
Overall BlendProperty8 RMSE: 0.2153, R2: 0.9535

Training for target: BlendProperty9




  Fold RMSE: 0.2589, R2: 0.9364




  Fold RMSE: 0.2522, R2: 0.9396




  Fold RMSE: 0.2472, R2: 0.9358




  Fold RMSE: 0.2327, R2: 0.9420




  Fold RMSE: 0.2551, R2: 0.9345
Overall BlendProperty9 RMSE: 0.2494, R2: 0.9379

Training for target: BlendProperty10




  Fold RMSE: 0.1579, R2: 0.9756




  Fold RMSE: 0.1397, R2: 0.9801




  Fold RMSE: 0.1613, R2: 0.9713




  Fold RMSE: 0.1487, R2: 0.9767
  Fold RMSE: 0.1587, R2: 0.9756
Overall BlendProperty10 RMSE: 0.1535, R2: 0.9760

✅ Training complete. Predictions saved to submission.csv




In [22]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, median_absolute_error, explained_variance_score

# Define MAPE function
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Ground-truth and predicted values; must be broadcastable against
        each other. Lists are accepted and converted to float arrays.

    Returns
    -------
    numpy.float64
        The MAPE as a percentage.

    Notes
    -----
    The denominator is clamped to machine epsilon, so a true value of exactly
    zero yields a very large but finite contribution (or 0 when the
    prediction is also exact) instead of ``inf``/``nan`` and a
    division-by-zero warning. Results for non-zero targets are unchanged.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Guard against division by zero for targets that are exactly 0.
    denom = np.maximum(np.abs(y_true), np.finfo(np.float64).eps)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100

# NOTE(review): this cell does not define `y_val`, `val_preds` or
# `target_cols`; it evaluates whatever objects of those names are currently in
# the kernel. Make sure they come from the same model run before trusting the
# numbers.
metrics_list = []

# Ensure y_val is a DataFrame
if isinstance(y_val, np.ndarray):
    y_val = pd.DataFrame(y_val, columns=target_cols)

# Loop through each target column; column i of val_preds is scored against
# the y_val column named target_cols[i].
for i, col in enumerate(target_cols):
    y_true = y_val[col].values  # use column directly
    y_pred = val_preds[:, i]    # predicted values

    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
    # deprecated and absent from newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    medae = median_absolute_error(y_true, y_pred)
    evs = explained_variance_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)

    metrics_list.append([col, r2, mae, rmse, medae, evs, mape])

# Create DataFrame of metrics (one row per target)
metrics_df = pd.DataFrame(metrics_list,
                          columns=["Target", "R2", "MAE", "RMSE", "MedAE", "EVS", "MAPE (%)"])

# Print averages across targets
print("\n🔹 Average R²:", np.mean(metrics_df["R2"]).round(4))
print("🔹 Average MAE:", np.mean(metrics_df["MAE"]).round(4))
print("🔹 Average RMSE:", np.mean(metrics_df["RMSE"]).round(4))
print("🔹 Average MedAE:", np.mean(metrics_df["MedAE"]).round(4))
print("🔹 Average EVS:", np.mean(metrics_df["EVS"]).round(4))
print("🔹 Average MAPE (%):", np.mean(metrics_df["MAPE (%)"]).round(2))

# Print per-target metrics
print("\nPer Target Metrics:\n", metrics_df)



🔹 Average R²: -0.9604
🔹 Average MAE: 1.1574
🔹 Average RMSE: 1.4219
🔹 Average MedAE: 1.0162
🔹 Average EVS: -0.9563
🔹 Average MAPE (%): 992.15

Per Target Metrics:
    Target        R2       MAE      RMSE     MedAE       EVS    MAPE (%)
0  Target -0.960364  1.157371  1.421939  1.016163 -0.956338  992.150768




In [3]:
# ================================
# 1. Import Libraries
# ================================
import os
from pathlib import Path

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline

# ================================
# 2. Load Data
# ================================
# The dataset folder can be set through the DATA_DIR environment variable so
# the notebook runs on other machines; when the variable is unset, the
# original machine-specific location is used unchanged.
data_dir = Path(os.environ.get("DATA_DIR", r"C:\Users\pushp\Downloads\dataset"))

train_df = pd.read_csv(data_dir / "train.csv")
test_df = pd.read_csv(data_dir / "test.csv")
# Used later only for its column layout, which the submission must match.
sample_df = pd.read_csv(data_dir / "sample_solution.csv")

# ================================
# 3. Feature / Target Split
# ================================
# Targets are all columns whose name contains "BlendProperty".
target_cols = train_df.filter(like="BlendProperty").columns.tolist()

# "ID" is removed from the test features below, so remove it from the training
# features as well when it is present; otherwise the model would be trained on
# a column that is missing at prediction time. errors="ignore" makes this a
# no-op when train.csv has no ID column.
X = train_df.drop(columns=target_cols + ["ID"], errors="ignore")
y = train_df[target_cols]

test_ids = test_df["ID"]
# Select the training feature columns, in training order, so the model
# receives the test features exactly as it saw them during fit.
X_test = test_df.drop(columns=["ID"])[X.columns]

# ================================
# 4. Train/Validation Split
# ================================
def make_split(X, y, test_size=0.2, seed=42):
    """Split features and targets into train and validation parts.

    Thin wrapper around ``train_test_split``: ``test_size`` is forwarded as
    is and ``seed`` is passed as ``random_state``. Returns whatever
    ``train_test_split`` returns for the two inputs ``X`` and ``y``.
    """
    split_options = {"test_size": test_size, "random_state": seed}
    return train_test_split(X, y, **split_options)

# Default split: 20% validation, seed 42.
X_train, X_val, y_train, y_val = make_split(X, y)

# ================================
# 5. Model Training
# ================================
# A single LightGBM configuration, wrapped in MultiOutputRegressor so that it
# can be fitted on all target columns.
regressor = LGBMRegressor(
    n_estimators=120, learning_rate=0.08, random_state=42, n_jobs=-1
)

# One-step pipeline; the step name "multioutput" is kept as is.
model = Pipeline(steps=[("multioutput", MultiOutputRegressor(regressor))])

model.fit(X_train, y_train)

# ================================
# 6. Validation Performance
# ================================
val_preds = model.predict(X_val)

# Score each target column separately: column i of the predictions is
# compared with column i of y_val.
r2_list, mae_list = [], []
for i in range(y.shape[1]):
    truth_i = y_val.iloc[:, i]
    guess_i = val_preds[:, i]
    r2_list.append(r2_score(truth_i, guess_i))
    mae_list.append(mean_absolute_error(truth_i, guess_i))

# One row per target with its R² and MAE.
metrics_df = pd.DataFrame(
    dict(Target=target_cols, R2_Score=r2_list, MAE=mae_list)
)

print("\n🔹 Average R²:", np.mean(r2_list).round(4))
print("🔹 Average MAE:", np.mean(mae_list).round(4))
print("\nPer Target Metrics:\n", metrics_df)

# ================================
# 7. Final Predictions & Save
# ================================
final_preds = model.predict(X_test)

# Prediction columns are labelled with the target names, the test IDs are
# put in front, and the columns are then selected/ordered like the sample file.
submission = pd.DataFrame(final_preds, columns=target_cols)
submission.insert(0, "ID", test_ids)
submission = submission.loc[:, sample_df.columns]   # match required format

# The file is written next to the input data.
out_path = data_dir.joinpath("submission_v2.csv")
submission.to_csv(out_path, index=False)

print(f"\n✅ Submission saved at: {out_path}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001593 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.004643
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001399 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start t

In [7]:
import pickle

# Persist the fitted model and the list of target column names.
# NOTE(review): `model` and `target_cols` are whatever objects of those names
# are currently in the kernel; confirm they come from the intended training cell.
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

# Use a context manager here too, so the file handle is closed (and the data
# flushed to disk) deterministically instead of being left to garbage collection.
with open("target_cols.pkl", "wb") as f:
    pickle.dump(target_cols, f)