# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Define features
features = ['RhythmScore', 'AudioLoudness', 'VocalContent', 'AcousticQuality', 
            'InstrumentalScore', 'LivePerformanceLikelihood', 'MoodScore', 
            'TrackDurationMs', 'Energy']
X = train[features].copy()
y = train['BeatsPerMinute']
X_test = test[features].copy()

# Feature engineering: Log-transform and add Energy_Rhythm
X['AudioLoudness'] = np.log1p(-X['AudioLoudness'])
X_test['AudioLoudness'] = np.log1p(-X_test['AudioLoudness'])
X['TrackDurationMs'] = np.log1p(X['TrackDurationMs'])
X_test['TrackDurationMs'] = np.log1p(X_test['TrackDurationMs'])
X['Energy_Rhythm'] = X['Energy'] * X['RhythmScore']
X_test['Energy_Rhythm'] = X_test['Energy'] * X_test['RhythmScore']

# Split data (80/20 for holdout and meta-training)
X_train_full, X_holdout, y_train_full, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_holdout_scaled = scaler.transform(X_holdout)
X_test_scaled = scaler.transform(X_test)

# Train Base Models
# Linear Regression
lr = LinearRegression()
lr.fit(X_train_full_scaled, y_train_full)
lr_holdout_pred = lr.predict(X_holdout_scaled)
lr_rmse = np.sqrt(mean_squared_error(y_holdout, lr_holdout_pred))
print(f"Linear RMSE: {lr_rmse:.4f}")

# Random Forest (your best params)
rf = RandomForestRegressor(n_estimators=150, max_depth=10, min_samples_split=5, random_state=42, n_jobs=-1)
rf.fit(X_train_full_scaled, y_train_full)
rf_holdout_pred = rf.predict(X_holdout_scaled)
rf_rmse = np.sqrt(mean_squared_error(y_holdout, rf_holdout_pred))
print(f"RF RMSE: {rf_rmse:.4f}")

# XGBoost with proven params
xgb_model = xgb.XGBRegressor(n_estimators=130, learning_rate=0.06, max_depth=6, random_state=42)
xgb_model.fit(X_train_full_scaled, y_train_full)
xgb_holdout_pred = xgb_model.predict(X_holdout_scaled)
xgb_rmse = np.sqrt(mean_squared_error(y_holdout, xgb_holdout_pred))
print(f"XGBoost RMSE: {xgb_rmse:.4f}")

# LightGBM with enhanced role
lgb_model = lgb.LGBMRegressor(n_estimators=200, learning_rate=0.03, max_depth=6, random_state=42, n_jobs=-1)
lgb_model.fit(X_train_full_scaled, y_train_full)
lgb_holdout_pred = lgb_model.predict(X_holdout_scaled)
lgb_rmse = np.sqrt(mean_squared_error(y_holdout, lgb_holdout_pred))
print(f"LightGBM RMSE: {lgb_rmse:.4f}")

# Generate meta-features (predictions) for stacking
kf = KFold(n_splits=5, shuffle=True, random_state=42)
meta_X_train = np.zeros((len(X_train_full_scaled), 4))
meta_X_holdout = np.zeros((len(X_holdout_scaled), 4))
meta_X_test = np.zeros((len(X_test_scaled), 4))

for train_idx, val_idx in kf.split(X_train_full_scaled):
    X_train_fold, X_val_fold = X_train_full_scaled[train_idx], X_train_full_scaled[val_idx]
    y_train_fold = y_train_full.iloc[train_idx]
    
    lr.fit(X_train_fold, y_train_fold)
    rf.fit(X_train_fold, y_train_fold)
    xgb_model.fit(X_train_fold, y_train_fold)
    lgb_model.fit(X_train_fold, y_train_fold)
    
    meta_X_train[val_idx] = np.column_stack((
        lr.predict(X_val_fold),
        rf.predict(X_val_fold),
        xgb_model.predict(X_val_fold),
        lgb_model.predict(X_val_fold)
    ))
meta_X_holdout = np.column_stack((
    lr.predict(X_holdout_scaled),
    rf.predict(X_holdout_scaled),
    xgb_model.predict(X_holdout_scaled),
    lgb_model.predict(X_holdout_scaled)
))
meta_X_test = np.column_stack((
    lr.predict(X_test_scaled),
    rf.predict(X_test_scaled),
    xgb_model.predict(X_test_scaled),
    lgb_model.predict(X_test_scaled)
))

# Train Meta-Model (Ridge Regression)
meta_model = Ridge(alpha=1.0, random_state=42)
meta_model.fit(meta_X_train, y_train_full)
meta_holdout_pred = meta_model.predict(meta_X_holdout)
meta_rmse = np.sqrt(mean_squared_error(y_holdout, meta_holdout_pred))
print(f"Meta-Model Holdout RMSE: {meta_rmse:.4f}")

# Generate final test predictions
test_pred = meta_model.predict(meta_X_test)
submission['BeatsPerMinute'] = test_pred
submission.to_csv('submission_ensemble_stacking_ridge.csv', index=False)
print("Submission file created: submission_ensemble_stacking_ridge.csv")

Linear RMSE: 26.4441
RF RMSE: 26.4408
XGBoost RMSE: 26.4520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001890 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 10
[LightGBM] [Info] Start training from score 119.056554




LightGBM RMSE: 26.4416
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001744 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 335464, number of used features: 10
[LightGBM] [Info] Start training from score 119.062610




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001938 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 335465, number of used features: 10
[LightGBM] [Info] Start training from score 119.054608




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001615 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 335465, number of used features: 10
[LightGBM] [Info] Start training from score 119.039780




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 335465, number of used features: 10
[LightGBM] [Info] Start training from score 119.078083




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 335465, number of used features: 10
[LightGBM] [Info] Start training from score 119.047690




Meta-Model Holdout RMSE: 26.4390
Submission file created: submission_ensemble_stacking_ridge.csv
