# Import libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Define features
features = ['RhythmScore', 'AudioLoudness', 'VocalContent', 'AcousticQuality', 
            'InstrumentalScore', 'LivePerformanceLikelihood', 'MoodScore', 
            'TrackDurationMs', 'Energy']
X = train[features]
y = train['BeatsPerMinute']
X_test = test[features]

# Feature engineering: Log-transform skewed features
X['AudioLoudness'] = np.log1p(-X['AudioLoudness'])
X_test['AudioLoudness'] = np.log1p(-X_test['AudioLoudness'])
X['TrackDurationMs'] = np.log1p(X['TrackDurationMs'])
X_test['TrackDurationMs'] = np.log1p(X_test['TrackDurationMs'])

# Split data (80/20 for holdout validation)
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)
X_test_scaled = scaler.transform(X_test)

# Train Individual Models
# Linear Regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
lr_holdout_pred = lr.predict(X_holdout_scaled)
lr_rmse = np.sqrt(mean_squared_error(y_holdout, lr_holdout_pred))
print(f"Linear RMSE: {lr_rmse:.4f}")

# Random Forest (your best params)
rf = RandomForestRegressor(n_estimators=150, max_depth=10, min_samples_split=5, random_state=42, n_jobs=-1)
rf.fit(X_train_scaled, y_train)
rf_holdout_pred = rf.predict(X_holdout_scaled)
rf_rmse = np.sqrt(mean_squared_error(y_holdout, rf_holdout_pred))
print(f"RF RMSE: {rf_rmse:.4f}")

# XGBoost with reverted params
xgb_model = xgb.XGBRegressor(n_estimators=130, learning_rate=0.06, max_depth=6, random_state=42)
xgb_model.fit(X_train_scaled, y_train)
xgb_holdout_pred = xgb_model.predict(X_holdout_scaled)
xgb_rmse = np.sqrt(mean_squared_error(y_holdout, xgb_holdout_pred))
print(f"XGBoost RMSE: {xgb_rmse:.4f}")

# LightGBM with minimal role
lgb_model = lgb.LGBMRegressor(n_estimators=150, learning_rate=0.04, max_depth=5, random_state=42, n_jobs=-1)
lgb_model.fit(X_train_scaled, y_train)
lgb_holdout_pred = lgb_model.predict(X_holdout_scaled)
lgb_rmse = np.sqrt(mean_squared_error(y_holdout, lgb_holdout_pred))
print(f"LightGBM RMSE: {lgb_rmse:.4f}")

# Cross-Validation for Weight Optimization (5-fold)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_rmse = float('inf')
best_weights = None
for train_idx, val_idx in kf.split(X_train_scaled):
    X_train_fold, X_val_fold = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    lr_fold_pred = lr.predict(X_val_fold)
    rf_fold_pred = rf.predict(X_val_fold)
    xgb_fold_pred = xgb_model.predict(X_val_fold)
    lgb_fold_pred = lgb_model.predict(X_val_fold)
    
    weight_combos = [
        (0.098, 0.602, 0.299, 0.001), (0.099, 0.601, 0.298, 0.002),
        (0.100, 0.600, 0.297, 0.003), (0.101, 0.599, 0.296, 0.004),
        (0.098, 0.601, 0.298, 0.003), (0.099, 0.600, 0.297, 0.004)
    ]  # (LR, RF, XGB, LGBM) summing to 1
    for w_lr, w_rf, w_xgb, w_lgb in weight_combos:
        ensemble_fold_pred = (w_lr * lr_fold_pred + w_rf * rf_fold_pred + 
                             w_xgb * xgb_fold_pred + w_lgb * lgb_fold_pred)
        fold_rmse = np.sqrt(mean_squared_error(y_val_fold, ensemble_fold_pred))
        if fold_rmse < best_rmse:
            best_rmse = fold_rmse
            best_weights = (w_lr, w_rf, w_xgb, w_lgb)

print(f"Best Weights (5-fold CV-averaged): {best_weights}, Best CV RMSE: {best_rmse:.4f}")

# Generate test predictions with best weights
lr_test_pred = lr.predict(X_test_scaled)
rf_test_pred = rf.predict(X_test_scaled)
xgb_test_pred = xgb_model.predict(X_test_scaled)
lgb_test_pred = lgb_model.predict(X_test_scaled)
test_pred = (best_weights[0] * lr_test_pred + best_weights[1] * rf_test_pred + 
             best_weights[2] * xgb_test_pred + best_weights[3] * lgb_test_pred)
submission['BeatsPerMinute'] = test_pred
submission.to_csv('submission_ensemble_stabilized.csv', index=False)
print("Submission file created: submission_ensemble_stabilized.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['AudioLoudness'] = np.log1p(-X['AudioLoudness'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['AudioLoudness'] = np.log1p(-X_test['AudioLoudness'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['TrackDurationMs'] = np.log1p(X['TrackDurationMs'])
A value is trying to be set on a copy o

Linear RMSE: 26.4440
RF RMSE: 26.4398
XGBoost RMSE: 26.4514
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001685 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 419331, number of used features: 9
[LightGBM] [Info] Start training from score 119.056554




LightGBM RMSE: 26.4419




Best Weights (5-fold CV-averaged): (0.098, 0.602, 0.299, 0.001), Best CV RMSE: 26.2632




Submission file created: submission_ensemble_stabilized.csv
