# Import libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Load data

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Define features

In [4]:
features = ['RhythmScore', 'AudioLoudness', 'VocalContent', 'AcousticQuality', 
            'InstrumentalScore', 'LivePerformanceLikelihood', 'MoodScore', 
            'TrackDurationMs', 'Energy']
X = train[features]
y = train['BeatsPerMinute']
X_test = test[features]

# Feature engineering: Log-transform skewed features

In [5]:
X['AudioLoudness'] = np.log1p(-X['AudioLoudness'])
X_test['AudioLoudness'] = np.log1p(-X_test['AudioLoudness'])
X['TrackDurationMs'] = np.log1p(X['TrackDurationMs'])
X_test['TrackDurationMs'] = np.log1p(X_test['TrackDurationMs'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['AudioLoudness'] = np.log1p(-X['AudioLoudness'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['AudioLoudness'] = np.log1p(-X_test['AudioLoudness'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['TrackDurationMs'] = np.log1p(X['TrackDurationMs'])
A value is trying to be set on a copy o

# Split data (80/20 for holdout validation)

In [6]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest (your best params)

In [8]:
rf = RandomForestRegressor(n_estimators=150, max_depth=10, min_samples_split=5, random_state=42, n_jobs=-1)
rf.fit(X_train_scaled, y_train)
rf_holdout_pred = rf.predict(X_holdout_scaled)
rf_rmse = np.sqrt(mean_squared_error(y_holdout, rf_holdout_pred))
print(f"RF Holdout RMSE: {rf_rmse:.4f}")

RF Holdout RMSE: 26.4398


# Train XGBoost with tuned params

In [9]:
xgb_model = xgb.XGBRegressor(n_estimators=120, learning_rate=0.05, max_depth=5, random_state=42)
xgb_model.fit(X_train_scaled, y_train)
xgb_holdout_pred = xgb_model.predict(X_holdout_scaled)
xgb_rmse = np.sqrt(mean_squared_error(y_holdout, xgb_holdout_pred))
print(f"XGBoost Holdout RMSE: {xgb_rmse:.4f}")

XGBoost Holdout RMSE: 26.4439


# Holdout-Guided Weight Optimization (simple grid search)

In [10]:
weight_combos = [
    (0.7, 0.3),  # Heavy RF
    (0.6, 0.4),  # Balanced
    (0.5, 0.5),  # Equal
    (0.4, 0.6),  # Heavy XGB
    (0.3, 0.7)   # More XGB
]
best_rmse = float('inf')
best_weights = None
for w_rf, w_xgb in weight_combos:
    ensemble_holdout_pred = w_rf * rf_holdout_pred + w_xgb * xgb_holdout_pred
    ensemble_rmse = np.sqrt(mean_squared_error(y_holdout, ensemble_holdout_pred))
    print(f"Weights (RF: {w_rf}, XGB: {w_xgb}) RMSE: {ensemble_rmse:.4f}")
    if ensemble_rmse < best_rmse:
        best_rmse = ensemble_rmse
        best_weights = (w_rf, w_xgb)

print(f"Best Weights: {best_weights}, Best Holdout RMSE: {best_rmse:.4f}")

Weights (RF: 0.7, XGB: 0.3) RMSE: 26.4394
Weights (RF: 0.6, XGB: 0.4) RMSE: 26.4396
Weights (RF: 0.5, XGB: 0.5) RMSE: 26.4400
Weights (RF: 0.4, XGB: 0.6) RMSE: 26.4405
Weights (RF: 0.3, XGB: 0.7) RMSE: 26.4411
Best Weights: (0.7, 0.3), Best Holdout RMSE: 26.4394


# Generate test predictions with best weights

In [11]:
rf_test_pred = rf.predict(X_test_scaled)
xgb_test_pred = xgb_model.predict(X_test_scaled)
test_pred = best_weights[0] * rf_test_pred + best_weights[1] * xgb_test_pred
submission['BeatsPerMinute'] = test_pred
submission.to_csv('submission_ensemble_rf_xgb.csv', index=False)
print("Submission file created: submission_ensemble_rf_xgb.csv")

Submission file created: submission_ensemble_rf_xgb.csv
