In [1]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import randint

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# XGBoost with default hyperparameters

In [2]:
# Load your cleaned dataset 
df = pd.read_csv('/Users/sa21/Desktop/Podcast_Prediction/Data/processed/df_train_cleaned.csv')

In [3]:
# Split target and features
X = df.drop(columns=['Listening_Time_minutes'])
y = df['Listening_Time_minutes']

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Extract and store validation IDs
id_val = X_val['id']
X_train = X_train.drop(columns=['id'])
X_val = X_val.drop(columns=['id'])

# XGBOOST MODEL
xgb_default = XGBRegressor(random_state=42, verbosity=0)
xgb_default.fit(X_train, y_train)

# Predict
y_pred_xgb = xgb_default.predict(X_val)

# Evaluate
print("XGBoost Default Evaluation:")
print(f"MSE: {mean_squared_error(y_val, y_pred_xgb):.3f}")
print(f"R² Score: {r2_score(y_val, y_pred_xgb):.3f}")

# Save CSV
df_xgb_default = pd.DataFrame({
    'id': id_val.values,
    'Listening_Time_minutes': y_pred_xgb
})
df_xgb_default.to_csv('/Users/sa21/Desktop/Podcast_Prediction/Data/predictions/xgb_default_predictions.csv', index=False)


XGBoost Default Evaluation:
MSE: 170.072
R² Score: 0.769


# XGBoost with Randomized Search

In [4]:

# Sample a smaller training 
X_sample = X_train.sample(frac=0.15, random_state=42)
y_sample = y_train.loc[X_sample.index]

# Parameter space
xgb_param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_random_search = RandomizedSearchCV(
    estimator=XGBRegressor(random_state=42, verbosity=0),
    param_distributions=xgb_param_dist,
    n_iter=10,
    cv=2,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1
)

# Fit on sample
xgb_random_search.fit(X_sample, y_sample)
best_xgb_model = xgb_random_search.best_estimator_

# Predict and evaluate
y_pred_xgb_tuned = best_xgb_model.predict(X_val)
print("XGBoost RandomSearchCV Evaluation:")
print(f"MSE: {mean_squared_error(y_val, y_pred_xgb_tuned):.3f}")
print(f"R² Score: {r2_score(y_val, y_pred_xgb_tuned):.3f}")

# Save CSV
df_xgb_random = pd.DataFrame({
    'id': id_val.values,
    'Listening_Time_minutes': y_pred_xgb_tuned
})
df_xgb_random.to_csv('/Users/sa21/Desktop/Podcast_Prediction/Data/predictions/xgb_randomsearch_predictions.csv', index=False)





XGBoost RandomSearchCV Evaluation:
MSE: 172.999
R² Score: 0.765
