In [1]:
import polars as pl
import numpy as np

In [2]:
# Read the training CSV into a polars DataFrame; later cells derive the
# imputed and feature-engineered frames from it.
train_df = pl.read_csv("data/data/train.csv")


In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Predictors used to estimate a missing episode length.
features = ["Host_Popularity_percentage", "Guest_Popularity_percentage", "Number_of_Ads"]

# Split the rows on whether the episode length was recorded:
# `train` has a known length, `test` is missing it and needs an estimate.
length_known = pl.col("Episode_Length_minutes").is_not_null()
train = train_df.filter(length_known).to_pandas()
test = train_df.filter(~length_known).to_pandas()

X_train, y_train = train[features], train["Episode_Length_minutes"]
X_test = test[features]

# Fit the imputation forest on rows with a known length, then predict the rest.
rf_impute = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
rf_impute.fit(X_train, y_train)
preds = rf_impute.predict(X_test)

# `Episode_Length_imp` is the observed length where available, otherwise the prediction.
train["Episode_Length_imp"] = train["Episode_Length_minutes"]
test["Episode_Length_imp"] = preds

# Stack known-length rows first, imputed rows after, and go back to polars.
# Note that this changes the row order relative to `train_df`.
combined = pd.concat([train, test], sort=False)
df = pl.from_pandas(combined)

In [4]:
def _ordinal_code(column, labels):
    """Build a polars expression mapping each label to its 1-based position.

    Parameters
    ----------
    column : str
        Name of the string column to encode.
    labels : list of str
        Labels in the order they should be numbered (first label -> 1).

    Returns
    -------
    pl.Expr
        A native ``when/then`` chain. Rows whose value is null or not listed
        in ``labels`` evaluate to null.
    """
    first, *rest = labels
    expr = pl.when(pl.col(column) == first).then(1)
    for code, label in enumerate(rest, start=2):
        expr = expr.when(pl.col(column) == label).then(code)
    return expr


# Add the engineered feature columns to `df`.
df = df.with_columns([
  # ad & popularity
  (pl.col("Number_of_Ads")/pl.col("Episode_Length_imp")).alias("ad_density"),
  # A missing guest popularity is treated as 0 in the gap/average features;
  # `has_guest` records whether a value was present at all.
  (pl.col("Host_Popularity_percentage") - pl.col("Guest_Popularity_percentage").fill_null(0)).alias("pop_gap"),
  ((pl.col("Host_Popularity_percentage")+pl.col("Guest_Popularity_percentage").fill_null(0))/2).alias("pop_avg"),
  pl.col("Guest_Popularity_percentage").is_not_null().cast(pl.Int8).alias("has_guest"),

  # title parsing: first run of digits in the title, plus simple length stats
  pl.col("Episode_Title").str.extract(r"(\d+)").cast(pl.Int64).alias("Episode_Number"),
  pl.col("Episode_Title").str.len_chars().alias("Title_Char_Count"),
  pl.col("Episode_Title").str.split(" ").list.len().alias("Title_Word_Count"),

  # temporal: encoded with native expressions rather than a per-row Python
  # lambda (map_elements), which is far slower on a frame of this size
  pl.when(pl.col("Publication_Day").is_in(["Saturday","Sunday"])).then(1).otherwise(0).alias("is_weekend"),
  _ordinal_code(
    "Publication_Day",
    ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
  ).alias("day_of_week_num"),
  _ordinal_code(
    "Publication_Time",
    ["Morning","Afternoon","Evening","Night"],
  ).alias("time_of_day_num"),

  # sentiment: Positive/Neutral/Negative -> 1/0/-1, anything else -> null
  pl.when(pl.col("Episode_Sentiment")=="Positive").then(1)
    .when(pl.col("Episode_Sentiment")=="Neutral").then(0)
    .when(pl.col("Episode_Sentiment")=="Negative").then(-1)
    .alias("sentiment_num"),

  # transforms & interactions
  pl.col("Episode_Length_imp").log1p().alias("log_length"),
  pl.col("Number_of_Ads").log1p().alias("log_ads"),
  (pl.col("Episode_Length_imp") * pl.col("Number_of_Ads")).alias("length_x_ads"),
])



In [5]:
# Print a preview of the feature-engineered frame to check its shape and dtypes.
print(df)

shape: (750_000, 27)
┌────────┬────────────┬────────────┬────────────┬───┬───────────┬───────────┬──────────┬───────────┐
│ id     ┆ Podcast_Na ┆ Episode_Ti ┆ Episode_Le ┆ … ┆ sentiment ┆ log_lengt ┆ log_ads  ┆ length_x_ │
│ ---    ┆ me         ┆ tle        ┆ ngth_minut ┆   ┆ _num      ┆ h         ┆ ---      ┆ ads       │
│ i64    ┆ ---        ┆ ---        ┆ es         ┆   ┆ ---       ┆ ---       ┆ f64      ┆ ---       │
│        ┆ str        ┆ str        ┆ ---        ┆   ┆ i32       ┆ f64       ┆          ┆ f64       │
│        ┆            ┆            ┆ f64        ┆   ┆           ┆           ┆          ┆           │
╞════════╪════════════╪════════════╪════════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡
│ 1      ┆ Joke       ┆ Episode 26 ┆ 119.8      ┆ … ┆ -1        ┆ 4.794136  ┆ 1.098612 ┆ 239.6     │
│        ┆ Junction   ┆            ┆            ┆   ┆           ┆           ┆          ┆           │
│ 2      ┆ Study      ┆ Episode 16 ┆ 73.9       ┆ … ┆ -1        ┆ 4.31

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [7]:
# Columns fed to the models, listed one per line and grouped by origin.
feature_cols = [
    # numeric inputs (episode length is the imputed column)
    "Episode_Length_imp",
    "Host_Popularity_percentage",
    "Guest_Popularity_percentage",
    "Number_of_Ads",
    # ad / popularity features
    "ad_density",
    "pop_gap",
    "pop_avg",
    "has_guest",
    # title-derived features
    "Episode_Number",
    "Title_Char_Count",
    "Title_Word_Count",
    # publication-time features
    "is_weekend",
    "day_of_week_num",
    "time_of_day_num",
    # sentiment, transforms and interactions
    "sentiment_num",
    "log_length",
    "log_ads",
    "length_x_ads",
]

In [8]:
# Feature matrix: the selected feature columns as a NumPy array.
X = df[feature_cols].to_numpy()
# Target: the single-column frame flattened to a 1-D array.
y = df.select(pl.col("Listening_Time_minutes")).to_numpy().reshape(-1)

In [9]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# 4a) Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# Predict on every row (training rows plus hold-out) and on the hold-out alone.
y_pred_dt_all, y_pred_dt_test = dt.predict(X), dt.predict(X_test)

# Mean squared errors; the square root is taken only when reporting below.
mse_dt_all = mean_squared_error(y_true=y, y_pred=y_pred_dt_all)
mse_dt_test = mean_squared_error(y_true=y_test, y_pred=y_pred_dt_test)


# 4b) Random Forest
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf_all, y_pred_rf_test = rf.predict(X), rf.predict(X_test)

# NOTE: despite the `rmse_` prefix these variables also hold MSE values.
rmse_rf_all = mean_squared_error(y_true=y, y_pred=y_pred_rf_all)
rmse_rf_test = mean_squared_error(y_true=y_test, y_pred=y_pred_rf_test)

# 5) Report RMSE = sqrt(MSE) for both models
print(f"Decision Tree    RMSE (all): {np.sqrt(mse_dt_all):.2f}   —  RMSE (test): {np.sqrt(mse_dt_test):.2f}")
print(f"Random Forest    RMSE (all): {np.sqrt(rmse_rf_all):.2f}   —  RMSE (test): {np.sqrt(rmse_rf_test):.2f}")

Decision Tree    RMSE (all): 8.49   —  RMSE (test): 18.90
Random Forest    RMSE (all): 7.39   —  RMSE (test): 13.18


In [11]:
from xgboost import XGBRegressor

# Gradient-boosted trees with row and column subsampling.
xgb = XGBRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=6,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42, n_jobs=-1, verbosity=0,
)

# 4) Fit on the training split
xgb.fit(X_train, y_train)

# 5) Predict on every row (training rows plus hold-out) and on the hold-out alone
y_pred_all, y_pred_test = xgb.predict(X), xgb.predict(X_test)

# 6) Mean squared errors (the `rmse_` names hold MSE; sqrt is applied when printing)
rmse_all = mean_squared_error(y_true=y, y_pred=y_pred_all)
rmse_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)

print(f"XGBoost RMSE (all):  {np.sqrt(rmse_all):.2f}")
print(f"XGBoost RMSE (test): {np.sqrt(rmse_test):.2f}")

XGBoost RMSE (all):  13.36
XGBoost RMSE (test): 13.37


## Test set: imputation, feature engineering, and predictions

In [12]:
# Read the test CSV into a polars DataFrame; its rows are the ones predicted
# and written to the submission files below.
test_df = pl.read_csv("data/data/test.csv")

In [13]:
import pandas as pd

# Predictors expected by the already-fitted length-imputation forest (`rf_impute`).
features = ["Host_Popularity_percentage", "Guest_Popularity_percentage", "Number_of_Ads"]

# Impute in place on a single frame so the rows stay in the same order as
# `test_df`. Splitting into known/missing parts and concatenating them would
# reorder the rows, and predictions built from the result would then no longer
# line up with `test_df['id']`.
combined = test_df.to_pandas()
length_missing = combined["Episode_Length_minutes"].isna()

# Start from the observed lengths, then overwrite only the missing ones.
combined["Episode_Length_imp"] = combined["Episode_Length_minutes"]
if length_missing.any():  # predict() rejects an empty input, so skip when nothing is missing
    preds = rf_impute.predict(combined.loc[length_missing, features])
    combined.loc[length_missing, "Episode_Length_imp"] = preds

# Sanity check: imputation must not add or drop rows.
assert len(combined) == len(test_df)

df = pl.from_pandas(combined)

In [14]:
def _ordinal_code(column, labels):
    """Build a polars expression mapping each label to its 1-based position.

    Parameters
    ----------
    column : str
        Name of the string column to encode.
    labels : list of str
        Labels in the order they should be numbered (first label -> 1).

    Returns
    -------
    pl.Expr
        A native ``when/then`` chain. Rows whose value is null or not listed
        in ``labels`` evaluate to null.
    """
    first, *rest = labels
    expr = pl.when(pl.col(column) == first).then(1)
    for code, label in enumerate(rest, start=2):
        expr = expr.when(pl.col(column) == label).then(code)
    return expr


# Add the engineered feature columns to the test frame `df`.
df = df.with_columns([
  # ad & popularity
  (pl.col("Number_of_Ads")/pl.col("Episode_Length_imp")).alias("ad_density"),
  # A missing guest popularity is treated as 0 in the gap/average features;
  # `has_guest` records whether a value was present at all.
  (pl.col("Host_Popularity_percentage") - pl.col("Guest_Popularity_percentage").fill_null(0)).alias("pop_gap"),
  ((pl.col("Host_Popularity_percentage")+pl.col("Guest_Popularity_percentage").fill_null(0))/2).alias("pop_avg"),
  pl.col("Guest_Popularity_percentage").is_not_null().cast(pl.Int8).alias("has_guest"),

  # title parsing: first run of digits in the title, plus simple length stats
  pl.col("Episode_Title").str.extract(r"(\d+)").cast(pl.Int64).alias("Episode_Number"),
  pl.col("Episode_Title").str.len_chars().alias("Title_Char_Count"),
  pl.col("Episode_Title").str.split(" ").list.len().alias("Title_Word_Count"),

  # temporal: encoded with native expressions rather than a per-row Python
  # lambda (map_elements), which is far slower on a frame of this size
  pl.when(pl.col("Publication_Day").is_in(["Saturday","Sunday"])).then(1).otherwise(0).alias("is_weekend"),
  _ordinal_code(
    "Publication_Day",
    ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
  ).alias("day_of_week_num"),
  _ordinal_code(
    "Publication_Time",
    ["Morning","Afternoon","Evening","Night"],
  ).alias("time_of_day_num"),

  # sentiment: Positive/Neutral/Negative -> 1/0/-1, anything else -> null
  pl.when(pl.col("Episode_Sentiment")=="Positive").then(1)
    .when(pl.col("Episode_Sentiment")=="Neutral").then(0)
    .when(pl.col("Episode_Sentiment")=="Negative").then(-1)
    .alias("sentiment_num"),

  # transforms & interactions
  pl.col("Episode_Length_imp").log1p().alias("log_length"),
  pl.col("Number_of_Ads").log1p().alias("log_ads"),
  (pl.col("Episode_Length_imp") * pl.col("Number_of_Ads")).alias("length_x_ads"),
])



In [15]:
import os

# Columns fed to the model, in the order used when the model was fitted.
feature_cols = [
    # original numerics
    "Episode_Length_imp", "Host_Popularity_percentage", "Guest_Popularity_percentage",
    "Number_of_Ads",
    # engineered
    "ad_density", "pop_gap", "pop_avg", "has_guest",
    "Episode_Number", "Title_Char_Count", "Title_Word_Count",
    "is_weekend", "day_of_week_num", "time_of_day_num",
    "sentiment_num", "log_length", "log_ads", "length_x_ads",
]

# Build the feature matrix as a plain NumPy array, the same form the model
# was fitted on.
X_test_pred = df.select(feature_cols).to_numpy()

# Make predictions using the random forest model
predictions = rf.predict(X_test_pred)

# Take the ids from `df`, the frame the features came from, not from
# `test_df`: `df` may not be in the same row order as `test_df`, and pairing
# predictions with `test_df['id']` would then attach them to the wrong episodes.
submission = pd.DataFrame({
    'id': df['id'].to_numpy(),
    'Listening_Time_minutes': predictions
})

# Every test row must receive exactly one prediction.
assert len(submission) == len(test_df)

# Ensure directory exists
os.makedirs('submissions', exist_ok=True)

# Save to CSV
submission.to_csv('submissions/random_forest.csv', index=False)

print("Predictions saved to submissions/random_forest.csv")
print(f"Number of predictions: {len(predictions)}")




Predictions saved to submissions/random_forest.csv
Number of predictions: 250000


In [16]:
import os

# Columns fed to the model, in the order used when the model was fitted.
feature_cols = [
    # original numerics
    "Episode_Length_imp", "Host_Popularity_percentage", "Guest_Popularity_percentage",
    "Number_of_Ads",
    # engineered
    "ad_density", "pop_gap", "pop_avg", "has_guest",
    "Episode_Number", "Title_Char_Count", "Title_Word_Count",
    "is_weekend", "day_of_week_num", "time_of_day_num",
    "sentiment_num", "log_length", "log_ads", "length_x_ads",
]

# Build the feature matrix as a plain NumPy array, the same form the model
# was fitted on.
X_test_pred = df.select(feature_cols).to_numpy()

# Make predictions using the XGBoost model
predictions = xgb.predict(X_test_pred)

# Take the ids from `df`, the frame the features came from, not from
# `test_df`: `df` may not be in the same row order as `test_df`, and pairing
# predictions with `test_df['id']` would then attach them to the wrong episodes.
submission = pd.DataFrame({
    'id': df['id'].to_numpy(),
    'Listening_Time_minutes': predictions
})

# Every test row must receive exactly one prediction.
assert len(submission) == len(test_df)

# Ensure directory exists
os.makedirs('submissions', exist_ok=True)

# Save to CSV
submission.to_csv('submissions/xgb.csv', index=False)

print("Predictions saved to submissions/xgb.csv")
print(f"Number of predictions: {len(predictions)}")


Predictions saved to submissions/xgb.csv
Number of predictions: 250000
