In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


In [2]:
import warnings 
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer, mean_squared_error

import optuna

In [3]:
train_data = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s5e4/test.csv")
submission = pd.read_csv("/kaggle/input/playground-series-s5e4/sample_submission.csv")

In [4]:
# Configuration 
numerical_variables = ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']
categorial_valiable = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment'] 
target_variable = 'Listening_Time_minutes'

In [5]:
# Utility function 

def noise_removal(df):
    df["Number_of_Ads"] = df['Number_of_Ads'].apply(lambda x: x if x < 4 else 4)
    df["Episode_Length_minutes"] = df['Episode_Length_minutes'].apply(lambda x: x if x < 150 else 150)
    return df

def missing_value_imputation(df):
    df["Episode_Length_minutes"] = df.groupby("Podcast_Name")["Episode_Length_minutes"].transform(lambda x: x.fillna(x.mean()))
    df["Guest_Popularity_percentage"] = df.groupby("Podcast_Name")["Guest_Popularity_percentage"].transform(lambda x: x.fillna(x.mean()))
    df["Number_of_Ads"] = df["Number_of_Ads"].fillna(2)
    return df

def bin_popularity(value):
    if value < 20:
        return 'Low'
    elif value < 60:
        return 'Medium'
    else:
        return 'High'

INTERACT = []
def feature_engineering(train, test):
    global numerical_variables
    # Numerical columns 
    ## Interaction features
    if len(INTERACT) > 0:
        INTERACT.clear()
    for i,c1 in enumerate(numerical_variables):
        for j,c2 in enumerate(numerical_variables[i+1:]):
            feature = f"{c1}_{c2}"
            train[feature] = train[c1] * train[c2]
            test[feature] = test[c1] * test[c2]
            INTERACT.append(feature)
    numerical_variables += INTERACT
    print(f"There are {len(INTERACT)} interaction features:")
    print(INTERACT)
    
    train['Host_Guest_Avg_Popularity'] = (train['Host_Popularity_percentage'] + train['Guest_Popularity_percentage']) / 2
    test['Host_Guest_Avg_Popularity'] = (test['Host_Popularity_percentage'] + test['Guest_Popularity_percentage']) / 2
    numerical_variables.append("Host_Guest_Avg_Popularity")
    
    train["Host_Popularity_Level"] = train["Host_Popularity_percentage"].apply(bin_popularity)
    test["Host_Popularity_Level"] = test["Host_Popularity_percentage"].apply(bin_popularity)
    categorial_valiable.append("Host_Popularity_Level")
    train["Guest_Popularity_Level"] = train["Guest_Popularity_percentage"].apply(bin_popularity)
    test["Guest_Popularity_Level"] = test["Guest_Popularity_percentage"].apply(bin_popularity)
    categorial_valiable.append("Guest_Popularity_Level")
    
    train['Has_Ads'] = train['Number_of_Ads'].apply(lambda x: 1 if x > 0 else 0)
    test['Has_Ads'] = test['Number_of_Ads'].apply(lambda x: 1 if x > 0 else 0)
    
    train["Ads_Per_Minutes"] = train["Episode_Length_minutes"] / (train["Number_of_Ads"]+1)
    test["Ads_Per_Minutes"] = test["Episode_Length_minutes"] / (test["Number_of_Ads"]+1)
    numerical_variables.append("Ads_Per_Minutes")

    # Categorical features
    train['Is_Weekend'] = train['Publication_Day'].apply(lambda x: 1 if x == "Saturday" or x == "Sunday" else 0)
    test['Is_Weekend'] = test['Publication_Day'].apply(lambda x: 1 if x == "Saturday" or x == "Sunday" else 0)

    return train, test

In [6]:
def one_hot_encode_features(train, test, categorical_features):
    # Initialize the OneHotEncoder with drop='first'
    encoder = OneHotEncoder(sparse=False)
    
    # Fit and transform the train set
    train_encoded = encoder.fit_transform(train[categorical_features])
    
    # Transform the test set
    test_encoded = encoder.transform(test[categorical_features])
    
    # Convert the encoded arrays to DataFrame
    train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(categorical_features))
    test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(categorical_features))
    
    # Drop the original categorical columns from train and test DataFrames
    train.drop(columns=categorical_features, inplace=True)
    test.drop(columns=categorical_features, inplace=True)
    
    # Concatenate the one-hot encoded features to the original DataFrames
    train = pd.concat([train, train_encoded_df], axis=1)
    test = pd.concat([test, test_encoded_df], axis=1)
    
    return train, test


def standardize_features(train, test, numerical_features):
    # Initialize the StandardScaler
    scaler = StandardScaler()
    
    # Fit and transform the training data
    train_scaled = scaler.fit_transform(train[numerical_features])
    
    # Transform the test data
    test_scaled = scaler.transform(test[numerical_features])
    
    # Convert the scaled arrays back to DataFrames
    train_scaled_df = pd.DataFrame(train_scaled, columns=numerical_features)
    test_scaled_df = pd.DataFrame(test_scaled, columns=numerical_features)
    
    # Replace the original columns with the standardized columns
    train[numerical_features] = train_scaled_df
    test[numerical_features] = test_scaled_df
    
    return train, test

In [7]:
# Data Processing 
train_data = noise_removal(train_data)
test_data = noise_removal(test_data)
train_data = missing_value_imputation(train_data)
test_data = missing_value_imputation(test_data)
train_data, test_data = feature_engineering(train_data, test_data)

There are 6 interaction features:
['Episode_Length_minutes_Host_Popularity_percentage', 'Episode_Length_minutes_Guest_Popularity_percentage', 'Episode_Length_minutes_Number_of_Ads', 'Host_Popularity_percentage_Guest_Popularity_percentage', 'Host_Popularity_percentage_Number_of_Ads', 'Guest_Popularity_percentage_Number_of_Ads']


In [8]:
train_data, test_data = one_hot_encode_features(train_data, test_data, categorial_valiable)

In [9]:
X = train_data.drop(['Listening_Time_minutes', 'id'], axis=1)
y = train_data['Listening_Time_minutes']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(len(y_train), len(y_val))

600000 150000


In [10]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

def objective(trial):
    """Objective function for Optuna hyperparameter optimization of XGBoostRegressor with GPU support."""

    # Define the hyperparameters to optimize
    param = {
        "objective": "reg:squarederror",  # For regression tasks
        "eval_metric": "rmse",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 10),
        "eta": trial.suggest_float("eta", 1e-8, 1.0, log=True),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "tree_method": "gpu_hist",  # Enable GPU acceleration
        "gpu_id": 0,  # Specify the GPU ID (if you have multiple GPUs)
        "predictor": "gpu_predictor", #Use GPU for prediction.
        "nthread": -1, # use all available threads
        "num_boost_round" : trial.suggest_int('num_boost_round', 100, 5000)
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9)
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    elif param["booster"] == "gblinear":
        param["lambda"] = trial.suggest_float("lambda", 1e-8, 1.0, log=True)
        param["alpha"] = trial.suggest_float("alpha", 1e-8, 1.0, log=True)

    # Convert data to DMatrix format (required by XGBoost)
    dtrain = xgb.DMatrix(X_train, label=y_train)

    # Perform cross-validation with XGBoost
    scores = xgb.cv(
        param,
        dtrain,
        nfold=5,
        early_stopping_rounds=10, #adjust as needed.
        as_pandas=False,
        seed=42,
    )

    # Return the mean RMSE score (lower is better, so we negate it for Optuna's maximize)
    return -scores["test-rmse-mean"][-1] #return the last RMSE, which is the best.

In [11]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

best_params = study.best_params

[I 2025-04-04 13:34:08,938] A new study created in memory with name: no-name-5c12dd62-6e81-4bc6-a213-3b2b130acde7
[I 2025-04-04 13:34:26,333] Trial 0 finished with value: -27.14001732772286 and parameters: {'booster': 'gbtree', 'lambda': 3.0723674649127656e-08, 'alpha': 0.05315748950026862, 'subsample': 0.7882904742005996, 'colsample_bytree': 0.9190731714627476, 'max_depth': 4, 'min_child_weight': 9, 'eta': 6.5471794007495534e-06, 'gamma': 0.804759786927792, 'grow_policy': 'depthwise', 'num_boost_round': 2341}. Best is trial 0 with value: -27.14001732772286.
[I 2025-04-04 13:34:45,610] Trial 1 finished with value: -19.416965633100972 and parameters: {'booster': 'dart', 'lambda': 6.067608705603729e-06, 'alpha': 1.2980603675409632e-07, 'subsample': 0.9175787197523366, 'colsample_bytree': 0.7770405057320742, 'max_depth': 7, 'min_child_weight': 5, 'eta': 0.05068096719699404, 'gamma': 1.4757537117042872e-05, 'grow_policy': 'lossguide', 'num_boost_round': 2121}. Best is trial 1 with value: -

Number of finished trials:  100
Best trial:
  Value:  -13.150669434865918
  Params: 
    booster: dart
    lambda: 9.770320148460564e-08
    alpha: 1.612824838455072e-05
    subsample: 0.8609881565231172
    colsample_bytree: 0.8432633524521188
    max_depth: 8
    min_child_weight: 6
    eta: 0.42989406584229434
    gamma: 0.000734224353473736
    grow_policy: lossguide
    num_boost_round: 2756


In [12]:
import optuna.visualization as vis
from IPython.display import IFrame, display

# Plot the optimization history (objective value over trials)
fig_history = vis.plot_optimization_history(study)
fig_history.write_html("optimization_history.html")

# Plot the parameter importances
fig_importances = vis.plot_param_importances(study)
fig_importances.write_html("parameter_importances.html")

# Display the saved HTML files inline using IFrame
display(IFrame(src="optimization_history.html", width="100%", height=500))
display(IFrame(src="parameter_importances.html", width="100%", height=500))

In [13]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

best_params["tree_method"] = "gpu_hist" #ensure GPU
best_params["gpu_id"] = 0
best_params["predictor"] = "gpu_predictor"

best_model = xgb.train(
    best_params,
    dtrain,
    evals=[(dtrain, "train"), (dval, "test")],
    early_stopping_rounds=10,
)

[0]	train-rmse:18.94245	test-rmse:18.92701
[1]	train-rmse:15.34439	test-rmse:15.33499
[2]	train-rmse:13.91457	test-rmse:13.91857
[3]	train-rmse:13.39425	test-rmse:13.40852
[4]	train-rmse:13.19821	test-rmse:13.22108
[5]	train-rmse:13.12238	test-rmse:13.15816
[6]	train-rmse:13.07915	test-rmse:13.12856
[7]	train-rmse:13.05228	test-rmse:13.11414
[8]	train-rmse:13.02579	test-rmse:13.10716
[9]	train-rmse:13.00844	test-rmse:13.10177


In [14]:
X_test = test_data.drop(['id'], axis=1)
dtest = xgb.DMatrix(X_test)
test_predictions = best_model.predict(dtest)

In [15]:
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'Listening_Time_minutes': test_predictions  # Predicted probabilities for rainfall
})

submission_df["Listening_Time_minutes"] = submission_df["Listening_Time_minutes"].clip(
    lower=0, upper=test_data["Episode_Length_minutes"])

submission_df.to_csv("submission_xgb_04_04_v1.csv", index=False)

In [16]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [19]:
def objective_lgbm(trial):
    """Objective function for Optuna hyperparameter optimization of LightGBM with GPU support."""

    param = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "dart", "goss"]),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "device": "gpu",  # Enable GPU acceleration
        "gpu_platform_id": 0, #adjust if you have multiple platforms.
        "gpu_device_id": 0, #adjust if you have multiple devices.
        "n_jobs": -1, #use all cores.
        "max_bin": trial.suggest_int("max_bin", 63, 255), #important for gpu performance.
        "verbose": -1, #suppress lightgbm output.
        "num_boost_round" : trial.suggest_int('num_boost_round', 100, 5000)
    }

    if param["boosting_type"] == "dart":
        param["drop_rate"] = trial.suggest_float("drop_rate", 0.01, 0.5)
        param["skip_drop"] = trial.suggest_float("skip_drop", 0.01, 0.5)

    # Load the dataset (example: California housing)
    dtrain = lgb.Dataset(X_train, label=y_train)

    scores = lgb.cv(
        param,
        dtrain,
        nfold=5,
        early_stopping_rounds=10, #adjust as needed.
        seed=42,
    )

    return -scores["rmse-mean"][-1] #return the last RMSE, which is the best.

In [20]:
study_lgbm = optuna.create_study(direction="maximize")
study_lgbm.optimize(objective, n_trials=100)

print("Number of finished trials: ", len(study_lgbm.trials))
print("Best trial:")
trial = study_lgbm.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

best_params = study_lgbm.best_params

[I 2025-04-04 14:18:59,360] A new study created in memory with name: no-name-659258f1-8ec3-4d79-8f72-dc0de6bd039b
[I 2025-04-04 14:19:16,645] Trial 0 finished with value: -13.448767872511988 and parameters: {'booster': 'gbtree', 'lambda': 0.0002206572445284353, 'alpha': 1.6647886255403664e-06, 'subsample': 0.7076342485980518, 'colsample_bytree': 0.5387637181869109, 'max_depth': 6, 'min_child_weight': 7, 'eta': 0.5502817907931551, 'gamma': 0.0003020840522319951, 'grow_policy': 'depthwise', 'num_boost_round': 3583}. Best is trial 0 with value: -13.448767872511988.
[I 2025-04-04 14:19:50,805] Trial 1 finished with value: -27.062396649909754 and parameters: {'booster': 'gblinear', 'lambda': 9.459629593449093e-08, 'alpha': 0.0004921478392276382, 'subsample': 0.8476841099899535, 'colsample_bytree': 0.8242246220677731, 'max_depth': 5, 'min_child_weight': 7, 'eta': 0.0007158509759606218, 'gamma': 2.2174413370579833e-08, 'grow_policy': 'lossguide', 'num_boost_round': 401}. Best is trial 0 with 

Number of finished trials:  100
Best trial:
  Value:  -13.153063931470722
  Params: 
    booster: dart
    lambda: 1.0688566030752065e-08
    alpha: 2.0008470252864347e-07
    subsample: 0.7258455740008215
    colsample_bytree: 0.9669207998867171
    max_depth: 8
    min_child_weight: 2
    eta: 0.41816603896355214
    gamma: 0.055128970324283245
    grow_policy: lossguide
    num_boost_round: 872


In [23]:
best_params

{'booster': 'dart',
 'lambda': 1.0688566030752065e-08,
 'alpha': 2.0008470252864347e-07,
 'subsample': 0.7258455740008215,
 'colsample_bytree': 0.9669207998867171,
 'max_depth': 8,
 'min_child_weight': 2,
 'eta': 0.41816603896355214,
 'gamma': 0.055128970324283245,
 'grow_policy': 'lossguide',
 'num_boost_round': 872,
 'device': 'gpu',
 'verbose': -1}

In [27]:
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val)

best_params["device"] = "gpu" #ensure GPU
best_params['verbose'] = -1

best_model = lgb.train(
    best_params,
    dtrain,
    valid_sets=[dtrain, dval],
    callbacks=[lgb.early_stopping(stopping_rounds=10)]  # Use early stopping callback
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[112]	training's l2: 160.164	valid_1's l2: 170.052


In [28]:
dtrain = lgb.Dataset(X, label=y)

best_model = lgb.train(
    best_params,
    dtrain
)

In [30]:
X_test = test_data.drop(['id'], axis=1)
test_predictions = best_model.predict(X_test)

In [31]:
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'Listening_Time_minutes': test_predictions  # Predicted probabilities for rainfall
})

submission_df["Listening_Time_minutes"] = submission_df["Listening_Time_minutes"].clip(
    lower=0, upper=test_data["Episode_Length_minutes"])

submission_df.to_csv("submission_lgbm_04_04_v1.csv", index=False)