In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings
warnings.filterwarnings("ignore")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e4/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e4/test.csv')
all_data = pd.concat([train, test])

In [3]:
train.sample(10)

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
114876,114876,Athlete's Arena,Episode 14,90.89,Sports,84.73,Monday,Evening,60.79,0.0,Positive,77.36534
200624,200624,Fashion Forward,Episode 18,41.37,Lifestyle,77.13,Wednesday,Night,,2.0,Neutral,33.13709
280292,280292,Sports Central,Episode 20,44.18,Sports,68.64,Tuesday,Night,96.74,0.0,Positive,28.93326
85318,85318,Humor Hub,Episode 77,116.71,Comedy,51.69,Saturday,Night,31.32,0.0,Neutral,110.88
660341,660341,News Roundup,Episode 47,34.41,News,68.73,Saturday,Morning,,1.0,Positive,27.35651
441613,441613,Learning Lab,Episode 15,63.41,Education,33.56,Sunday,Evening,30.52,2.0,Neutral,52.01943
724795,724795,Music Matters,Episode 14,,Music,58.39,Monday,Morning,52.66,2.0,Neutral,63.77349
680734,680734,Humor Hub,Episode 68,42.15,Comedy,74.68,Tuesday,Morning,82.75,0.0,Negative,22.92
469605,469605,Tech Talks,Episode 1,38.59,Technology,99.51,Saturday,Evening,86.51,2.0,Neutral,28.30617
571075,571075,Athlete's Arena,Episode 53,81.8,Sports,69.43,Tuesday,Night,86.93,0.0,Neutral,58.61642


In [4]:
train.describe()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,750000.0,662907.0,750000.0,603970.0,749999.0,750000.0
mean,374999.5,64.504738,59.859901,52.236449,1.348855,45.437406
std,216506.495284,32.969603,22.873098,28.451241,1.15113,27.138306
min,0.0,0.0,1.3,0.0,0.0,0.0
25%,187499.75,35.73,39.41,28.38,0.0,23.17835
50%,374999.5,63.84,60.05,53.58,1.0,43.37946
75%,562499.25,94.07,79.53,76.6,2.0,64.81158
max,749999.0,325.24,119.46,119.91,103.91,119.97


In [5]:
train.shape

(750000, 12)

In [6]:
test.shape

(250000, 11)

In [7]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype
---  ------                       --------------   -----
 0   id                           750000 non-null  int64
 1   Podcast_Name                 750000 non-null  object
 2   Episode_Title                750000 non-null  object
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object
 7   Publication_Time             750000 non-null  object
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 88.8+ MB


In [8]:
train.isnull().sum()

id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64

In [9]:
test.isnull().sum()

id                                 0
Podcast_Name                       0
Episode_Title                      0
Episode_Length_minutes         28736
Genre                              0
Host_Popularity_percentage         0
Publication_Day                    0
Publication_Time                   0
Guest_Popularity_percentage    48832
Number_of_Ads                      0
Episode_Sentiment                  0
dtype: int64

## Data Preprocessing

In [10]:
from sklearn.impute import SimpleImputer

impute = SimpleImputer(strategy='median')
num_cols = test.select_dtypes(include=['float64']).columns
train[num_cols] = impute.fit_transform(train[num_cols])
test[num_cols] = impute.fit_transform(test[num_cols])

## Feature engineering

In [11]:
from category_encoders import CountEncoder

def add_features(df):
    df['Guest_Popularity_percentage'] =  df['Guest_Popularity_percentage'].clip(upper=100)
    df['Host_Popularity_percentage'] =  df['Host_Popularity_percentage'].clip(upper=100)
    df['Number_of_Ads'] = df['Number_of_Ads'].astype(int)
    df['title_number'] = df['Episode_Title'].str.split().str[1].astype(int)
    
    df['IsWeekend'] = df['Publication_Day'].isin(['Saturday', 'Sunday']).astype(int)
    
    sentiments = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
    df['Episode_Sentiment'] = df['Episode_Sentiment'].map(sentiments)
    
    df['Guest_Popularity_lengthmin'] =  df['Guest_Popularity_percentage'] * df['Episode_Length_minutes']
    df['Title_Length'] =  df['title_number'] * df['Episode_Length_minutes']
    df['Host_Popularity_lengthmin'] =  df['Host_Popularity_percentage'] * df['Episode_Length_minutes']
    df['Average_popularity'] = (df['Host_Popularity_percentage'] + df['Guest_Popularity_percentage']) / 2
    df['minutes_per_ads'] = df['Episode_Length_minutes'] / (df['Number_of_Ads'] + 1) 
            
    cat_cols = ['Podcast_Name','Genre', 'Publication_Day', 'Publication_Time']
    for colname in cat_cols:
        df[colname] = df[colname].astype('category')

    ce = CountEncoder()
    for colname in cat_cols:
        df[colname] = ce.fit_transform(df[colname])
    
    return df 

## Identifying outlier and clipping them

In [12]:
for colname in test.select_dtypes(exclude='object').columns:
    Q1 = all_data[colname].quantile(0.25)
    Q3 = all_data[colname].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = (all_data[colname] < lower_bound) | (all_data[colname] > upper_bound)
    
    train[colname] = train[colname].clip(lower=lower_bound, upper=upper_bound)
    test[colname] = test[colname].clip(lower=lower_bound, upper=upper_bound)

## Splitting the train data by KFold

In [13]:
from sklearn.model_selection import KFold

X = train.copy()
y = X.pop('Listening_Time_minutes')
X = add_features(X)
X.drop('Episode_Title', axis=1, inplace=True)

X_test = test.copy()
X_test = add_features(X_test)
X_test.drop('Episode_Title', axis=1, inplace=True)

kf = KFold(n_splits=10, shuffle=True, random_state=600)

for train_index, valid_index in kf.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]  

## Target encoding

In [14]:
stats = ['mean', 'median', 'std', 'skew']

train_with_target = X_train.copy()
train_with_target['Listening_Time_minutes'] = y_train

grouped_stats = train_with_target.groupby(['Episode_Length_minutes'])['Listening_Time_minutes'].agg(stats).fillna(0.0)
grouped_stats.columns = [f'TE1_WC_{stat}' for stat in stats]

grouped_stats_full = train.groupby(['Episode_Length_minutes'])['Listening_Time_minutes'].agg(stats).fillna(0.0)
grouped_stats_full.columns = [f'TE1_WC_{stat}' for stat in stats]

X_train = X_train.merge(grouped_stats, on='Episode_Length_minutes', how='left')
X_valid = X_valid.merge(grouped_stats, on='Episode_Length_minutes', how='left')
X_test = X_test.merge(grouped_stats_full, on='Episode_Length_minutes', how='left')

In [15]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

xgb = XGBRegressor(
    n_estimators=386, 
    learning_rate=0.05,
    verbose=0, 
    eval_metric='rmse',
    device='cuda'
).fit(X_train, y_train)

pred_xgb = xgb.predict(X_valid)
print(f'RMSE: {np.sqrt(mean_squared_error(y_valid, pred_xgb))}')


RMSE: 13.04670343161135


In [16]:
importances = xgb.feature_importances_
columns = X_train.columns

threshold = np.quantile(importances, 0.3)
selected_features = columns[importances > threshold]
top_X_train = X_train[selected_features]

print(selected_features)

Index(['Genre', 'Host_Popularity_percentage', 'Publication_Day',
       'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment',
       'title_number', 'IsWeekend', 'Guest_Popularity_lengthmin',
       'Host_Popularity_lengthmin', 'minutes_per_ads', 'TE1_WC_mean',
       'TE1_WC_median', 'TE1_WC_std'],
      dtype='object')


In [17]:
top_X_train = X_train[selected_features]
top_X_valid = X_valid[selected_features]
top_X_test = X_test[selected_features]

## Hyperparameter Tuning using Optuna

In [18]:
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1, log=True)
    max_depth = trial.suggest_int('max_depth', 1, 13)
    reg_lambda = trial.suggest_float('reg_lambda', 0.1, 10)
    reg_alpha = trial.suggest_float('reg_alpha', 0.1, 10)
    sub_sample = trial.suggest_float('subsample', 0.4, 1)
    
    model = XGBRegressor(
        learning_rate=learning_rate,
        max_depth=max_depth,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        sub_sample= sub_sample,
        random_state=600, 
        eval_metric='rmse',
        device='cuda',
        tree_method='gpu_hist',    
    )
    
    score = -1 * cross_val_score(model, X, y, cv=kf, scoring='neg_root_mean_squared_error').mean()
    return score

study = optuna.create_study(direction='minimize', study_name='XGB-RMSE-Optimization')
study.optimize(objective, n_trials=13)

print(f'Best cross-validation RMSE: {study.best_value:,.4f}')
print(f'Best parameters: {study.best_params}')

[I 2025-04-11 13:19:32,835] A new study created in memory with name: XGB-RMSE-Optimization
[I 2025-04-11 13:19:57,037] Trial 0 finished with value: 23.279820088544984 and parameters: {'learning_rate': 0.0033274339437791016, 'max_depth': 1, 'reg_lambda': 2.9740231047549077, 'reg_alpha': 8.415192145926992, 'subsample': 0.985802003606952}. Best is trial 0 with value: 23.279820088544984.
[I 2025-04-11 13:20:35,732] Trial 1 finished with value: 24.8618102216476 and parameters: {'learning_rate': 0.001182052852521601, 'max_depth': 9, 'reg_lambda': 2.9721100152529307, 'reg_alpha': 7.703325778225823, 'subsample': 0.7575835608118217}. Best is trial 0 with value: 23.279820088544984.
[I 2025-04-11 13:20:59,778] Trial 2 finished with value: 24.69317916204563 and parameters: {'learning_rate': 0.001960307836256081, 'max_depth': 1, 'reg_lambda': 4.6976297887977605, 'reg_alpha': 9.369678037503688, 'subsample': 0.6044253583340908}. Best is trial 0 with value: 23.279820088544984.
[I 2025-04-11 13:21:24,4

Best cross-validation RMSE: 12.8757
Best parameters: {'learning_rate': 0.09665648553223233, 'max_depth': 13, 'reg_lambda': 6.691536561536285, 'reg_alpha': 0.2094626887125326, 'subsample': 0.8330993677042788}


In [19]:
best_model = XGBRegressor(
    **study.best_params,
    n_estimators=2000,
    early_stopping_rounds=20,
    eval_metric='rmse'
).fit(top_X_train, y_train, eval_set=[(top_X_valid, y_valid)], verbose=0)

In [20]:
test_preds = best_model.predict(top_X_test)
output = pd.DataFrame({
    'id': test['id'],
    'Listening_Time_minutes': test_preds
})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
