In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%load_ext cudf.pandas
import cuml
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings
warnings.filterwarnings("ignore")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e4/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s5e4/test.csv', index_col='id')

In [3]:
train.shape

(750000, 11)

In [4]:
test.shape

(250000, 10)

## Data Understanding

In [5]:
train.sample(10)

Unnamed: 0_level_0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
276848,Mind & Body,Episode 81,52.6,Health,41.39,Saturday,Morning,90.63,0.0,Positive,38.12368
568883,Melody Mix,Episode 26,15.51,Music,30.35,Saturday,Evening,79.07,3.0,Negative,15.14766
111278,Educational Nuggets,Episode 30,33.44,Education,35.25,Tuesday,Evening,33.97,3.0,Positive,24.34936
600863,Sports Weekly,Episode 9,47.46,Sports,91.41,Thursday,Morning,15.72,2.0,Negative,43.5348
189383,Healthy Living,Episode 6,17.95,Health,54.06,Monday,Evening,24.59,2.0,Neutral,16.45588
427417,Digital Digest,Episode 95,114.56,Technology,84.31,Friday,Night,17.33,1.0,Neutral,60.89547
471322,Mystery Matters,Episode 50,102.05,True Crime,64.48,Tuesday,Afternoon,15.0,3.0,Neutral,78.18086
644041,Life Lessons,Episode 78,34.37,Lifestyle,95.72,Friday,Afternoon,69.42,1.0,Neutral,26.88779
407415,Business Insights,Episode 68,118.68,Business,45.58,Tuesday,Night,4.88,3.0,Positive,62.37219
148803,World Watch,Episode 59,53.65,News,20.8,Saturday,Evening,20.57,1.0,Neutral,35.55376


In [6]:
train.describe()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,662907.0,750000.0,603970.0,749999.0,750000.0
mean,64.504738,59.859901,52.236449,1.348855,45.437406
std,32.969603,22.873098,28.451241,1.15113,27.138306
min,0.0,1.3,0.0,0.0,0.0
25%,35.73,39.41,28.38,0.0,23.17835
50%,63.84,60.05,53.58,1.0,43.37946
75%,94.07,79.53,76.6,2.0,64.81158
max,325.24,119.46,119.91,103.91,119.97


In [7]:
train.shape

(750000, 11)

In [8]:
test.shape

(250000, 10)

In [9]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype
---  ------                       --------------   -----
 0   Podcast_Name                 750000 non-null  object
 1   Episode_Title                750000 non-null  object
 2   Episode_Length_minutes       662907 non-null  float64
 3   Genre                        750000 non-null  object
 4   Host_Popularity_percentage   750000 non-null  float64
 5   Publication_Day              750000 non-null  object
 6   Publication_Time             750000 non-null  object
 7   Guest_Popularity_percentage  603970 non-null  float64
 8   Number_of_Ads                749999 non-null  float64
 9   Episode_Sentiment            750000 non-null  object
 10  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), object(6)
memory usage: 88.8+ MB


In [10]:
train.isnull().sum()

Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64

In [11]:
test.isnull().sum()

Podcast_Name                       0
Episode_Title                      0
Episode_Length_minutes         28736
Genre                              0
Host_Popularity_percentage         0
Publication_Day                    0
Publication_Time                   0
Guest_Popularity_percentage    48832
Number_of_Ads                      0
Episode_Sentiment                  0
dtype: int64

## Data Preparation

Reducing the memory space of the dataset by converting the num_cols into float32 and cat_cols into category

In [12]:
num_cols = test.select_dtypes(include=['float64']).columns
for col in num_cols:
    train[col] = train[col].astype('float32')
    test[col] = test[col].astype('float32')

In [13]:
cat_cols = train.select_dtypes(include=['object']).columns
for col in cat_cols:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [14]:
from scipy.stats.mstats import winsorize

train['Number_of_Ads'] = winsorize(train['Number_of_Ads'], limits=(0.01, 0.01))
test['Number_of_Ads'] = winsorize(test['Number_of_Ads'], limits=(0.01, 0.01))

train['Listening_Time_minutes'] = winsorize(train['Listening_Time_minutes'], limits=(0.01, 0.01))

In [15]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype
---  ------                       --------------   -----
 0   Podcast_Name                 750000 non-null  category
 1   Episode_Title                750000 non-null  category
 2   Episode_Length_minutes       662907 non-null  float32
 3   Genre                        750000 non-null  category
 4   Host_Popularity_percentage   750000 non-null  float32
 5   Publication_Day              750000 non-null  category
 6   Publication_Time             750000 non-null  category
 7   Guest_Popularity_percentage  603970 non-null  float32
 8   Number_of_Ads                750000 non-null  float32
 9   Episode_Sentiment            750000 non-null  category
 10  Listening_Time_minutes       750000 non-null  float64
dtypes: category(6), float32(4), float64(1)
memory usage: 27.4 MB


In [16]:
from sklearn.impute import SimpleImputer

impute = SimpleImputer(strategy='median')
num_cols = test.select_dtypes(include=['float32']).columns
train[num_cols] = impute.fit_transform(train[num_cols])
test[num_cols] = impute.fit_transform(test[num_cols])

In [17]:
sentiments = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
train['Episode_Sentiment'] = train['Episode_Sentiment'].map(sentiments).astype('float32')
test['Episode_Sentiment'] = test['Episode_Sentiment'].map(sentiments).astype('float32')

## Feature engineering

## Count Encoding for high cardinality columns

In [18]:
from category_encoders import CountEncoder

ce = CountEncoder()
count_category = ['Podcast_Name', 'Genre', 'Episode_Title']
train[count_category] = ce.fit_transform(train[count_category])
test[count_category] = ce.transform(test[count_category])

In [19]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['Publication_Day', 'Publication_Time']
for colname in cat_cols:
    le = LabelEncoder()
    train[colname] = le.fit_transform(train[colname])
    test[colname] = le.transform(test[colname])

In [20]:
from sklearn.preprocessing import KBinsDiscretizer

kmeans = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')
train['Kmeans_bins'] = kmeans.fit_transform(train[['Episode_Length_minutes']])
test['Kmeans_bins'] = kmeans.transform(test[['Episode_Length_minutes']])

In [21]:
for col in ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Episode_Length_minutes']:
    for k in range(0, 3):
        new_round_col = f"{col}_round{k}"
        train[new_round_col] = train[col].round(k)
        test[new_round_col] = test[col].round(k)

In [22]:
train['Guest_Popularity_percentage'] =  train['Guest_Popularity_percentage'].clip(upper=100)
train['Host_Popularity_percentage'] =  train['Host_Popularity_percentage'].clip(upper=100)
test['Guest_Popularity_percentage'] =  test['Guest_Popularity_percentage'].clip(upper=100)
test['Host_Popularity_percentage'] =  test['Host_Popularity_percentage'].clip(upper=100)


In [23]:
train.columns

Index(['Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre',
       'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time',
       'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment',
       'Listening_Time_minutes', 'Kmeans_bins',
       'Host_Popularity_percentage_round0',
       'Host_Popularity_percentage_round1',
       'Host_Popularity_percentage_round2',
       'Guest_Popularity_percentage_round0',
       'Guest_Popularity_percentage_round1',
       'Guest_Popularity_percentage_round2', 'Episode_Length_minutes_round0',
       'Episode_Length_minutes_round1', 'Episode_Length_minutes_round2'],
      dtype='object')

In [24]:
features = ['Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Publication_Day', 'Publication_Time',
       'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Kmeans_bins', 'Host_Popularity_percentage_round0', 'Host_Popularity_percentage_round1',
       'Host_Popularity_percentage_round2', 'Guest_Popularity_percentage_round0', 'Guest_Popularity_percentage_round1',
       'Guest_Popularity_percentage_round2', 'Episode_Length_minutes_round0', 'Episode_Length_minutes_round1', 'Episode_Length_minutes_round2']

In [25]:
interact = []
for i, c1 in enumerate(features):
    for c2 in (features[i+1:]):
        n = f'{c1}_{c2}'
        train[n] = train[c1] * train[c2]
        test[n] = test[c1] * test[c2]
        interact.append(n)

print(f"There are {len(interact)} interaction features:")
print( interact )

There are 190 interaction features:
['Podcast_Name_Episode_Title', 'Podcast_Name_Episode_Length_minutes', 'Podcast_Name_Genre', 'Podcast_Name_Publication_Day', 'Podcast_Name_Publication_Time', 'Podcast_Name_Host_Popularity_percentage', 'Podcast_Name_Guest_Popularity_percentage', 'Podcast_Name_Number_of_Ads', 'Podcast_Name_Episode_Sentiment', 'Podcast_Name_Kmeans_bins', 'Podcast_Name_Host_Popularity_percentage_round0', 'Podcast_Name_Host_Popularity_percentage_round1', 'Podcast_Name_Host_Popularity_percentage_round2', 'Podcast_Name_Guest_Popularity_percentage_round0', 'Podcast_Name_Guest_Popularity_percentage_round1', 'Podcast_Name_Guest_Popularity_percentage_round2', 'Podcast_Name_Episode_Length_minutes_round0', 'Podcast_Name_Episode_Length_minutes_round1', 'Podcast_Name_Episode_Length_minutes_round2', 'Episode_Title_Episode_Length_minutes', 'Episode_Title_Genre', 'Episode_Title_Publication_Day', 'Episode_Title_Publication_Time', 'Episode_Title_Host_Popularity_percentage', 'Episode_Titl

In [26]:
from sklearn.model_selection import KFold
from cuml.metrics import mean_squared_error
from xgboost import XGBRegressor

n_folds = 10
test_preds = np.zeros(len(test))

kf = KFold(n_splits=n_folds, shuffle=True, random_state=34)

for fold, (train_idx, valid_idx) in enumerate(kf.split(train)):
    X_train, X_valid = train.iloc[train_idx][features], train.iloc[valid_idx][features]
    y_train, y_valid = train.iloc[train_idx]['Listening_Time_minutes'], train.iloc[valid_idx]['Listening_Time_minutes']

    xgb = XGBRegressor(
        n_estimators=600, 
        learning_rate=0.07, 
        reg_lambda=8.20664, 
        max_depth=11,
        colsample_bytree=0.5,
        subsample=0.8,
        device='cuda',
        tree_method='gpu_hist',
        eval_metric='rmse',
        verbosity=0,
        random_state=34
    )
    xgb.fit(X_train, y_train)
    
    valid_pred = xgb.predict(X_valid)
    fold_rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    print(f"Fold {fold + 1} RMSE: {fold_rmse:.2f}")
    
    test_preds += xgb.predict(test[features]) / n_folds


Fold 1 RMSE: 12.73
Fold 2 RMSE: 12.75
Fold 3 RMSE: 12.66
Fold 4 RMSE: 12.61
Fold 5 RMSE: 12.58
Fold 6 RMSE: 12.59
Fold 7 RMSE: 12.63
Fold 8 RMSE: 12.66
Fold 9 RMSE: 12.70
Fold 10 RMSE: 12.68


## Final Submission

In [27]:
sub = pd.read_csv('/kaggle/input/playground-series-s5e4/sample_submission.csv')
sub['Listening_Time_minutes'] = test_preds
sub.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
