In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings
warnings.filterwarnings("ignore")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e4/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s5e4/test.csv', index_col='id')

## Data Understanding

In [3]:
train.sample(10)

Unnamed: 0_level_0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
24828,Sports Central,Episode 8,115.52,Sports,75.32,Monday,Morning,,3.0,Positive,70.51893
14286,Health Hour,Episode 93,75.06,Health,68.9,Wednesday,Evening,,0.0,Positive,53.88455
257492,Tech Trends,Episode 58,86.9,Technology,79.11,Sunday,Night,8.79,3.0,Neutral,62.79904
748209,News Roundup,Episode 52,114.13,News,80.31,Tuesday,Afternoon,24.5,1.0,Negative,68.50424
303003,Sports Central,Episode 86,102.34,Sports,34.85,Saturday,Morning,29.85,1.0,Neutral,50.36846
218662,Tune Time,Episode 75,118.71,Music,78.61,Friday,Night,39.3,0.0,Negative,117.88
411624,Brain Boost,Episode 38,61.99,Education,47.69,Wednesday,Evening,45.8,3.0,Positive,30.59515
324169,Health Hour,Episode 100,55.91,Health,78.57,Saturday,Evening,58.78,0.0,Neutral,34.55593
18689,Sports Central,Episode 69,106.39,Sports,87.02,Sunday,Afternoon,62.45,1.0,Negative,57.12341
478213,Tech Trends,Episode 48,47.54,Technology,59.45,Tuesday,Evening,,0.0,Neutral,19.11088


In [4]:
train.describe()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,662907.0,750000.0,603970.0,749999.0,750000.0
mean,64.504738,59.859901,52.236449,1.348855,45.437406
std,32.969603,22.873098,28.451241,1.15113,27.138306
min,0.0,1.3,0.0,0.0,0.0
25%,35.73,39.41,28.38,0.0,23.17835
50%,63.84,60.05,53.58,1.0,43.37946
75%,94.07,79.53,76.6,2.0,64.81158
max,325.24,119.46,119.91,103.91,119.97


In [5]:
train.shape

(750000, 11)

In [6]:
test.shape

(250000, 10)

In [7]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype
---  ------                       --------------   -----
 0   Podcast_Name                 750000 non-null  object
 1   Episode_Title                750000 non-null  object
 2   Episode_Length_minutes       662907 non-null  float64
 3   Genre                        750000 non-null  object
 4   Host_Popularity_percentage   750000 non-null  float64
 5   Publication_Day              750000 non-null  object
 6   Publication_Time             750000 non-null  object
 7   Guest_Popularity_percentage  603970 non-null  float64
 8   Number_of_Ads                749999 non-null  float64
 9   Episode_Sentiment            750000 non-null  object
 10  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), object(6)
memory usage: 88.8+ MB


In [8]:
train.isnull().sum()

Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64

In [9]:
test.isnull().sum()

Podcast_Name                       0
Episode_Title                      0
Episode_Length_minutes         28736
Genre                              0
Host_Popularity_percentage         0
Publication_Day                    0
Publication_Time                   0
Guest_Popularity_percentage    48832
Number_of_Ads                      0
Episode_Sentiment                  0
dtype: int64

## Data Preparation

Reducing the memory space of the dataset by converting the num_cols into float32 and cat_cols into category

In [10]:
num_cols = test.select_dtypes(include=['float64']).columns
for col in num_cols:
    train[col] = train[col].astype('float32')
    test[col] = test[col].astype('float32')

In [11]:
cat_cols = train.select_dtypes(include=['object']).columns
for col in cat_cols:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [12]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype
---  ------                       --------------   -----
 0   Podcast_Name                 750000 non-null  category
 1   Episode_Title                750000 non-null  category
 2   Episode_Length_minutes       662907 non-null  float32
 3   Genre                        750000 non-null  category
 4   Host_Popularity_percentage   750000 non-null  float32
 5   Publication_Day              750000 non-null  category
 6   Publication_Time             750000 non-null  category
 7   Guest_Popularity_percentage  603970 non-null  float32
 8   Number_of_Ads                749999 non-null  float32
 9   Episode_Sentiment            750000 non-null  category
 10  Listening_Time_minutes       750000 non-null  float64
dtypes: category(6), float32(4), float64(1)
memory usage: 27.5 MB


In [13]:
from sklearn.impute import SimpleImputer

impute = SimpleImputer(strategy='median')
num_cols = test.select_dtypes(include=['float32']).columns
train[num_cols] = impute.fit_transform(train[num_cols])
test[num_cols] = impute.fit_transform(test[num_cols])

In [14]:
test.isnull().sum()

Podcast_Name                   0
Episode_Title                  0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
dtype: int64

In [15]:
from sklearn.preprocessing import KBinsDiscretizer

ew = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
train['Host_bins'] = ew.fit_transform(train[['Episode_Length_minutes']])
test['Host_bins'] = ew.transform(test[['Episode_Length_minutes']])

In [16]:
from sklearn.preprocessing import KBinsDiscretizer

kmeans = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')
train['Kmeans_bins'] = kmeans.fit_transform(train[['Episode_Length_minutes']])
test['Kmeans_bins'] = kmeans.transform(test[['Episode_Length_minutes']])

## Feature engineering

In [17]:
def add_features(df):
    df['Guest_Popularity_percentage'] =  df['Guest_Popularity_percentage'].clip(upper=100)
    df['Host_Popularity_percentage'] =  df['Host_Popularity_percentage'].clip(upper=100)
    
    df['IsWeekend'] = df['Publication_Day'].isin(['Saturday', 'Sunday']).astype(int)
    
    sentiments = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
    df['Episode_Sentiment'] = df['Episode_Sentiment'].map(sentiments)
    
    df['Guest_Popularity_lengthmin'] =  df['Guest_Popularity_percentage'] * df['Episode_Length_minutes']
    df['Host_Popularity_lengthmin'] =  df['Host_Popularity_percentage'] * df['Episode_Length_minutes']
    df['Average_popularity'] = (df['Host_Popularity_percentage'] + df['Guest_Popularity_percentage']) / 2
    df['minutes_per_ads'] = df['Episode_Length_minutes'] / (df['Number_of_Ads'] + 1e-3) 
    
    return df 

## Count Encoding for high cardinality columns

In [18]:
from category_encoders import CountEncoder

ce = CountEncoder()
count_category = ['Podcast_Name', 'Genre', 'Episode_Title']
train[count_category] = ce.fit_transform(train[count_category])
test[count_category] = ce.transform(test[count_category])

In [19]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['Publication_Day', 'Publication_Time']
for colname in cat_cols:
    le = LabelEncoder()
    train[colname] = le.fit_transform(train[colname])
    test[colname] = le.transform(test[colname])

In [20]:
cat_features = ['Publication_Day', 'Publication_Time', 'Podcast_Name', 'Genre', 'Episode_Sentiment', 'Episode_Title', 'Host_bins', 'Kmeans_bins']
combo = []
def combo_1(df):
    for i, col1 in enumerate(cat_features):
        for col2 in (cat_features[i+1:]):
            new_col = f'{col1}_{col2}'
            df[new_col] = df[col1] * df[col2]
            combo.append(new_col)
    return df

## Splitting the train data by KFold

In [21]:
from sklearn.model_selection import KFold

X = train.copy()
y = X.pop('Listening_Time_minutes')
X = add_features(X)
X = combo_1(X)

X_test = test.copy()
X_test = add_features(X_test)
X_test = combo_1(X_test)

kf = KFold(n_splits=10, shuffle=True, random_state=600)

for train_index, valid_index in kf.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]  

## Target encoding

In [22]:
stats = ['mean', 'median', 'std', 'skew', 'count', 'max', 'min', 'nunique']
stats2 = ['mean', 'median', 'std']

In [23]:
train_with_target = X_train.copy()
train_with_target['Listening_Time_minutes'] = y_train

grouped_stats = train_with_target.groupby(['Episode_Length_minutes'])['Listening_Time_minutes'].agg(stats).fillna(0.0)
grouped_stats.columns = [f'TE1_WC_{stat}' for stat in stats]

grouped_stats_full = train.groupby(['Episode_Length_minutes'])['Listening_Time_minutes'].agg(stats).fillna(0.0)
grouped_stats_full.columns = [f'TE1_WC_{stat}' for stat in stats]

X_train = X_train.merge(grouped_stats, on='Episode_Length_minutes', how='left').astype('float32')
X_valid = X_valid.merge(grouped_stats_full, on='Episode_Length_minutes', how='left').astype('float32')
X_test = X_test.merge(grouped_stats_full, on='Episode_Length_minutes', how='left').astype('float32')

In [24]:
quantiles = [5, 10, 25, 40, 75, 95, 99]

for q in quantiles:
    result = train_with_target.groupby('Episode_Length_minutes')['Listening_Time_minutes'] \
        .quantile(q/100).reset_index().fillna(0.0)
    result.rename(columns={'Listening_Time_minutes': f'quantile_{q}'}, inplace=True)
    X_train = X_train.merge(result, on='Episode_Length_minutes', how='left').astype('float32')
    X_valid = X_valid.merge(result, on='Episode_Length_minutes', how='left').astype('float32')
    X_test = X_test.merge(result, on='Episode_Length_minutes', how='left').astype('float32')

In [25]:
for col in combo:
    result = train_with_target.groupby(col)['Listening_Time_minutes'].agg(stats2).fillna(0.0)
    # Include the column name in the generated statistic columns
    result.columns = [f'TE2_WC_{col}_{stat}' for stat in stats2]
    X_train = X_train.merge(result, on=col, how='left', suffixes=('', f'_{col}')).astype('float32')
    X_valid = X_valid.merge(result, on=col, how='left', suffixes=('', f'_{col}')).astype('float32')
    X_test = X_test.merge(result, on=col, how='left', suffixes=('', f'_{col}')).astype('float32')

In [26]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

xgb = XGBRegressor(
    n_estimators=386, 
    learning_rate=0.07, 
    max_depth=11, 
    random_state=600, 
    reg_lambda=8.206644301085248, 
    device='cuda',
    tree_method='gpu_hist',
    verbose=0, 
    eval_metric='rmse'
    ).fit(X_train, y_train)
pred_xgb = xgb.predict(X_valid)
print(f'RMSE: {np.sqrt(mean_squared_error(y_valid, pred_xgb))}')

RMSE: 12.637752970130341


## Final Submission

In [27]:
test_preds = xgb.predict(X_test)

sub = pd.read_csv('/kaggle/input/playground-series-s5e4/sample_submission.csv')
sub['Listening_Time_minutes'] = test_preds
sub.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
