In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


In [2]:
import warnings 
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error

import optuna

# Data Reading

In [3]:
# Data reading 
train_data = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s5e4/test.csv")
submission = pd.read_csv("/kaggle/input/playground-series-s5e4/sample_submission.csv")

In [4]:
# Configuration 
numerical_variables = ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']
categorial_valiable = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment'] 
target_variable = 'Listening_Time_minutes'

# Utility

In [5]:
def noise_removal(df):
    df["Number_of_Ads"] = df['Number_of_Ads'].apply(lambda x: x if x < 4 else 4)
    df["Episode_Length_minutes"] = df['Episode_Length_minutes'].apply(lambda x: x if x < 150 else 150)
    return df

In [6]:
def missing_value_imputation(df):
    df["Episode_Length_minutes"] = df.groupby("Podcast_Name")["Episode_Length_minutes"].transform(lambda x: x.fillna(x.mean()))
    df["Guest_Popularity_percentage"] = df.groupby("Podcast_Name")["Guest_Popularity_percentage"].transform(lambda x: x.fillna(x.mean()))
    df["Number_of_Ads"] = df["Number_of_Ads"].fillna(2)
    return df

In [7]:
def bin_popularity(value):
    if value < 20:
        return 'Low'
    elif value < 60:
        return 'Medium'
    else:
        return 'High'

INTERACT = []
def feature_engineering(train, test):
    global numerical_variables
    # Numerical columns 
    ## Interaction features
    if len(INTERACT) > 0:
        INTERACT.clear()
    for i,c1 in enumerate(numerical_variables):
        for j,c2 in enumerate(numerical_variables[i+1:]):
            feature = f"{c1}_{c2}"
            train[feature] = train[c1] * train[c2]
            test[feature] = test[c1] * test[c2]
            INTERACT.append(feature)
    numerical_variables += INTERACT
    print(f"There are {len(INTERACT)} interaction features:")
    print(INTERACT)
    
    train['Host_Guest_Avg_Popularity'] = (train['Host_Popularity_percentage'] + train['Guest_Popularity_percentage']) / 2
    test['Host_Guest_Avg_Popularity'] = (test['Host_Popularity_percentage'] + test['Guest_Popularity_percentage']) / 2
    numerical_variables.append("Host_Guest_Avg_Popularity")
    
    train["Host_Popularity_Level"] = train["Host_Popularity_percentage"].apply(bin_popularity)
    test["Host_Popularity_Level"] = test["Host_Popularity_percentage"].apply(bin_popularity)
    categorial_valiable.append("Host_Popularity_Level")
    train["Guest_Popularity_Level"] = train["Guest_Popularity_percentage"].apply(bin_popularity)
    test["Guest_Popularity_Level"] = test["Guest_Popularity_percentage"].apply(bin_popularity)
    categorial_valiable.append("Guest_Popularity_Level")
    
    train['Has_Ads'] = train['Number_of_Ads'].apply(lambda x: 1 if x > 0 else 0)
    test['Has_Ads'] = test['Number_of_Ads'].apply(lambda x: 1 if x > 0 else 0)
    
    train["Ads_Per_Minutes"] = train["Episode_Length_minutes"] / (train["Number_of_Ads"]+1)
    test["Ads_Per_Minutes"] = test["Episode_Length_minutes"] / (test["Number_of_Ads"]+1)
    numerical_variables.append("Ads_Per_Minutes")

    # Categorical features
    train['Is_Weekend'] = train['Publication_Day'].apply(lambda x: 1 if x == "Saturday" or x == "Sunday" else 0)
    test['Is_Weekend'] = test['Publication_Day'].apply(lambda x: 1 if x == "Saturday" or x == "Sunday" else 0)

    return train, test

In [8]:
def one_hot_encode_features(train, test, categorical_features):
    # Initialize the OneHotEncoder with drop='first'
    encoder = OneHotEncoder(drop='first', sparse=False)
    
    # Fit and transform the train set
    train_encoded = encoder.fit_transform(train[categorical_features])
    
    # Transform the test set
    test_encoded = encoder.transform(test[categorical_features])
    
    # Convert the encoded arrays to DataFrame
    train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(categorical_features))
    test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(categorical_features))
    
    # Drop the original categorical columns from train and test DataFrames
    train.drop(columns=categorical_features, inplace=True)
    test.drop(columns=categorical_features, inplace=True)
    
    # Concatenate the one-hot encoded features to the original DataFrames
    train = pd.concat([train, train_encoded_df], axis=1)
    test = pd.concat([test, test_encoded_df], axis=1)
    
    return train, test


In [9]:
def standardize_features(train, test, numerical_features):
    # Initialize the StandardScaler
    scaler = StandardScaler()
    
    # Fit and transform the training data
    train_scaled = scaler.fit_transform(train[numerical_features])
    
    # Transform the test data
    test_scaled = scaler.transform(test[numerical_features])
    
    # Convert the scaled arrays back to DataFrames
    train_scaled_df = pd.DataFrame(train_scaled, columns=numerical_features)
    test_scaled_df = pd.DataFrame(test_scaled, columns=numerical_features)
    
    # Replace the original columns with the standardized columns
    train[numerical_features] = train_scaled_df
    test[numerical_features] = test_scaled_df
    
    return train, test

In [10]:
train_data.columns

Index(['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes',
       'Genre', 'Host_Popularity_percentage', 'Publication_Day',
       'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads',
       'Episode_Sentiment', 'Listening_Time_minutes'],
      dtype='object')

# Data Processing

In [11]:
# Data Processing 
train_data = noise_removal(train_data)
test_data = noise_removal(test_data)
train_data = missing_value_imputation(train_data)
test_data = missing_value_imputation(test_data)
train_data, test_data = feature_engineering(train_data, test_data)

There are 6 interaction features:
['Episode_Length_minutes_Host_Popularity_percentage', 'Episode_Length_minutes_Guest_Popularity_percentage', 'Episode_Length_minutes_Number_of_Ads', 'Host_Popularity_percentage_Guest_Popularity_percentage', 'Host_Popularity_percentage_Number_of_Ads', 'Guest_Popularity_percentage_Number_of_Ads']


In [12]:
print(INTERACT)
print(train_data.shape)
print(test_data.shape)
print(categorial_valiable)
print(numerical_variables)

['Episode_Length_minutes_Host_Popularity_percentage', 'Episode_Length_minutes_Guest_Popularity_percentage', 'Episode_Length_minutes_Number_of_Ads', 'Host_Popularity_percentage_Guest_Popularity_percentage', 'Host_Popularity_percentage_Number_of_Ads', 'Guest_Popularity_percentage_Number_of_Ads']
(750000, 24)
(250000, 23)
['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment', 'Host_Popularity_Level', 'Guest_Popularity_Level']
['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Length_minutes_Host_Popularity_percentage', 'Episode_Length_minutes_Guest_Popularity_percentage', 'Episode_Length_minutes_Number_of_Ads', 'Host_Popularity_percentage_Guest_Popularity_percentage', 'Host_Popularity_percentage_Number_of_Ads', 'Guest_Popularity_percentage_Number_of_Ads', 'Host_Guest_Avg_Popularity', 'Ads_Per_Minutes']


In [13]:
# Encoding 
train_data, test_data = one_hot_encode_features(train_data, test_data, categorial_valiable)

train_data, test_data = standardize_features(train_data, test_data, numerical_variables)

In [14]:
print(train_data.shape)
print(test_data.shape)

(750000, 186)
(250000, 185)


In [15]:
print(train_data.isnull().sum().sum())
print(test_data.isnull().sum().sum())

0
0


# Traning Prepration

In [16]:
train_data.columns

Index(['id', 'Episode_Length_minutes', 'Host_Popularity_percentage',
       'Guest_Popularity_percentage', 'Number_of_Ads',
       'Listening_Time_minutes',
       'Episode_Length_minutes_Host_Popularity_percentage',
       'Episode_Length_minutes_Guest_Popularity_percentage',
       'Episode_Length_minutes_Number_of_Ads',
       'Host_Popularity_percentage_Guest_Popularity_percentage',
       ...
       'Publication_Day_Wednesday', 'Publication_Time_Evening',
       'Publication_Time_Morning', 'Publication_Time_Night',
       'Episode_Sentiment_Neutral', 'Episode_Sentiment_Positive',
       'Host_Popularity_Level_Low', 'Host_Popularity_Level_Medium',
       'Guest_Popularity_Level_Low', 'Guest_Popularity_Level_Medium'],
      dtype='object', length=186)

In [17]:
X = train_data.drop(['Listening_Time_minutes', 'id'], axis=1)
y = train_data['Listening_Time_minutes']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
print(len(y_train), len(y_val))

600000 150000


# Model Optimisation 

In [19]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [20]:
def objective_liner_regression(trial):
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    copy_X = trial.suggest_categorical('copy_X', [True, False])

    clf = LinearRegression(fit_intercept=fit_intercept, copy_X=copy_X)

    # Create a custom scorer for negative RMSE
    #scorer = make_scorer(mean_squared_error, squared=False)  # RMSE
    
    #score = cross_val_score(clf,X_train, y_train, cv=cv, scoring=scorer).mean()
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()
    rmse_score = np.sqrt(-score)
    
    return rmse_score

study_liner_reg = optuna.create_study(direction="minimize")
study_liner_reg.optimize(objective_liner_regression, n_trials=8)
best_param_liner_reg = study_liner_reg.best_trial.params
print("Best params for RandomForest:", best_param_liner_reg)

[I 2025-04-03 19:49:43,879] A new study created in memory with name: no-name-98ea4e7c-eb24-46c0-a0dd-ea1b50c22fe7
[I 2025-04-03 19:50:22,144] Trial 0 finished with value: 20.803811243063368 and parameters: {'fit_intercept': False, 'copy_X': False}. Best is trial 0 with value: 20.803811243063368.
[I 2025-04-03 19:51:00,534] Trial 1 finished with value: 20.803811243063368 and parameters: {'fit_intercept': False, 'copy_X': True}. Best is trial 0 with value: 20.803811243063368.
[I 2025-04-03 19:51:38,948] Trial 2 finished with value: 20.803811243063368 and parameters: {'fit_intercept': False, 'copy_X': True}. Best is trial 0 with value: 20.803811243063368.
[I 2025-04-03 19:52:18,157] Trial 3 finished with value: 20.79300333293909 and parameters: {'fit_intercept': True, 'copy_X': False}. Best is trial 3 with value: 20.79300333293909.
[I 2025-04-03 19:52:56,422] Trial 4 finished with value: 20.803811243063368 and parameters: {'fit_intercept': False, 'copy_X': True}. Best is trial 3 with valu

Best params for RandomForest: {'fit_intercept': True, 'copy_X': False}


In [21]:
import optuna.visualization as vis
from IPython.display import IFrame, display

# Plot the optimization history (objective value over trials)
fig_history = vis.plot_optimization_history(study_liner_reg)
fig_history.write_html("optimization_history.html")

# Plot the parameter importances
fig_importances = vis.plot_param_importances(study_liner_reg)
fig_importances.write_html("parameter_importances.html")

# Display the saved HTML files inline using IFrame
display(IFrame(src="optimization_history.html", width="100%", height=500))
display(IFrame(src="parameter_importances.html", width="100%", height=500))

In [22]:
final_estimator = LinearRegression(**best_param_liner_reg)
final_estimator.fit(X_train, y_train)

roo_mean_square_error = cross_val_score(final_estimator, X_train, y_train, cv=cv, scoring='neg_root_mean_squared_error').mean()
print("LinearRegression CV Root mean square error:", roo_mean_square_error)

LinearRegression CV Root mean square error: -20.792848802807097


In [23]:
final_estimator.fit(X_train, y_train)
y_val_pred = final_estimator.predict(X_val)
print('Validation Room mean square error:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Validation Room mean square error: 20.76571893373446


In [24]:
X_test = test_data.drop(['id'], axis=1)

#print(X_test.shape)
test_predictions = final_estimator.predict(X_test)

In [25]:
submission.head()

Unnamed: 0,id,Listening_Time_minutes
0,750000,45.437
1,750001,45.437
2,750002,45.437
3,750003,45.437
4,750004,45.437


In [26]:
# Create submission file
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'Listening_Time_minutes': test_predictions  # Predicted probabilities for rainfall
})

# Save to CSV
#submission_df.to_csv("submission_StackingClassifier_03_28.csv", index=False)

In [27]:
submission_df.head()

Unnamed: 0,id,Listening_Time_minutes
0,750000,45.057823
1,750001,26.053113
2,750002,45.16499
3,750003,55.114087
4,750004,44.316304


In [28]:
submission_df["Listening_Time_minutes"] = submission_df["Listening_Time_minutes"].clip(lower=0, upper=test_data["Episode_Length_minutes"])

In [29]:
submission_df.to_csv("submission_Base_Submission_04_04.csv", index=False)

In [30]:
features = X_train.columns

In [31]:
%%time
FOLDS = 5
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    
oof_xgb = np.zeros(len(train_data))
pred_xgb = np.zeros(len(test_data))

for i, (train_index, test_index) in enumerate(kf.split(train_data)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)
    
    x_train = train_data.loc[train_index,features].copy()
    y_train = train_data.loc[train_index,"Listening_Time_minutes"]    
    x_valid = train_data.loc[test_index,features].copy()
    y_valid = train_data.loc[test_index,"Listening_Time_minutes"]
    x_test = test_data[features].copy()

    model = LinearRegression(**best_param_liner_reg)
    model.fit(x_train, y_train)

    # INFER OOF
    oof_xgb[test_index] = model.predict(x_valid)
    # INFER TEST
    pred_xgb += model.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb /= FOLDS

#########################
### Fold 1
#########################
#########################
### Fold 2
#########################
#########################
### Fold 3
#########################
#########################
### Fold 4
#########################
#########################
### Fold 5
#########################
CPU times: user 1min 3s, sys: 18 s, total: 1min 21s
Wall time: 1min 3s


In [32]:
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'Listening_Time_minutes': pred_xgb  # Predicted probabilities for rainfall
})
submission_df["Listening_Time_minutes"] = submission_df["Listening_Time_minutes"].clip(lower=0, upper=test_data["Episode_Length_minutes"])
submission_df.to_csv("submission_Base_oof_04_04.csv", index=False)