In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


In [2]:
import warnings 
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error

import optuna

In [3]:
# Data reading 
train_data = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s5e4/test.csv")
submission = pd.read_csv("/kaggle/input/playground-series-s5e4/sample_submission.csv")

In [4]:
# Configuration 
numerical_variables = ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']
categorial_valiable = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment'] 
target_variable = 'Listening_Time_minutes'

In [5]:
# Utility function 

def noise_removal(df):
    df["Number_of_Ads"] = df['Number_of_Ads'].apply(lambda x: x if x < 4 else 4)
    df["Episode_Length_minutes"] = df['Episode_Length_minutes'].apply(lambda x: x if x < 150 else 150)
    return df

def missing_value_imputation(df):
    df["Episode_Length_minutes"] = df.groupby("Podcast_Name")["Episode_Length_minutes"].transform(lambda x: x.fillna(x.mean()))
    df["Guest_Popularity_percentage"] = df.groupby("Podcast_Name")["Guest_Popularity_percentage"].transform(lambda x: x.fillna(x.mean()))
    df["Number_of_Ads"] = df["Number_of_Ads"].fillna(2)
    return df

def bin_popularity(value):
    if value < 20:
        return 'Low'
    elif value < 60:
        return 'Medium'
    else:
        return 'High'

INTERACT = []
def feature_engineering(train, test):
    global numerical_variables
    # Numerical columns 
    ## Interaction features
    if len(INTERACT) > 0:
        INTERACT.clear()
    for i,c1 in enumerate(numerical_variables):
        for j,c2 in enumerate(numerical_variables[i+1:]):
            feature = f"{c1}_{c2}"
            train[feature] = train[c1] * train[c2]
            test[feature] = test[c1] * test[c2]
            INTERACT.append(feature)
    numerical_variables += INTERACT
    print(f"There are {len(INTERACT)} interaction features:")
    print(INTERACT)
    
    train['Host_Guest_Avg_Popularity'] = (train['Host_Popularity_percentage'] + train['Guest_Popularity_percentage']) / 2
    test['Host_Guest_Avg_Popularity'] = (test['Host_Popularity_percentage'] + test['Guest_Popularity_percentage']) / 2
    numerical_variables.append("Host_Guest_Avg_Popularity")
    
    train["Host_Popularity_Level"] = train["Host_Popularity_percentage"].apply(bin_popularity)
    test["Host_Popularity_Level"] = test["Host_Popularity_percentage"].apply(bin_popularity)
    categorial_valiable.append("Host_Popularity_Level")
    train["Guest_Popularity_Level"] = train["Guest_Popularity_percentage"].apply(bin_popularity)
    test["Guest_Popularity_Level"] = test["Guest_Popularity_percentage"].apply(bin_popularity)
    categorial_valiable.append("Guest_Popularity_Level")
    
    train['Has_Ads'] = train['Number_of_Ads'].apply(lambda x: 1 if x > 0 else 0)
    test['Has_Ads'] = test['Number_of_Ads'].apply(lambda x: 1 if x > 0 else 0)
    
    train["Ads_Per_Minutes"] = train["Episode_Length_minutes"] / (train["Number_of_Ads"]+1)
    test["Ads_Per_Minutes"] = test["Episode_Length_minutes"] / (test["Number_of_Ads"]+1)
    numerical_variables.append("Ads_Per_Minutes")

    # Categorical features
    train['Is_Weekend'] = train['Publication_Day'].apply(lambda x: 1 if x == "Saturday" or x == "Sunday" else 0)
    test['Is_Weekend'] = test['Publication_Day'].apply(lambda x: 1 if x == "Saturday" or x == "Sunday" else 0)

    return train, test

In [6]:
def one_hot_encode_features(train, test, categorical_features):
    # Initialize the OneHotEncoder with drop='first'
    encoder = OneHotEncoder(drop='first', sparse=False)
    
    # Fit and transform the train set
    train_encoded = encoder.fit_transform(train[categorical_features])
    
    # Transform the test set
    test_encoded = encoder.transform(test[categorical_features])
    
    # Convert the encoded arrays to DataFrame
    train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(categorical_features))
    test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(categorical_features))
    
    # Drop the original categorical columns from train and test DataFrames
    train.drop(columns=categorical_features, inplace=True)
    test.drop(columns=categorical_features, inplace=True)
    
    # Concatenate the one-hot encoded features to the original DataFrames
    train = pd.concat([train, train_encoded_df], axis=1)
    test = pd.concat([test, test_encoded_df], axis=1)
    
    return train, test


def standardize_features(train, test, numerical_features):
    # Initialize the StandardScaler
    scaler = StandardScaler()
    
    # Fit and transform the training data
    train_scaled = scaler.fit_transform(train[numerical_features])
    
    # Transform the test data
    test_scaled = scaler.transform(test[numerical_features])
    
    # Convert the scaled arrays back to DataFrames
    train_scaled_df = pd.DataFrame(train_scaled, columns=numerical_features)
    test_scaled_df = pd.DataFrame(test_scaled, columns=numerical_features)
    
    # Replace the original columns with the standardized columns
    train[numerical_features] = train_scaled_df
    test[numerical_features] = test_scaled_df
    
    return train, test

In [7]:
train_data.columns

Index(['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes',
       'Genre', 'Host_Popularity_percentage', 'Publication_Day',
       'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads',
       'Episode_Sentiment', 'Listening_Time_minutes'],
      dtype='object')

In [8]:
# Data Processing 
train_data = noise_removal(train_data)
test_data = noise_removal(test_data)
train_data = missing_value_imputation(train_data)
test_data = missing_value_imputation(test_data)
train_data, test_data = feature_engineering(train_data, test_data)

There are 6 interaction features:
['Episode_Length_minutes_Host_Popularity_percentage', 'Episode_Length_minutes_Guest_Popularity_percentage', 'Episode_Length_minutes_Number_of_Ads', 'Host_Popularity_percentage_Guest_Popularity_percentage', 'Host_Popularity_percentage_Number_of_Ads', 'Guest_Popularity_percentage_Number_of_Ads']


In [9]:
print(INTERACT)
print(train_data.shape)
print(test_data.shape)
print(categorial_valiable)
print(numerical_variables)

['Episode_Length_minutes_Host_Popularity_percentage', 'Episode_Length_minutes_Guest_Popularity_percentage', 'Episode_Length_minutes_Number_of_Ads', 'Host_Popularity_percentage_Guest_Popularity_percentage', 'Host_Popularity_percentage_Number_of_Ads', 'Guest_Popularity_percentage_Number_of_Ads']
(750000, 24)
(250000, 23)
['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment', 'Host_Popularity_Level', 'Guest_Popularity_Level']
['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Length_minutes_Host_Popularity_percentage', 'Episode_Length_minutes_Guest_Popularity_percentage', 'Episode_Length_minutes_Number_of_Ads', 'Host_Popularity_percentage_Guest_Popularity_percentage', 'Host_Popularity_percentage_Number_of_Ads', 'Guest_Popularity_percentage_Number_of_Ads', 'Host_Guest_Avg_Popularity', 'Ads_Per_Minutes']


In [10]:
train_data, test_data = one_hot_encode_features(train_data, test_data, categorial_valiable)

train_data, test_data = standardize_features(train_data, test_data, numerical_variables)

In [11]:
print(train_data.shape)
print(test_data.shape)

(750000, 186)
(250000, 185)


In [12]:
X = train_data.drop(['Listening_Time_minutes', 'id'], axis=1)
y = train_data['Listening_Time_minutes']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(len(y_train), len(y_val))

600000 150000


In [13]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [14]:
def objective_ridge(trial):
    alpha = trial.suggest_float("alpha", 1e-5, 10.0, log=True)
    solver = trial.suggest_categorical("solver", ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'])

    # Handle 'lbfgs' solver to set positive=True
    if solver == 'lbfgs':
        clf = Ridge(alpha=alpha, solver=solver, positive=True, random_state=42)
    else:
        clf = Ridge(alpha=alpha, solver=solver, random_state=42)

    score = cross_val_score(clf, X_train, y_train, cv=cv, scoring='neg_mean_squared_error').mean()
    rmse_score = np.sqrt(-score)
    
    return rmse_score

study_ridge = optuna.create_study(direction="minimize")
study_ridge.optimize(objective_ridge, n_trials=50)
best_param_ridge = study_ridge.best_trial.params
print("Best params for Ridge:", best_param_ridge)

[I 2025-04-04 09:36:14,380] A new study created in memory with name: no-name-d2f1ab98-f0ed-4f3f-8d83-3cddb4414c24
[I 2025-04-04 09:45:25,469] Trial 0 finished with value: 20.792919358419006 and parameters: {'alpha': 1.747104854553725e-05, 'solver': 'sag'}. Best is trial 0 with value: 20.792919358419006.
[I 2025-04-04 09:45:34,329] Trial 1 finished with value: 20.79291955287596 and parameters: {'alpha': 0.003049533894394569, 'solver': 'cholesky'}. Best is trial 0 with value: 20.792919358419006.
[I 2025-04-04 09:45:42,983] Trial 2 finished with value: 20.792919609780906 and parameters: {'alpha': 0.00036682736056467893, 'solver': 'cholesky'}. Best is trial 0 with value: 20.792919358419006.
[I 2025-04-04 09:45:59,732] Trial 3 finished with value: 20.869592293844974 and parameters: {'alpha': 0.02921341196730695, 'solver': 'lbfgs'}. Best is trial 0 with value: 20.792919358419006.
[I 2025-04-04 09:46:16,599] Trial 4 finished with value: 20.869601794598466 and parameters: {'alpha': 3.126586677

Best params for Ridge: {'alpha': 9.914691181288683, 'solver': 'sag'}


In [15]:
import optuna.visualization as vis
from IPython.display import IFrame, display

# Plot the optimization history (objective value over trials)
fig_history = vis.plot_optimization_history(study_ridge)
fig_history.write_html("optimization_history.html")

# Plot the parameter importances
fig_importances = vis.plot_param_importances(study_ridge)
fig_importances.write_html("parameter_importances.html")

# Display the saved HTML files inline using IFrame
display(IFrame(src="optimization_history.html", width="100%", height=500))
display(IFrame(src="parameter_importances.html", width="100%", height=500))

In [16]:
final_estimator = Ridge(random_state=42, **best_param_ridge)
final_estimator.fit(X_train, y_train)

root_mean_square_error = cross_val_score(final_estimator, X_train, y_train, cv=cv, scoring='neg_root_mean_squared_error').mean()
print("Ridge CV Root mean square error:", root_mean_square_error)

Ridge CV Root mean square error: -20.79277949381487


In [17]:
final_estimator.fit(X_train, y_train)
y_val_pred = final_estimator.predict(X_val)
print('Ridge validation Room mean square error:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

Ridge validation Room mean square error: 20.765655996893777


In [19]:
X_test = test_data.drop(['id'], axis=1)

#print(X_test.shape)
final_estimator.fit(X, y)
test_predictions = final_estimator.predict(X_test)

In [20]:
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'Listening_Time_minutes': test_predictions  # Predicted probabilities for rainfall
})

submission_df["Listening_Time_minutes"] = submission_df["Listening_Time_minutes"].clip(
    lower=0, upper=test_data["Episode_Length_minutes"])

submission_df.to_csv("submission_Ridge_04_04_v2.csv", index=False)