In [705]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import (
    PowerTransformer,
    StandardScaler,
    OneHotEncoder,
    OrdinalEncoder,
)
from skopt import BayesSearchCV
from skopt.space import Integer, Real
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pickle
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Ridge, SGDRegressor,ElasticNet, Lasso
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from scipy import stats
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.pipeline import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
import re
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from ydata_profiling import ProfileReport
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

In [706]:
# Read the training set. The path variable keeps its "excel" name although the
# file is a CSV; the file is decoded as latin-1.
excel_file_path = "./train.csv"
df = pd.read_csv(filepath_or_buffer=excel_file_path, encoding="latin-1")

In [707]:
# Drop rows whose listening time is exactly 0, then put the target on a
# log1p scale (generate_submission maps predictions back with np.expm1).
# Written as one chained expression so the transformed column lives in a fresh
# frame instead of being assigned onto a filtered slice, which is what
# triggers pandas' SettingWithCopyWarning.
# NOTE: not idempotent - re-running this cell applies log1p a second time.
df = df[df["Listening_Time_minutes"] != 0].assign(
    Listening_Time_minutes=lambda frame: np.log1p(frame["Listening_Time_minutes"])
)

In [708]:
def gen_eda():
    """Write a profiling report for the module-level ``df`` to an HTML file.

    The report is built from a concatenated copy of ``df`` and saved as
    ``pandas_profiling_report.html`` in the working directory. Nothing is
    returned.
    """
    report_input = pd.concat([df], axis=1)
    ProfileReport(
        report_input,
        title="Pandas Profiling Report",
        explorative=True,
    ).to_file("pandas_profiling_report.html")


# gen_eda()

In [709]:
# Peek at five randomly chosen rows (no random_state is set, so the rows differ per run)
df.sample(5)

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
133103,133103,Game Day,Episode 72,103.58,Sports,40.58,Sunday,Morning,65.79,0.0,Positive,4.216767
281957,281957,Study Sessions,Episode 53,59.91,Education,32.72,Friday,Afternoon,,1.0,Negative,3.605093
679566,679566,Gadget Geek,Episode 4,41.49,Technology,95.92,Sunday,Morning,95.75,0.0,Negative,3.652114
519158,519158,Fitness First,Episode 87,14.27,Health,85.03,Friday,Night,9.48,0.0,Neutral,2.418623
495171,495171,Criminal Minds,Episode 53,65.04,True Crime,49.62,Tuesday,Evening,68.0,0.0,Neutral,3.415718


In [710]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame's columns
df.describe()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,741449.0,655066.0,741449.0,598546.0,741448.0,741449.0
mean,374957.717282,65.185957,59.836673,52.328988,1.348355,3.619211
std,216507.054088,32.568739,22.874244,28.493094,1.15267,0.777568
min,0.0,0.0,1.3,0.0,0.0,0.00056
25%,187430.0,36.73,39.38,28.34,0.0,3.213869
50%,374926.0,64.42,60.02,53.78,1.0,3.802549
75%,562461.0,94.33,79.5,76.76,2.0,4.190771
max,749999.0,325.24,119.46,119.91,103.91,4.795543


In [711]:
# Rows without an episode length are discarded. "Guest_Popularity_percentage"
# is intentionally NOT part of the subset, so its gaps stay in the frame.
required_columns = ["Episode_Length_minutes"]
df = df.dropna(subset=required_columns)
# Sanity check: remaining null counts for both columns
remaining_nulls = df[["Episode_Length_minutes", "Guest_Popularity_percentage"]].isnull().sum()
print(remaining_nulls["Episode_Length_minutes"], remaining_nulls["Guest_Popularity_percentage"])

0 120984


In [712]:
def _three_sigma_limits(values):
    """Return ``(upper, lower)`` limits at mean +/- 3 standard deviations."""
    return values.mean() + 3 * values.std(), values.mean() - 3 * values.std()


def _iqr_limits(values):
    """Return ``(upper, lower)`` fences 1.5 * IQR beyond the quartiles."""
    percentile25 = values.quantile(0.25)
    percentile75 = values.quantile(0.75)
    iqr = percentile75 - percentile25
    return percentile75 + 1.5 * iqr, percentile25 - 1.5 * iqr


# Maps each supported key of ``outlier_dict`` to the rule computing its limits.
_OUTLIER_LIMIT_RULES = {"normal": _three_sigma_limits, "skew": _iqr_limits}


def remove_outliers(df, outlier_dict):
    """Trim rows whose values fall outside per-column outlier limits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    outlier_dict : dict
        Maps a rule name to the list of columns it applies to. ``"normal"``
        uses mean +/- 3 standard deviations, ``"skew"`` uses 1.5 * IQR fences.
        Entries under any other key are ignored.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to rows strictly inside the limits of every listed
        column. When no column is listed, ``df`` itself is returned.

    Notes
    -----
    * Rows are removed (trimmed), not capped.
    * Columns are processed one after another, so the limits of a later column
      are computed on the rows that survived the earlier ones.
    * Missing values are kept: NaN is not an outlier, and a plain comparison
      would silently drop every row with a NaN in a listed column.
    * The limits of each column are printed as ``column upper lower``.
    """
    for distribution, category in outlier_dict.items():
        compute_limits = _OUTLIER_LIMIT_RULES.get(distribution)
        if compute_limits is None:
            # Unknown rule names are skipped without touching their columns.
            continue
        for cat in category:
            upper_limit, lower_limit = compute_limits(df[cat])
            print(cat, upper_limit, lower_limit)
            inside_limits = (df[cat] < upper_limit) & (df[cat] > lower_limit)
            # NaN compares False against both limits; keep those rows explicitly.
            df = df[inside_limits | df[cat].isna()]
    return df

In [713]:
# Columns to trim in remove_outliers, keyed by the rule applied to them:
#   "normal" -> mean +/- 3 standard deviations, "skew" -> 1.5 * IQR fences.
# Both lists are empty here, so no column is trimmed.
outlier_dict = {
    "normal": [],
    "skew": [],
}


# Integer codes for the three label-encoded columns.
_ORDINAL_CODES = {
    "Publication_Day": {
        "Monday": 0,
        "Tuesday": 1,
        "Wednesday": 2,
        "Thursday": 3,
        "Friday": 4,
        "Saturday": 5,
        "Sunday": 6,
    },
    "Publication_Time": {"Morning": 0, "Afternoon": 1, "Evening": 2, "Night": 3},
    "Episode_Sentiment": {"Negative": 0, "Neutral": 1, "Positive": 2},
}


def pre_process(df):
    """Encode the day, time-of-day and sentiment columns as integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Publication_Day``, ``Publication_Time`` and
        ``Episode_Sentiment`` (a missing column raises ``KeyError``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the three columns replaced by integer codes.
        Missing or unrecognised values become ``0``.

    Notes
    -----
    * The input frame is left untouched; callers must use the return value.
    * The function is idempotent. Values that already are valid codes pass
      through unchanged, so re-running the notebook cell no longer collapses
      every row to ``0`` (the label lookup alone maps integers to NaN).
    """
    encoded = df.copy()
    for column, codes in _ORDINAL_CODES.items():
        # Labels map to their code; codes additionally map to themselves.
        lookup = {**codes, **{code: code for code in codes.values()}}
        encoded[column] = encoded[column].map(lookup).fillna(0).astype(int)
    return encoded


# Encode the ordinal columns first, then apply the outlier rules from outlier_dict.
df = remove_outliers(pre_process(df), outlier_dict)

In [714]:
# Count how many rows share each (podcast, genre) pair and store that count on
# every row of the pair, then snapshot the prepared frame to disk.
episode_keys = ['Podcast_Name', 'Genre']
episodes_per_show = df.groupby(episode_keys)['Podcast_Name'].transform('count')
df['Number_of_Episodes'] = episodes_per_show
df.to_csv("df.csv", index=False)

In [715]:
# Define features and target
def get_X_Y(df):
    """Split ``df`` into model inputs and the regression target.

    Returns a ``(X, Y)`` tuple: ``X`` is ``df`` without the ``id``,
    ``Listening_Time_minutes`` and ``Episode_Title`` columns, ``Y`` is the
    ``Listening_Time_minutes`` column. ``df`` itself is not modified.
    """
    target_column = "Listening_Time_minutes"
    excluded = ["id", target_column, "Episode_Title"]
    return df.drop(columns=excluded), df[target_column]


X, Y = get_X_Y(df)
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=5)
print(X_train.shape)

(524052, 10)


In [716]:
# Get the list of categorical column names
# For each ordinal column: the sorted distinct values present in df.
categories_order = {
    column: sorted(df[column].unique())
    for column in ("Publication_Day", "Publication_Time", "Episode_Sentiment")
}
categorical_feat_ord = list(categories_order)
categorical_feat_nom = ["Podcast_Name", "Genre"]
categorical = categorical_feat_nom + categorical_feat_ord
# Every remaining training column is treated as numerical.
numerical_features = [col for col in X_train.columns if col not in categorical]
for label, columns in (
    ("numerical_features", numerical_features),
    ("categorical_feat_nom", categorical_feat_nom),
    ("categorical_feat_ord", categorical_feat_ord),
):
    print(label, columns)

numerical_features ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Number_of_Episodes']
categorical_feat_nom ['Podcast_Name', 'Genre']
categorical_feat_ord ['Publication_Day', 'Publication_Time', 'Episode_Sentiment']


In [717]:
# Separate transformers for categorical and numerical features

# Final numeric step. Alternatives that were tried and are currently disabled:
# FunctionTransformer with np.log1p / np.sqrt / np.sin, StandardScaler, MinMaxScaler.
trf = PowerTransformer()
# Degree-2 polynomial expansion without the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Numeric branch: mean-impute -> polynomial features -> power transform.
# The last step is registered under the name "log" although it holds `trf`.
numerical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("poly", poly), ("log", trf)]
)
# Nominal branch: one-hot encoding; categories unseen during fit are ignored.
categorical_transformer_onehot = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)
# Ordinal branch: values unseen during fit are encoded as -1.
categorical_transformer_ordinal = Pipeline(
    steps=[("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))]
)

In [718]:
# Route each group of columns through its own transformer:
# nominal -> one-hot, ordinal -> ordinal codes, numeric -> numeric pipeline.
column_branches = [
    ("cat", categorical_transformer_onehot, categorical_feat_nom),
    ("cat_1", categorical_transformer_ordinal, categorical_feat_ord),
    ("num", numerical_transformer, numerical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [719]:
# # Calculate the correlation matrix
# correlation_matrix = df.corr()

# # Save the correlation matrix to a CSV file
# correlation_matrix.to_csv('correlation_matrix.csv', index=True)

In [None]:
def adjusted_r2_score(pipeline):
    """Print MSE, RMSE, R² and adjusted R² of ``pipeline`` on the hold-out split.

    Parameters
    ----------
    pipeline : fitted estimator or sklearn ``Pipeline``
        Must expose ``predict``. It is evaluated on the module-level
        ``X_test`` / ``Y_test``.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Notes
    -----
    Adjusted R² penalises by ``k``, the number of predictors the model is
    fitted on. That is the width of the matrix reaching the final estimator
    (after one-hot encoding and polynomial expansion), not the number of raw
    input columns. It is read from the final estimator's ``n_features_in_``
    when available; the raw column count of ``X_train`` is the fallback.
    """
    # Evaluate the model
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    r2 = r2_score(Y_test, y_pred)
    n = len(Y_test)  # number of samples
    # A Pipeline keeps its estimators in `steps`; a bare estimator is used as-is.
    final_estimator = pipeline.steps[-1][1] if hasattr(pipeline, "steps") else pipeline
    k = getattr(final_estimator, "n_features_in_", X_train.shape[1])  # number of predictors
    adj_r2 = 1 - (1 - r2) * ((n - 1) / (n - k - 1))
    print(f"R² Score: {r2:.4f}")
    print(f"Adjusted R² Score: {adj_r2:.4f}")

# Estimator candidates. Only the LightGBM regressor is fitted; the Ridge line
# used to be live but was always overwritten before use, so it is disabled
# like the XGBoost alternative.
# model = Ridge(alpha=0.1)
# model = XGBRegressor(objective='reg:squarederror',n_estimators=100,learning_rate=0.1,max_depth=3,random_state=42)
model = LGBMRegressor(objective='regression',n_estimators=100,learning_rate=0.1,max_depth=-1,random_state=42)
# Chain preprocessing and the estimator so both are fitted on the training split only.
pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])
pipeline.fit(X_train, Y_train)
adjusted_r2_score(pipeline)

Mean Squared Error: 0.0780902479340377
Root Mean Squared Error: 0.2794463238871424
R² Score: 0.8721
Adjusted R² Score: 0.8721


In [721]:
# Save the fitted pipeline as a .pkl file
filename_pkl = "model.pkl"
# A context manager guarantees the handle is flushed and closed, even if
# pickling raises; a bare open() inside the call leaves it open.
with open(filename_pkl, "wb") as model_file:
    pickle.dump(pipeline, model_file)
print(f"Model saved as {filename_pkl}")

Model saved as model.pkl


In [722]:
# Columns of the training feature frame, in training order; used to select
# the same columns from the test data before predicting.
column_names = X_train.columns

def test_preprocess(df):
    episode_counts_mapping = df.groupby(['Podcast_Name', 'Genre']).size().to_dict()
    df['Number_of_Episodes'] = df.apply(lambda row: episode_counts_mapping.get((row['Podcast_Name'], row['Genre']), 0), axis=1)
    return df

def generate_submission(test_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(test_file)
    df = pd.DataFrame(df)
    # Replace empty strings with NaN
    df.replace("", np.nan, inplace=True)
    df = pre_process(df)
    df = test_preprocess(df)
    # Select the relevant columns
    filtered_df = df[column_names]
    predictions = pipeline.predict(filtered_df)
    # Load the original test file to keep the PassengerId column
    original_df = pd.read_csv(test_file)
    original_df["Listening_Time_minutes"] = predictions
    original_df["Listening_Time_minutes"] = np.expm1(
        original_df["Listening_Time_minutes"]
    )
    # Save the results to a new CSV file
    submission_df = original_df[["id", "Listening_Time_minutes"]]
    submission_df.to_csv("submission.csv", index=False)
    print("Submission file saved as 'submission.csv'")


# Generate the submission: predict on the test CSV and write submission.csv
test_file = "test.csv"
generate_submission(test_file)

Submission file saved as 'submission.csv'
