In [4]:
import datetime
from pathlib import Path

import joblib
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

In [8]:
def load_data(
    filename: str,
    categorical_columns: tuple[str, ...] = (
        "Genre",
        "Episode_Sentiment",
        "Publication_Day",
        "Publication_Time",
        "Episode_Title",
        "Podcast_Name",
    ),
) -> pd.DataFrame:
    """Read ``./data/<filename>`` and integer-encode its categorical columns.

    Args:
        filename: Name of a CSV file inside the ``./data`` directory.
        categorical_columns: Columns to replace with pandas category codes.
            Defaults to the six text columns of the podcast dataset.

    Returns:
        The loaded frame with every column in ``categorical_columns``
        replaced by its integer category code (missing values become -1).

    Raises:
        FileNotFoundError: If ``./data/<filename>`` is not an existing file.
        KeyError: If a requested categorical column is absent from the CSV.

    Note:
        Codes are derived from the values present in *this* file only, so
        they are comparable between two files only when both contain the
        same set of values in a column.
    """
    csv_path = Path(f"./data/{filename}")
    # An explicit check instead of `assert`: asserts are stripped under
    # `python -O`, and `exists()` would also accept a directory.
    if not csv_path.is_file():
        raise FileNotFoundError(f"Data file not found: {csv_path}")

    df = pd.read_csv(filepath_or_buffer=csv_path)

    for column in categorical_columns:
        df[column] = df[column].astype("category").cat.codes

    return df


# Both splits go through the same loader, so they get the same encoding steps.
train_df = load_data(filename="train.csv")
test_df = load_data(filename="test.csv")

In [9]:
# Column dtypes and non-null counts for the training split (shows which columns have gaps).
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  int8   
 2   Episode_Title                750000 non-null  int8   
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  int8   
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  int8   
 7   Publication_Time             750000 non-null  int8   
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  int8   
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), int8(6)
memory usage: 38.6 MB


In [None]:
# Same dtype / non-null summary for the test split.
test_df.info()

In [None]:
# Pairwise correlation of the numeric columns. The categorical columns were
# replaced by integer category codes in load_data, so they appear here too;
# those codes are labels, not magnitudes, so read their coefficients with care.
train_df.corr(numeric_only=True)

In [None]:
# Target is the listening time; the features are every other column except "id".
y = train_df.loc[:, "Listening_Time_minutes"]
X = train_df.drop(["id", "Listening_Time_minutes"], axis="columns")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [2]:
# Random forest with library-default hyperparameters; only the seed is pinned.
model = RandomForestRegressor(random_state=42)
# Kept as the last expression so the fitted estimator is shown as the cell output.
model.fit(X=X_train, y=y_train)

NameError: name 'RandomForestRegressor' is not defined

In [None]:
# joblib.load() requires the path of the file to read; calling it with no
# arguments raises TypeError and stops a "Run All". Pick up a previously saved
# bundle when one exists, otherwise leave `cached_bundle` as None.
# NOTE: joblib files are pickles - only load bundles from a source you trust.
cached_bundle_path = Path('./models/rf_simple_model_bundle.pkl')
cached_bundle = (
    joblib.load(cached_bundle_path) if cached_bundle_path.is_file() else None
)

In [None]:
# Score the fitted model on the held-out rows.
y_pred = model.predict(X=X_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print("RMSE:", rmse)

In [None]:
# Label each importance score with its feature name, then list them from
# most to least important.
importances = pd.Series(
    data=model.feature_importances_,
    index=X.columns,
)
print(importances.sort_values(ascending=False))

In [5]:
# Location of the saved model bundle, relative to the notebook's working directory.
model_path = Path('./models/rf_simple_model_bundle.pkl')

In [None]:
# Bundle the fitted model with its hold-out score and enough metadata to tell
# later which features, target and library version produced it.
model_bundle = {
    'model': model,
    'metrics': {
        'rmse': rmse,
    },
    "test_size" : 0.2,
    'metadata': {
        'trained_on': str(datetime.datetime.now()),
        'model_type': 'RandomForestRegressor',
        'features': list(X.columns),
        'target': 'Listening_Time_minutes',
        # Pickled estimators are only reliably loadable with the scikit-learn
        # version that wrote them, so record it next to the model.
        'scikit_learn_version': sklearn.__version__,
    }
}

# joblib.dump takes the destination as its second positional argument
# (`filename`); it has no `model_path` keyword, so the previous call raised
# TypeError before anything was written.
model_path = Path('./models/rf_simple_model_bundle.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)  # ./models may not exist on a first run
joblib.dump(model_bundle, model_path)

In [6]:
# Read the saved bundle back from disk. joblib files are pickles, so only
# load a bundle that comes from a source you trust.
model_bundle = joblib.load(model_path)

In [7]:
# Display the loaded bundle to check the stored model, metrics and metadata.
model_bundle

{'model': RandomForestRegressor(random_state=42),
 'metrics': {'rmse': 12.786886034745974},
 'test_size': 0.2,
 'metadata': {'trained_on': '2025-04-19 17:33:29.651741',
  'model_type': 'RandomForestRegressor',
  'features': ['id',
   'Podcast_Name',
   'Episode_Title',
   'Episode_Length_minutes',
   'Genre',
   'Host_Popularity_percentage',
   'Publication_Day',
   'Publication_Time',
   'Guest_Popularity_percentage',
   'Number_of_Ads',
   'Episode_Sentiment'],
  'target': 'Listening_Time_minutes',
  'scikit_learn_version': '1.4.2'}}