In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw training set into a DataFrame and preview the first rows.
# The location is held in a named constant so it is easy to spot and change.
TRAIN_CSV_PATH = "/Users/sa21/Desktop/Podcast_Prediction/Data/raw/train.csv"
df_train = pd.read_csv(TRAIN_CSV_PATH)
df_train.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [3]:
# Column labels of the training frame, kept for quick reference to the schema.
column_names = df_train.columns
# The original (misspelled) name is kept as an alias so any cell that still
# refers to it keeps working.
colum_names = column_names
column_names

Index(['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes',
       'Genre', 'Host_Popularity_percentage', 'Publication_Day',
       'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads',
       'Episode_Sentiment', 'Listening_Time_minutes'],
      dtype='object')

The EDA showed that episode length, ad count, and sentiment contain outliers and skewed distributions, so they may require scaling or transformation before modeling. In the next cell, missing values are imputed with the median of each podcast's group, an approach that retains important information without distorting the data.

In [4]:
# Impute missing values within each podcast group using the median.
# The same rule applies to every column below, so it is written once and looped.
cols_to_impute = ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads']

for col in cols_to_impute:
    # transform('median') broadcasts each podcast's median back onto its rows,
    # giving a Series aligned with df_train that fillna can use row by row.
    group_median = df_train.groupby('Podcast_Name')[col].transform('median')
    # A podcast whose values are all missing has a NaN median, so those rows
    # would stay empty; fall back to the overall column median for them.
    df_train[col] = df_train[col].fillna(group_median).fillna(df_train[col].median())

# Check that all missing values have been handled
df_train.isnull().sum()



id                             0
Podcast_Name                   0
Episode_Title                  0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
Listening_Time_minutes         0
dtype: int64

In [5]:
# Convert categorical sentiment labels to numeric values for modeling
# Negative = -1, Neutral = 0, Positive = 1
sentiment_map = {'Negative': -1, 'Neutral': 0, 'Positive': 1}
sentiment_encoded = df_train['Episode_Sentiment'].map(sentiment_map)

# Series.map turns every label that is not a key of the mapping into NaN.
# Fail loudly instead of silently writing NaNs to the cleaned file; this also
# stops an accidental re-run of this cell (labels already numeric) from
# blanking the whole column. The check runs before df_train is modified.
unmapped_mask = sentiment_encoded.isna() & df_train['Episode_Sentiment'].notna()
if unmapped_mask.any():
    unexpected_labels = df_train.loc[unmapped_mask, 'Episode_Sentiment'].unique().tolist()
    raise ValueError(f"Unexpected Episode_Sentiment values: {unexpected_labels}")
df_train['Episode_Sentiment'] = sentiment_encoded

# Then one-hot encode only the other categoricals. dtype=int makes the
# indicator columns 0/1 integers directly, so no separate bool -> int cast
# is needed afterwards.
df_train = pd.get_dummies(
    df_train,
    columns=['Genre', 'Publication_Day', 'Publication_Time'],
    dtype=int,
)

# Drop the free-text columns from the modeling frame; errors='ignore' keeps
# this step from failing if a column is already absent.
final_df = df_train.drop(columns=['Episode_Title', 'Podcast_Name'], errors='ignore')


In [6]:
# Save the cleaned data to a new CSV (without the row index).
cleaned_csv_path = Path('/Users/sa21/Desktop/Podcast_Prediction/Data/processed/df_train_cleaned.csv')
# to_csv does not create missing directories, so make sure the output folder
# exists first; exist_ok=True makes this a no-op when it is already there.
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(cleaned_csv_path, index=False)