In [81]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



In [65]:
path = "D:/Predict Podcast Listening Time ML/"
train= pd.read_csv(path + "train.csv")
test=pd.read_csv(path + "test.csv")
sample_submission=pd.read_csv(path + "sample_submission.csv")

In [66]:
#Preview the train file
print("\nTrain Data:")
print(train.head())
print("\nTest Data:")
print(test.head())
print("\nTrain info:")
train.info()
print("\nTest info:")
test.info() 
print("\nTrain Description:")
print(train.describe())
print("\nTest Description:")
print(test.describe())



Train Data:
   id     Podcast_Name Episode_Title  Episode_Length_minutes       Genre  \
0   0  Mystery Matters    Episode 98                     NaN  True Crime   
1   1    Joke Junction    Episode 26                  119.80      Comedy   
2   2   Study Sessions    Episode 16                   73.90   Education   
3   3   Digital Digest    Episode 45                   67.17  Technology   
4   4      Mind & Body    Episode 86                  110.51      Health   

   Host_Popularity_percentage Publication_Day Publication_Time  \
0                       74.81        Thursday            Night   
1                       66.95        Saturday        Afternoon   
2                       69.97         Tuesday          Evening   
3                       57.22          Monday          Morning   
4                       80.07          Monday        Afternoon   

   Guest_Popularity_percentage  Number_of_Ads Episode_Sentiment  \
0                          NaN            0.0          Positive   

In [67]:
#Calculate the median of the column that have missing values in the training dataset 
len_median=train['Episode_Length_minutes'].median()
guest_median=train['Guest_Popularity_percentage'].median()
ads_median=train['Number_of_Ads'].median()

print(f"Medians for replacing missing values ➜ Episode length: {len_median:.2f}, Guest popularity: {guest_median:.2f}, Ads: {ads_median:.2f}")

#Filling the missing values in the training dataset
train['Episode_Length_minutes'].fillna(len_median, inplace=True)
train['Guest_Popularity_percentage'].fillna(guest_median, inplace=True)
train['Number_of_Ads'].fillna(ads_median, inplace=True)

#Filling the missing values in the test dataset using the medians from the training dataset
#Because train datset is used to train the model 
test['Episode_Length_minutes'].fillna(len_median, inplace=True)
test['Guest_Popularity_percentage'].fillna(guest_median, inplace=True)

#Check for the missing values in the training dataset and the test dataset
print("\nMissing values in the training dataset:")
print(train.isnull().sum())
print("\nMissing values in the test dataset:")
print(test.isnull().sum())  


Medians for replacing missing values ➜ Episode length: 63.84, Guest popularity: 53.58, Ads: 1.00

Missing values in the training dataset:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Episode_Length_minutes'].fillna(len_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Guest_Popularity_percentage'].fillna(guest_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interme

id                             0
Podcast_Name                   0
Episode_Title                  0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
Listening_Time_minutes         0
dtype: int64

Missing values in the test dataset:
id                             0
Podcast_Name                   0
Episode_Title                  0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
dtype: int64


In [68]:
#Convert the categorical variables to numerical variables
#Map Prediction_Time to numerical values
time_map={
    'Morning': 9,
    'Afternoon': 15, 
    'Evening': 19,
    'Night': 22
}

train['Publication_Hour'] = train['Publication_Time'].map(time_map)
test['Publication_Hour'] = test['Publication_Time'].map(time_map)
#Map publication day to numerical values
day_map={
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7
}
train['Publication_Day_Num'] =train['Publication_Day'].map(day_map)
test['Publication_Day_Num'] =test['Publication_Day'].map(day_map)

#Drop unnecessary columns(Publication_Time, Publication_Day)
train.drop(['Publication_Time', 'Publication_Day'], axis=1, inplace=True)
test.drop(['Publication_Time', 'Publication_Day'], axis=1, inplace=True)


In [69]:
train.info()
test.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       750000 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Guest_Popularity_percentage  750000 non-null  float64
 7   Number_of_Ads                750000 non-null  float64
 8   Episode_Sentiment            750000 non-null  object 
 9   Listening_Time_minutes       750000 non-null  float64
 10  Publication_Hour             750000 non-null  int64  
 11  Publication_Day_Num          750000 non-null  int64  
dtypes: float64(5), int64(3), object(4)
memory usage: 68.7+ MB


In [70]:
# Create label encoders
genre_encoder = LabelEncoder()
sentiment_encoder = LabelEncoder()

# Fit on training and apply to both train and test
train['Genre'] = genre_encoder.fit_transform(train['Genre'])
test['Genre'] = genre_encoder.transform(test['Genre'])

train['Episode_Sentiment'] = sentiment_encoder.fit_transform(train['Episode_Sentiment'])
test['Episode_Sentiment'] = sentiment_encoder.transform(test['Episode_Sentiment'])

In [71]:
columns_to_drop = ['id', 'Podcast_Name', 'Episode_Title']

train.drop(columns=columns_to_drop, axis=1, inplace=True)
test.drop(columns=columns_to_drop, axis=1, inplace=True)


In [72]:
print(train.info())
print(test.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Episode_Length_minutes       750000 non-null  float64
 1   Genre                        750000 non-null  int64  
 2   Host_Popularity_percentage   750000 non-null  float64
 3   Guest_Popularity_percentage  750000 non-null  float64
 4   Number_of_Ads                750000 non-null  float64
 5   Episode_Sentiment            750000 non-null  int64  
 6   Listening_Time_minutes       750000 non-null  float64
 7   Publication_Hour             750000 non-null  int64  
 8   Publication_Day_Num          750000 non-null  int64  
dtypes: float64(5), int64(4)
memory usage: 51.5 MB
None
   Episode_Length_minutes  Genre  Host_Popularity_percentage  \
0                   78.96      2                       38.11   
1                   27.87      5                       71

In [73]:
# List columns to scale
columns_to_scale = [
    'Episode_Length_minutes',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
    'Publication_Hour',
    'Publication_Day_Num'
]

# Initialize scaler
scaler = StandardScaler()

# Fit scaler on training data and transform
train[columns_to_scale] = scaler.fit_transform(train[columns_to_scale])

# Transform test data with the same scaler
test[columns_to_scale] = scaler.transform(test[columns_to_scale])

In [74]:
print(train[columns_to_scale].mean())
print(train[columns_to_scale].std())



Episode_Length_minutes         4.634918e-16
Host_Popularity_percentage    -6.141552e-16
Guest_Popularity_percentage   -9.439797e-17
Number_of_Ads                 -1.552110e-16
Publication_Hour               3.294266e-16
Publication_Day_Num           -3.172810e-17
dtype: float64
Episode_Length_minutes         1.000001
Host_Popularity_percentage     1.000001
Guest_Popularity_percentage    1.000001
Number_of_Ads                  1.000001
Publication_Hour               1.000001
Publication_Day_Num            1.000001
dtype: float64


In [75]:
X_train = train.drop('Listening_Time_minutes', axis=1)
y_train = train['Listening_Time_minutes']
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)




In [76]:
train_data = lgb.Dataset(X_train, label=y_train)  # Training part
val_data = lgb.Dataset(X_val, label=y_val)        # Validation part


In [77]:
params = {
    'objective': 'regression',        # We're predicting a number (Listening time)
    'metric': 'rmse',                 # We'll measure error using RMSE
    'boosting_type': 'gbdt',          # Gradient Boosted Trees (default method)
    'learning_rate': 0.1,             # How fast it learns (smaller = slower but better)
    'num_leaves': 31,                 # Complexity of the trees (start simple)
    'verbose': -1,                    # Don't print too much stuff
    'random_state': 42                # For consistent results
}


In [83]:
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[early_stopping(50), log_evaluation(100)]
)


Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 13.0536	valid's rmse: 13.0729
[200]	train's rmse: 12.9628	valid's rmse: 13.0495
[300]	train's rmse: 12.8904	valid's rmse: 13.0328
[400]	train's rmse: 12.8276	valid's rmse: 13.0228
[500]	train's rmse: 12.7691	valid's rmse: 13.0122
[600]	train's rmse: 12.7139	valid's rmse: 13.0033
[700]	train's rmse: 12.6645	valid's rmse: 12.9971
[800]	train's rmse: 12.6168	valid's rmse: 12.991
[900]	train's rmse: 12.5733	valid's rmse: 12.9854
[1000]	train's rmse: 12.5316	valid's rmse: 12.9816
Did not meet early stopping. Best iteration is:
[996]	train's rmse: 12.5339	valid's rmse: 12.9813


In [84]:
# This assumes your test set is already preprocessed and scaled just like train
X_test = test.copy()


In [85]:
# 1. Predict on test set using the trained model
y_test_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [86]:
# 2. Load the sample submission file
submission = pd.read_csv('sample_submission.csv')

In [89]:
# 2. Load the sample submission file
submission = pd.read_csv('sample_submission.csv')


In [90]:
# 3. Insert your predictions into the correct column
submission['Listening_Time_minutes'] = y_test_pred

In [91]:
# 4. Save your submission file (name it meaningfully)
submission.to_csv('submission_lgb.csv', index=False)