In [1]:
from IPython.display import display
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from xgboost import XGBRegressor

## Reading the CSV files

In [2]:
# All three competition files live in the same directory; keeping that location
# in one constant means a run from another location needs a single edit.
DATA_DIR = "/kaggle/input/playground-series-s5e4"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission_df = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

## Details of the data tables

In [3]:
def df_details(df):
    """Print a quick exploratory summary of a DataFrame.

    Shown in order: the shape, the first five rows, ``df.info()``, the numeric
    ``describe()`` table, the object-column ``describe()`` table (only when
    object columns exist), the per-column missing-value counts, the per-column
    missing-value percentages, and the overall percentage of missing cells.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to the cell output.
    """
    rule = "------------------------------------------------"

    print(rule)
    print("df shape :", df.shape)
    print(rule)

    print("\n" + rule)
    print("df top 5 data :")
    print(rule)
    display(df.head(5))

    print("\n\n" + rule)
    print("df info :")
    print(rule)
    # DataFrame.info() prints its report itself and returns None, so it must not
    # be wrapped in display() -- that would render a stray "None" after the report.
    df.info()

    print("\n\n" + rule)
    print("df describe numeric data :")
    print(rule)
    display(df.describe())

    obs_cols = df.select_dtypes(include='object').columns
    if len(obs_cols) > 0:
        print("\n\n" + rule)
        print("df describe object data :")
        print(rule)
        display(df.describe(include=object))
    else:
        print("\n\n" + rule)
        print("No object data available :")
        print(rule)

    # Count the nulls once and reuse the result for all three missing-value reports.
    missing_counts = df.isnull().sum()

    print("\n\n" + rule)
    print("Missing Values :")
    print(rule)
    print(missing_counts[missing_counts > 0])

    missing_percentage = (missing_counts / len(df)) * 100
    print("\n\n" + rule)
    print("Percentage of Missing values: (%) ")
    print(rule)
    print(missing_percentage[missing_percentage > 0])

    # Guard the division so a frame without any cells reports 0% instead of NaN.
    total_cells = df.size
    total_missing_percentage = (
        (missing_counts.sum() / total_cells) * 100 if total_cells else 0.0
    )
    print("\n\n" + rule)
    print(f"Total missing values percentage: {total_missing_percentage:.2f}%")
    print(rule)

In [4]:
# Print the same banner-plus-summary for each raw table. The second banner is
# preceded by extra blank lines to set it apart from the first table's output.
for lead, label, frame in (
    ("\n", "train_df", train_df),
    ("\n\n\n\n", "test_df", test_df),
):
    print(f"{lead}****************************************************")
    print(f"Details of {label}: ")
    print("\n****************************************************")
    df_details(frame)


****************************************************
Details of train_df: 

****************************************************
------------------------------------------------
df shape : (750000, 12)
------------------------------------------------

------------------------------------------------
df top 5 data :
------------------------------------------------


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031




------------------------------------------------
df info :
------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listeni

None



------------------------------------------------
df describe numeric data :
------------------------------------------------


Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,750000.0,662907.0,750000.0,603970.0,749999.0,750000.0
mean,374999.5,64.504738,59.859901,52.236449,1.348855,45.437406
std,216506.495284,32.969603,22.873098,28.451241,1.15113,27.138306
min,0.0,0.0,1.3,0.0,0.0,0.0
25%,187499.75,35.73,39.41,28.38,0.0,23.17835
50%,374999.5,63.84,60.05,53.58,1.0,43.37946
75%,562499.25,94.07,79.53,76.6,2.0,64.81158
max,749999.0,325.24,119.46,119.91,103.91,119.97




------------------------------------------------
df describe object data :
------------------------------------------------


Unnamed: 0,Podcast_Name,Episode_Title,Genre,Publication_Day,Publication_Time,Episode_Sentiment
count,750000,750000,750000,750000,750000,750000
unique,48,100,10,7,4,3
top,Tech Talks,Episode 71,Sports,Sunday,Night,Neutral
freq,22847,10515,87606,115946,196849,251291




------------------------------------------------
Missing Values :
------------------------------------------------
Episode_Length_minutes          87093
Guest_Popularity_percentage    146030
Number_of_Ads                       1
dtype: int64


------------------------------------------------
Percentage of Missing values: (%) 
------------------------------------------------
Episode_Length_minutes         11.612400
Guest_Popularity_percentage    19.470667
Number_of_Ads                   0.000133
dtype: float64


------------------------------------------------
Total missing values percentage: 2.59%
------------------------------------------------




****************************************************
Details of test_df: 

****************************************************
------------------------------------------------
df shape : (250000, 11)
------------------------------------------------

------------------------------------------------
df top 5 data :
------------------------

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
0,750000,Educational Nuggets,Episode 73,78.96,Education,38.11,Saturday,Evening,53.33,1.0,Neutral
1,750001,Sound Waves,Episode 23,27.87,Music,71.29,Sunday,Morning,,0.0,Neutral
2,750002,Joke Junction,Episode 11,69.1,Comedy,67.89,Friday,Evening,97.51,0.0,Positive
3,750003,Comedy Corner,Episode 73,115.39,Comedy,23.4,Sunday,Morning,51.75,2.0,Positive
4,750004,Life Lessons,Episode 50,72.32,Lifestyle,58.1,Wednesday,Morning,11.3,2.0,Neutral




------------------------------------------------
df info :
------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           250000 non-null  int64  
 1   Podcast_Name                 250000 non-null  object 
 2   Episode_Title                250000 non-null  object 
 3   Episode_Length_minutes       221264 non-null  float64
 4   Genre                        250000 non-null  object 
 5   Host_Popularity_percentage   250000 non-null  float64
 6   Publication_Day              250000 non-null  object 
 7   Publication_Time             250000 non-null  object 
 8   Guest_Popularity_percentage  201168 non-null  float64
 9   Number_of_Ads                250000 non-null  float64
 10  Episode_Sentiment            250000 non-null  object 
dtypes: floa

None



------------------------------------------------
df describe numeric data :
------------------------------------------------


Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads
count,250000.0,221264.0,250000.0,201168.0,250000.0
mean,874999.5,419.2987,59.716491,52.192796,1.355852
std,72168.927986,166854.5,22.880028,28.445034,4.274399
min,750000.0,2.47,2.49,0.0,0.0
25%,812499.75,35.78,39.25,28.32,0.0
50%,874999.5,63.97,59.9,53.36,1.0
75%,937499.25,94.15,79.39,76.56,2.0
max,999999.0,78486260.0,117.76,116.82,2063.0




------------------------------------------------
df describe object data :
------------------------------------------------


Unnamed: 0,Podcast_Name,Episode_Title,Genre,Publication_Day,Publication_Time,Episode_Sentiment
count,250000,250000,250000,250000,250000,250000
unique,48,100,10,7,4,3
top,Tech Talks,Episode 71,Sports,Sunday,Night,Neutral
freq,7553,3492,28903,38906,65440,83671




------------------------------------------------
Missing Values :
------------------------------------------------
Episode_Length_minutes         28736
Guest_Popularity_percentage    48832
dtype: int64


------------------------------------------------
Percentage of Missing values: (%) 
------------------------------------------------
Episode_Length_minutes         11.4944
Guest_Popularity_percentage    19.5328
dtype: float64


------------------------------------------------
Total missing values percentage: 2.82%
------------------------------------------------


## Handling missing values and object (categorical) fields

### Categorical encoding (one-hot and ordinal)

In [5]:
def encoding(X_train, X_valid, test):
    """Encode the object-dtype columns of three frames that share a schema.

    Object columns are detected on ``X_train``. Missing categorical values are
    replaced by the literal string ``'missing'``. Columns with at most 10
    distinct training values are one-hot encoded; the others are ordinal
    encoded, with categories unseen during fitting mapped to ``-1``. Both
    encoders are fitted on ``X_train`` only and then applied to all frames.

    Parameters
    ----------
    X_train, X_valid, test : pandas.DataFrame
        Frames with the same feature columns. They are not modified; the
        function works on copies.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_encoded, X_valid_encoded, test_encoded)``, each with a fresh
        ``RangeIndex``. Column order: the non-categorical columns, then the
        one-hot columns, then the ordinal columns.
    """
    cat_cols = X_train.select_dtypes(include='object').columns.tolist()
    print("\n------------------------------------------------")
    print(f"Categorical columns: {cat_cols}")
    print("------------------------------------------------")

    # Work on copies: assigning the filled columns back onto the arguments would
    # silently rewrite the caller's frames (and may trigger pandas'
    # SettingWithCopyWarning when those frames are slices of another frame).
    frames = [frame.copy() for frame in (X_train, X_valid, test)]
    if cat_cols:
        for frame in frames:
            frame[cat_cols] = frame[cat_cols].fillna('missing')
    train = frames[0]

    # Cardinality is measured once, on the training frame after filling, so the
    # 'missing' placeholder counts as a category.
    cardinality = train[cat_cols].nunique()
    onehot_cols = [col for col in cat_cols if cardinality[col] <= 10]
    ordinal_cols = [col for col in cat_cols if cardinality[col] > 10]

    # Start from the non-categorical columns; dropping the index lets the encoded
    # blocks (which come back with a default RangeIndex) line up row by row.
    encoded = [frame.drop(cat_cols, axis=1).reset_index(drop=True) for frame in frames]

    def _append_encoded(current, encoder, source_cols, out_cols):
        # Transform source_cols of every frame and glue the result on the right.
        return [
            pd.concat(
                [done, pd.DataFrame(encoder.transform(frame[source_cols]), columns=out_cols)],
                axis=1,
            )
            for done, frame in zip(current, frames)
        ]

    # One-hot encoding for categorical columns with at most 10 unique values.
    if onehot_cols:
        onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        onehot_encoder.fit(train[onehot_cols])
        onehot_names = onehot_encoder.get_feature_names_out(onehot_cols)
        encoded = _append_encoded(encoded, onehot_encoder, onehot_cols, onehot_names)

    # Ordinal encoding for categorical columns with more than 10 unique values.
    if ordinal_cols:
        ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        ordinal_encoder.fit(train[ordinal_cols])
        encoded = _append_encoded(encoded, ordinal_encoder, ordinal_cols, ordinal_cols)

    return tuple(encoded)

### Imputation

In [6]:
def imputation(X_train, X_valid, test, strategy='mean'):
    """Fill missing values in three frames with a single ``SimpleImputer``.

    The imputer is fitted on ``X_train`` only; the fill values learned there
    are then applied to ``X_valid`` and ``test``.

    Parameters
    ----------
    X_train, X_valid, test : pandas.DataFrame
        Frames to impute.
    strategy : str, default 'mean'
        Passed straight to ``SimpleImputer(strategy=...)``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(imputed_X_train, imputed_X_valid, imputed_test)``; each carries the
        column labels of its input and a default integer index.
    """
    def _relabel(values, like):
        # Rebuild a frame from the imputer output and restore the column labels.
        rebuilt = pd.DataFrame(values)
        rebuilt.columns = like.columns
        return rebuilt

    filler = SimpleImputer(strategy=strategy)

    # Fit on the training data first, then reuse the fitted imputer.
    train_filled = _relabel(filler.fit_transform(X_train), X_train)
    valid_filled = _relabel(filler.transform(X_valid), X_valid)
    test_filled = _relabel(filler.transform(test), test)

    return train_filled, valid_filled, test_filled

### Feature Scaling

In [7]:
def scaling(X_train, X_valid, test):
    """Standardise three frames with a single ``StandardScaler``.

    The scaler is fitted on ``X_train`` only and then applied unchanged to
    ``X_valid`` and ``test``.

    Parameters
    ----------
    X_train, X_valid, test : pandas.DataFrame
        Frames to scale.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_data, X_valid_data, test_data)``; each carries the column
        labels of its input and a default integer index.
    """
    standardiser = StandardScaler()
    sources = (X_train, X_valid, test)

    # Fit on the training frame first; the other frames reuse its statistics.
    scaled = [standardiser.fit_transform(X_train)]
    scaled.extend(standardiser.transform(frame) for frame in sources[1:])

    return tuple(
        pd.DataFrame(values, columns=frame.columns)
        for values, frame in zip(scaled, sources)
    )

### Splitting the dataset

In [8]:
# Separate the target from the features; 'id' is removed from the features as well.
y = train_df["Listening_Time_minutes"]
X = train_df.drop(columns=["Listening_Time_minutes", "id"])

# test_df is reassigned from itself, so on a second run of this cell the 'id'
# column is already gone; errors="ignore" keeps the cell re-runnable instead of
# raising KeyError.
test_df = test_df.drop(columns=["id"], errors="ignore")

print("X shape : ", X.shape)
print("y shape : ",y.shape)

# Divide data into training and validation subsets (80% / 20%, fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)
print("X_train shape : ",X_train.shape)
print("y_train shape : ",y_train.shape)
print("X_valid shape : ",X_valid.shape)
print("y_valid shape : ",y_valid.shape)

X shape :  (750000, 10)
y shape :  (750000,)
X_train shape :  (600000, 10)
y_train shape :  (600000,)
X_valid shape :  (150000, 10)
y_valid shape :  (150000,)


In [9]:
# Run the three preprocessing stages in order. Every stage takes the
# (train, valid, test) triple and returns the transformed triple.
for message, stage in (
    ("Categorical Encoding...", encoding),
    ("Numerical imputation...", imputation),
    ("Feature Scaling...", scaling),
):
    print(message)
    X_train, X_valid, test_df = stage(X_train, X_valid, test_df)

# All three frames should end up with the same number of columns.
for label, frame in (("X_train", X_train), ("X_valid", X_valid), ("test_df", test_df)):
    print(f"{label} shape: ", frame.shape)

Categorical Encoding...

------------------------------------------------
Categorical columns: ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
------------------------------------------------
Numerical imputation...
Feature Scaling...
X_train shape:  (600000, 30)
X_valid shape:  (150000, 30)
test_df shape:  (250000, 30)


In [10]:
# Summarise each preprocessed frame with the same banner layout.
for label, frame in (("X_train", X_train), ("X_valid", X_valid), ("test_df", test_df)):
    print("\n****************************************************")
    print(f"Details of {label}: ")
    print("\n****************************************************")
    df_details(frame)


****************************************************
Details of X_train: 

****************************************************
------------------------------------------------
df shape : (600000, 30)
------------------------------------------------

------------------------------------------------
df top 5 data :
------------------------------------------------


Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,Genre_Music,...,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_Negative,Episode_Sentiment_Neutral,Episode_Sentiment_Positive,Podcast_Name,Episode_Title
0,0.098693,0.019398,0.0,-0.302721,-0.347041,-0.349034,-0.264859,3.083485,-0.351484,-0.302454,...,-0.409669,-0.561217,-0.593732,-0.556918,1.673548,1.414155,-0.709996,-0.704192,-0.673329,0.984346
1,-0.984701,-0.051004,-0.1758,1.434609,-0.347041,2.865053,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,-0.593732,1.795595,-0.597533,1.414155,-0.709996,-0.704192,-0.107437,0.522993
2,-0.625182,-1.329178,0.0,1.434609,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,1.684261,-0.556918,-0.597533,-0.707136,-0.709996,1.420067,1.095084,1.481189
3,0.617819,0.174196,0.0,1.434609,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,1.684261,-0.556918,-0.597533,1.414155,-0.709996,-0.704192,1.448767,-1.180468
4,1.121469,-1.491409,-0.948581,-1.171386,-0.347041,-0.349034,-0.264859,-0.324308,2.845082,-0.302454,...,-0.409669,-0.561217,-0.593732,1.795595,-0.597533,-0.707136,1.408458,-0.704192,0.316982,-0.577159




------------------------------------------------
df info :
------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 30 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Episode_Length_minutes       600000 non-null  float64
 1   Host_Popularity_percentage   600000 non-null  float64
 2   Guest_Popularity_percentage  600000 non-null  float64
 3   Number_of_Ads                600000 non-null  float64
 4   Genre_Business               600000 non-null  float64
 5   Genre_Comedy                 600000 non-null  float64
 6   Genre_Education              600000 non-null  float64
 7   Genre_Health                 600000 non-null  float64
 8   Genre_Lifestyle              600000 non-null  float64
 9   Genre_Music                  600000 non-null  float64
 10  Genre_News                   600000 non-null  float64
 11  Genre_S

None



------------------------------------------------
df describe numeric data :
------------------------------------------------


Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,Genre_Music,...,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_Negative,Episode_Sentiment_Neutral,Episode_Sentiment_Positive,Podcast_Name,Episode_Title
count,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,...,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0
mean,3.197442e-16,3.479765e-16,4.2253610000000006e-17,-9.75812e-18,-1.44477e-17,3.8843e-17,6.825947000000001e-17,4.471682e-17,1.875833e-17,1.0610770000000001e-17,...,-5.674868e-17,-1.800042e-18,1.392664e-17,-3.8748260000000006e-17,7.778074e-17,-2.169524e-17,-7.654914e-17,-2.7948010000000004e-17,-7.384908000000001e-17,3.8464050000000004e-17
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,...,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-2.080993,-2.561437,-2.045671,-1.171386,-0.3470414,-0.3490337,-0.2648587,-0.3243084,-0.3514837,-0.3024544,...,-0.409669,-0.5612172,-0.5937323,-0.5569184,-0.5975328,-0.7071359,-0.7099964,-0.704192,-1.66364,-1.783777
25%,-0.8096171,-0.8932082,-0.6920321,-1.171386,-0.3470414,-0.3490337,-0.2648587,-0.3243084,-0.3514837,-0.3024544,...,-0.409669,-0.5612172,-0.5937323,-0.5569184,-0.5975328,-0.7071359,-0.7099964,-0.704192,-0.8855386,-0.8610693
50%,0.0,0.008466391,0.0,-0.3027211,-0.3470414,-0.3490337,-0.2648587,-0.3243084,-0.3514837,-0.3024544,...,-0.409669,-0.5612172,-0.5937323,-0.5569184,-0.5975328,-0.7071359,-0.7099964,-0.704192,-0.03670023,0.02614984
75%,0.8341757,0.8598536,0.7368093,0.5659439,-0.3470414,-0.3490337,-0.2648587,-0.3243084,-0.3514837,-0.3024544,...,-0.409669,-0.5612172,1.684261,-0.5569184,1.673548,1.414155,1.408458,1.420067,0.9536112,0.8423914
max,8.406005,2.605482,2.48291,89.0916,2.881501,2.865053,3.775598,3.083485,2.845082,3.306284,...,2.440995,1.781841,1.684261,1.795595,1.673548,1.414155,1.408458,1.420067,1.660976,1.729611




------------------------------------------------
No object data available :
------------------------------------------------


------------------------------------------------
Missing Values :
------------------------------------------------
Series([], dtype: int64)


------------------------------------------------
Percentage of Missing values: (%) 
------------------------------------------------
Series([], dtype: float64)


------------------------------------------------
Total missing values percentage: 0.00%
------------------------------------------------

****************************************************
Details of X_valid: 

****************************************************
------------------------------------------------
df shape : (150000, 30)
------------------------------------------------

------------------------------------------------
df top 5 data :
------------------------------------------------


Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,Genre_Music,...,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_Negative,Episode_Sentiment_Neutral,Episode_Sentiment_Positive,Podcast_Name,Episode_Title
0,-1.093363,-1.135462,-0.247478,-0.302721,2.881501,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,2.440995,-0.561217,-0.593732,1.795595,-0.597533,-0.707136,1.408458,-0.704192,-0.744066,-0.967536
1,0.0,0.461053,-0.790735,1.434609,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,1.781841,-0.593732,-0.556918,-0.597533,-0.707136,-0.709996,1.420067,-0.0367,-0.967536
2,1.069556,-1.627404,-1.899184,1.434609,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,1.684261,-0.556918,-0.597533,-0.707136,-0.709996,1.420067,-1.097748,-0.364227
3,-0.271467,-1.476105,-1.504372,-1.171386,2.881501,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,-0.593732,-0.556918,1.673548,-0.707136,-0.709996,1.420067,-1.522167,1.481189
4,0.368574,1.24335,0.0,-1.171386,-0.347041,-0.349034,-0.264859,3.083485,-0.351484,-0.302454,...,-0.409669,-0.561217,-0.593732,1.795595,-0.597533,-0.707136,1.408458,-0.704192,-0.24891,0.984346




------------------------------------------------
df info :
------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 30 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Episode_Length_minutes       150000 non-null  float64
 1   Host_Popularity_percentage   150000 non-null  float64
 2   Guest_Popularity_percentage  150000 non-null  float64
 3   Number_of_Ads                150000 non-null  float64
 4   Genre_Business               150000 non-null  float64
 5   Genre_Comedy                 150000 non-null  float64
 6   Genre_Education              150000 non-null  float64
 7   Genre_Health                 150000 non-null  float64
 8   Genre_Lifestyle              150000 non-null  float64
 9   Genre_Music                  150000 non-null  float64
 10  Genre_News                   150000 non-null  float64
 11  Genre_S

None



------------------------------------------------
df describe numeric data :
------------------------------------------------


Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,Genre_Music,...,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_Negative,Episode_Sentiment_Neutral,Episode_Sentiment_Positive,Podcast_Name,Episode_Title
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,...,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0
mean,-0.004907,-0.003604,0.001271,0.001585,-0.002104,0.000145,-0.001717,0.000931,-0.000139,-0.002785,...,0.001967,-0.002851,0.004545,0.005693,-0.007264,0.001446,-0.000992,-0.000453,-0.001096,0.004055
std,0.997186,1.000986,1.000085,0.999729,0.997331,1.000185,0.996983,1.001287,0.999831,0.995808,...,1.001997,0.998258,1.002468,1.003507,0.996061,1.000513,0.999656,0.999841,1.000215,1.001545
min,-2.041011,-2.535638,-2.04528,-1.171386,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,-0.593732,-0.556918,-0.597533,-0.707136,-0.709996,-0.704192,-1.66364,-1.783777
25%,-0.810262,-0.901079,-0.692424,-1.171386,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,-0.593732,-0.556918,-0.597533,-0.707136,-0.709996,-0.704192,-0.885539,-0.861069
50%,0.0,0.002782,0.0,-0.302721,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,-0.593732,-0.556918,-0.597533,-0.707136,-0.709996,-0.704192,-0.0367,0.061639
75%,0.821601,0.855481,0.737984,0.565944,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,1.684261,-0.556918,1.673548,1.414155,1.408458,1.420067,0.953611,0.87788
max,1.808908,2.409142,2.65094,88.95261,2.881501,2.865053,3.775598,3.083485,2.845082,3.306284,...,2.440995,1.781841,1.684261,1.795595,1.673548,1.414155,1.408458,1.420067,1.660976,1.729611




------------------------------------------------
No object data available :
------------------------------------------------


------------------------------------------------
Missing Values :
------------------------------------------------
Series([], dtype: int64)


------------------------------------------------
Percentage of Missing values: (%) 
------------------------------------------------
Series([], dtype: float64)


------------------------------------------------
Total missing values percentage: 0.00%
------------------------------------------------

****************************************************
Details of test_df: 

****************************************************
------------------------------------------------
df shape : (250000, 30)
------------------------------------------------

------------------------------------------------
df top 5 data :
------------------------------------------------


Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,Genre_Music,...,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_Negative,Episode_Sentiment_Neutral,Episode_Sentiment_Positive,Podcast_Name,Episode_Title
0,0.464983,-0.951804,0.043148,-0.302721,-0.347041,-0.349034,3.775598,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,1.684261,-0.556918,-0.597533,-0.707136,1.408458,-0.704192,-0.885539,0.735925
1,-1.182356,0.499096,0.0,-1.171386,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,3.306284,...,-0.409669,-0.561217,-0.593732,1.795595,-0.597533,-0.707136,1.408458,-0.704192,0.882875,-1.215957
2,0.147059,0.350421,1.773581,-1.171386,-0.347041,2.865053,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,1.684261,-0.556918,-0.597533,-0.707136,-0.709996,1.420067,0.034036,-1.677311
3,1.639628,-1.595045,-0.018738,0.565944,-0.347041,2.865053,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,-0.593732,1.795595,-0.597533,-0.707136,-0.709996,1.420067,-1.380694,0.735925
4,0.250884,-0.077678,-1.603075,0.565944,-0.347041,-0.349034,-0.264859,-0.324308,2.845082,-0.302454,...,2.440995,-0.561217,-0.593732,1.795595,-0.597533,-0.707136,1.408458,-0.704192,0.246246,-0.151294




------------------------------------------------
df info :
------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 30 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Episode_Length_minutes       250000 non-null  float64
 1   Host_Popularity_percentage   250000 non-null  float64
 2   Guest_Popularity_percentage  250000 non-null  float64
 3   Number_of_Ads                250000 non-null  float64
 4   Genre_Business               250000 non-null  float64
 5   Genre_Comedy                 250000 non-null  float64
 6   Genre_Education              250000 non-null  float64
 7   Genre_Health                 250000 non-null  float64
 8   Genre_Lifestyle              250000 non-null  float64
 9   Genre_Music                  250000 non-null  float64
 10  Genre_News                   250000 non-null  float64
 11  Genre_S

None



------------------------------------------------
df describe numeric data :
------------------------------------------------


Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,Genre_Music,...,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_Negative,Episode_Sentiment_Neutral,Episode_Sentiment_Positive,Podcast_Name,Episode_Title
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,...,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,10.124,-0.006992,-0.001122,0.006396,-0.002362,0.001893,0.000583,0.001831,0.002427,0.001459,...,-0.00232,-0.001086,0.000797,0.003432,-0.003055,0.00146,-0.000984,-0.000476,-0.002084,-0.001903
std,5061.399,1.000501,0.999413,3.71302,0.997001,1.002379,1.001025,1.002523,1.003021,1.00219,...,0.99764,0.999339,1.000436,1.002119,0.998353,1.000517,0.999658,0.999832,0.998148,1.000499
min,-2.001351,-2.509401,-2.045671,-1.171386,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,-0.593732,-0.556918,-0.597533,-0.707136,-0.709996,-0.704192,-1.66364,-1.783777
25%,-0.8115517,-0.901954,-0.692424,-1.171386,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,-0.593732,-0.556918,-0.597533,-0.707136,-0.709996,-0.704192,-0.885539,-0.861069
50%,0.0,0.001033,0.0,-0.302721,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,-0.593732,-0.556918,-0.597533,-0.707136,-0.709996,-0.704192,-0.0367,0.02615
75%,0.8374001,0.853294,0.734851,0.565944,-0.347041,-0.349034,-0.264859,-0.324308,-0.351484,-0.302454,...,-0.409669,-0.561217,1.684261,-0.556918,1.673548,1.414155,1.408458,1.420067,0.882875,0.842391
max,2530699.0,2.531144,2.529911,1790.884559,2.881501,2.865053,3.775598,3.083485,2.845082,3.306284,...,2.440995,1.781841,1.684261,1.795595,1.673548,1.414155,1.408458,1.420067,1.660976,1.729611




------------------------------------------------
No object data available :
------------------------------------------------


------------------------------------------------
Missing Values :
------------------------------------------------
Series([], dtype: int64)


------------------------------------------------
Percentage of Missing values: (%) 
------------------------------------------------
Series([], dtype: float64)


------------------------------------------------
Total missing values percentage: 0.00%
------------------------------------------------


### XGBoost

In [11]:
# n_estimators is only an upper bound here: with early_stopping_rounds set,
# training stops once the eval_set metric has not improved for 10 consecutive
# rounds. random_state is pinned to the same seed as the train/validation split
# so a full re-run of the notebook is reproducible.
model = XGBRegressor(
    n_estimators=50000,
    learning_rate=0.01,
    early_stopping_rounds=10,
    random_state=0,
)

# NOTE(review): X_valid drives early stopping here, so any score later computed
# on the same split will tend to be optimistic.
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)

In [12]:
# Score the fitted model on the validation split.
y_pred = model.predict(X_valid)

mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE rather than computed separately
mae = mean_absolute_error(y_valid, y_pred)

for label, value in (("RMSE", rmse), ("MSE", mse), ("MAE", mae)):
    print(f"{label}: {value:.4f}")

RMSE: 12.9792
MSE: 168.4593
MAE: 9.4205


In [13]:
# Predict on the preprocessed test frame and write the submission file.
# The ids are taken from the sample submission and paired with the predictions
# by position.
final_prediction = model.predict(test_df).ravel()
submission = sample_submission_df[['id']].assign(Listening_Time_minutes=final_prediction)
submission.to_csv('submission.csv', index=False)