In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')
data_train.sample(5)

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
219648,219648,True Crime Stories,Episode 74,38.44,True Crime,31.76,Saturday,Evening,74.53,3.0,Positive,37.41929
363498,363498,Daily Digest,Episode 89,65.82,News,26.68,Friday,Night,,2.0,Positive,37.07781
76477,76477,Detective Diaries,Episode 80,70.86,True Crime,76.35,Sunday,Morning,84.99,1.0,Neutral,55.8778
546041,546041,Style Guide,Episode 49,20.01,Lifestyle,33.67,Sunday,Evening,67.34,1.0,Positive,11.48762
497187,497187,Gadget Geek,Episode 35,40.14,Technology,97.28,Saturday,Afternoon,,0.0,Negative,32.62992


In [2]:
# clean data
def clean(dataframe):
    dataframe.drop('id', axis=1, inplace=True)
    dataframe.dropna(axis=0, how='any', inplace=True)
    dataframe.drop(dataframe[dataframe['Number_of_Ads']>10].index, inplace=True)
    dataframe.drop(dataframe[dataframe['Episode_Length_minutes']>250].index, inplace=True)
    dataframe['Episode_Title'] = dataframe['Episode_Title'].map(lambda et: int(et.split()[1]))
    dataframe['Episode_Sentiment'] = dataframe['Episode_Sentiment'].map({'Negative': -1, 'Neutral': 0, 'Positive': +1})
    dataframe['Number_of_Ads'] = dataframe['Number_of_Ads'].astype(int)

    return dataframe

data_train = clean(data_train)
data_test = clean(data_test)
data_train.sample(5)

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
406521,Global News,40,8.25,News,46.91,Wednesday,Night,17.41,1,-1,0.0
342905,Humor Hub,96,69.52,Comedy,73.36,Friday,Evening,54.13,0,0,28.88347
476357,Game Day,24,30.08,Sports,32.08,Sunday,Evening,54.32,3,1,22.55175
205137,Style Guide,61,99.59,Lifestyle,59.03,Wednesday,Evening,94.88,1,-1,86.7489
510258,Global News,36,39.85,News,53.77,Saturday,Night,90.67,0,1,20.7821


In [3]:
categorical_columns = data_train.select_dtypes(include=['category', 'object']).columns.tolist()

print(categorical_columns)

['Podcast_Name', 'Genre', 'Publication_Day', 'Publication_Time']


In [4]:
def feature_eng(df):
    for col in ['Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']:
        encode_dict = {key: val for val, key in enumerate(df[col].unique())}
        df[col] = df[col].replace(encode_dict)
        df[col] = df[col].astype('category')
    
    return df

data_train = feature_eng(data_train)
data_test = feature_eng(data_test)

In [5]:
from sklearn.model_selection import train_test_split
data_train, rest = train_test_split(data_train, test_size=0.999)
data_test, rest = train_test_split(data_test, test_size=0.999)
print(data_train)

       Podcast_Name  Episode_Title  Episode_Length_minutes Genre  \
192104           14             84                   19.69     2   
604876           23             73                   23.14     6   
12284            21              4                   29.49     6   
220731           41             30                  107.06     4   
57552            14             30                    7.02     2   
...             ...            ...                     ...   ...   
339242           29             73                   50.67     9   
725848           40             60                  100.20     4   
644600           27             67                   53.01     8   
437930           30             37                   71.63     1   
184847            7             13                   68.07     7   

        Host_Popularity_percentage Publication_Day Publication_Time  \
192104                       38.27               4                0   
604876                       89.50       

In [6]:
from itertools import combinations

encode_columns = ['Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment']
#encode_columns = ['Episode_Length_minutes', 'Episode_Title', 'Host_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Publication_Day', 'Publication_Time']
pair_size = [2, 3, 4]

for r in pair_size:
    for cols in list(combinations(encode_columns, r)):
        new_col_name = '_'.join(cols)

        data_train[new_col_name] = data_train[list(cols)].astype(str).agg('_'.join, axis=1)
        data_train[new_col_name] = data_train[new_col_name].astype('category')

        data_test[new_col_name] = data_test[list(cols)].astype(str).agg('_'.join, axis=1)
        data_test[new_col_name] = data_test[new_col_name].astype('category')

print(data_train)

       Podcast_Name  Episode_Title  Episode_Length_minutes Genre  \
192104           14             84                   19.69     2   
604876           23             73                   23.14     6   
12284            21              4                   29.49     6   
220731           41             30                  107.06     4   
57552            14             30                    7.02     2   
...             ...            ...                     ...   ...   
339242           29             73                   50.67     9   
725848           40             60                  100.20     4   
644600           27             67                   53.01     8   
437930           30             37                   71.63     1   
184847            7             13                   68.07     7   

        Host_Popularity_percentage Publication_Day Publication_Time  \
192104                       38.27               4                0   
604876                       89.50       

In [7]:
len(data_train['Publication_Time_Guest_Popularity_percentage_Number_of_Ads_Episode_Sentiment'].unique())

539

In [8]:
def target_encode(df_train, df_test, target_col, col_list):
    """ Target encode all columns listed in col_list, both in training and test DataFrames.
        (Using the mean of the target column from the training DataFrame.)
        Doing it like this, the target values from the validation fold are used in the mean,
        not just from the training folds - but that probably is ok.
    Args:
        df_train (pd.DataFrame): Training DataFrame.
        df_test (pd.DataFrame): Test DataFrame.
        target_col (str): Name of the target column to use for the encoding.
        col_list (List[str]): List of column names to target encode.
    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: Training and test DataFrames with encoded columns.
    """
    for col in col_list:
        groupby_df = df_train[[target_col, col]].groupby([col]).mean()
        groupby_df.sort_values(by=target_col, inplace=True)
        mapping_dict = groupby_df[target_col].to_dict()
        df_train[col] = df_train[col].map(mapping_dict).astype(float)
        df_test[col] = df_test[col].map(mapping_dict).astype(float)

    return (df_train, df_test)

category_columns = data_train.select_dtypes(include=['category', 'object']).columns.tolist()
data_train, data_test = target_encode(data_train, data_test, 'Listening_Time_minutes', category_columns)

In [9]:
categorical_columns = data_train.select_dtypes(include=['category', 'object']).columns.tolist()

print(categorical_columns)

['Podcast_Name', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment', 'Podcast_Name_Genre', 'Podcast_Name_Publication_Day', 'Podcast_Name_Publication_Time', 'Podcast_Name_Number_of_Ads', 'Podcast_Name_Episode_Sentiment', 'Episode_Title_Publication_Time', 'Episode_Title_Episode_Sentiment', 'Genre_Publication_Day', 'Genre_Publication_Time', 'Genre_Number_of_Ads', 'Genre_Episode_Sentiment', 'Publication_Day_Publication_Time', 'Publication_Day_Number_of_Ads', 'Publication_Day_Episode_Sentiment', 'Publication_Time_Number_of_Ads', 'Publication_Time_Episode_Sentiment', 'Number_of_Ads_Episode_Sentiment', 'Podcast_Name_Genre_Publication_Day', 'Podcast_Name_Genre_Publication_Time', 'Podcast_Name_Genre_Number_of_Ads', 'Podcast_Name_Genre_Episode_Sentiment', 'Podcast_Name_Publication_Time_Number_of_Ads', 'Podcast_Name_Publication_Time_Episode_Sentiment', 'Podcast_Name_Number_of_Ads_Episode_Sentiment', 'Genre_Publication_Day_Publication_Time', 'Genre_Publication_Day_Number_of_Ads', 

In [10]:
data_train.sample(5)

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,...,Host_Popularity_percentage_Publication_Day_Number_of_Ads_Episode_Sentiment,Host_Popularity_percentage_Publication_Time_Guest_Popularity_percentage_Number_of_Ads,Host_Popularity_percentage_Publication_Time_Guest_Popularity_percentage_Episode_Sentiment,Host_Popularity_percentage_Publication_Time_Number_of_Ads_Episode_Sentiment,Host_Popularity_percentage_Guest_Popularity_percentage_Number_of_Ads_Episode_Sentiment,Publication_Day_Publication_Time_Guest_Popularity_percentage_Number_of_Ads,Publication_Day_Publication_Time_Guest_Popularity_percentage_Episode_Sentiment,Publication_Day_Publication_Time_Number_of_Ads_Episode_Sentiment,Publication_Day_Guest_Popularity_percentage_Number_of_Ads_Episode_Sentiment,Publication_Time_Guest_Popularity_percentage_Number_of_Ads_Episode_Sentiment
709483,32.426498,74,34.3,39.550334,37.36,47.598365,49.415535,87.68,3,43.378716,...,20.22642,20.22642,20.22642,20.22642,20.22642,20.22642,20.22642,56.038575,20.22642,20.22642
239961,45.330839,60,111.95,41.497246,67.18,46.984605,49.415535,24.15,2,49.094071,...,67.60855,67.60855,67.60855,67.60855,67.60855,67.60855,67.60855,38.253397,67.60855,67.60855
189242,56.37875,48,97.1,50.391845,96.56,46.36782,45.72401,35.57,0,50.521189,...,96.57777,96.57777,96.57777,96.57777,96.57777,96.57777,96.57777,60.26582,96.57777,96.57777
469449,37.964234,5,105.34,41.497246,30.94,49.779779,49.415535,15.96,3,49.094071,...,83.40108,83.40108,83.40108,83.40108,83.40108,83.40108,83.40108,60.622502,83.40108,83.40108
535271,61.473396,71,65.98,53.325072,32.0,48.883059,49.415535,28.36,0,50.521189,...,50.58493,50.58493,50.58493,50.58493,50.58493,50.58493,50.58493,75.042255,50.58493,50.58493
