In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# Read both CSV files from the working directory, then preview five random
# rows of the training frame as the cell output.
data_train, data_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
data_train.sample(5)

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
409396,409396,Health Hour,Episode 46,94.62,Health,72.03,Thursday,Night,45.84,1.0,Negative,73.37494
718327,718327,Humor Hub,Episode 40,42.52,Comedy,80.51,Monday,Night,38.55,0.0,Neutral,41.43918
173975,173975,Digital Digest,Episode 38,35.74,Technology,51.04,Tuesday,Evening,19.08,0.0,Neutral,19.71374
483371,483371,Sound Waves,Episode 99,74.41,Music,43.17,Sunday,Evening,97.53,3.0,Neutral,39.28556
694392,694392,Fashion Forward,Episode 82,,Lifestyle,80.85,Saturday,Afternoon,11.74,3.0,Neutral,22.3204


In [2]:
# clean data
def clean(dataframe, max_ads=10, max_episode_length=250):
    """Return a cleaned copy of a raw podcast dataframe.

    The steps are applied in this order:

    1. drop the ``id`` column;
    2. drop every row that contains at least one missing value;
    3. drop rows whose ``Number_of_Ads`` is greater than ``max_ads`` or whose
       ``Episode_Length_minutes`` is greater than ``max_episode_length``;
    4. replace ``Episode_Title`` (e.g. ``'Episode 46'``) by the integer that
       follows the first whitespace-separated token;
    5. encode ``Episode_Sentiment`` as -1 / 0 / +1;
    6. cast ``Number_of_Ads`` to ``int``.

    The input frame is never modified, so the function can safely be called
    again on the same raw frame (e.g. when a notebook cell is re-run).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw frame containing at least the columns ``id``, ``Number_of_Ads``,
        ``Episode_Length_minutes``, ``Episode_Title`` and ``Episode_Sentiment``.
    max_ads : int or float, default 10
        Rows with more ads than this are treated as outliers and removed.
    max_episode_length : int or float, default 250
        Rows with a longer episode (in minutes) are treated as outliers and removed.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; surviving rows keep their original index labels.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # drop()/dropna() without inplace=True return new objects, leaving the
    # caller's frame intact.
    cleaned = dataframe.drop(columns='id').dropna(axis=0, how='any')

    # Filter with a boolean mask rather than by index label, so the result is
    # also correct when the index contains repeated labels.
    too_many_ads = cleaned['Number_of_Ads'] > max_ads
    too_long = cleaned['Episode_Length_minutes'] > max_episode_length
    # .copy() gives an independent frame, avoiding SettingWithCopyWarning below.
    cleaned = cleaned.loc[~(too_many_ads | too_long)].copy()

    cleaned['Episode_Title'] = cleaned['Episode_Title'].map(lambda title: int(title.split()[1]))
    # Sentiment labels outside this mapping become NaN.
    cleaned['Episode_Sentiment'] = cleaned['Episode_Sentiment'].map({'Negative': -1, 'Neutral': 0, 'Positive': +1})
    cleaned['Number_of_Ads'] = cleaned['Number_of_Ads'].astype(int)

    return cleaned

# Run the same cleaning routine on both splits, then preview five random
# rows of the cleaned training frame.
# NOTE(review): clean() also removes the `id` column and every incomplete row
# from data_test — confirm that is intended if each test row needs a prediction.
data_train, data_test = clean(data_train), clean(data_test)
data_train.sample(5)

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
390661,Gadget Geek,79,46.12,Technology,53.17,Thursday,Morning,45.49,0,-1,23.85158
680568,Daily Digest,29,103.49,News,54.47,Friday,Night,30.85,2,0,61.30796
87699,Style Guide,100,111.89,Lifestyle,54.84,Saturday,Afternoon,74.81,1,1,71.58727
165825,Melody Mix,76,71.22,Music,86.04,Thursday,Morning,81.24,0,0,48.02613
362842,Style Guide,2,115.67,Lifestyle,43.57,Monday,Night,19.67,3,0,58.19244


In [3]:
from collections import Counter

def check_duplicate_counts(mapping_dict):
    """Tell whether at least two keys of ``mapping_dict`` share the same value.

    Returns ``True`` when some value occurs more than once, ``False`` when all
    values are distinct (an empty mapping therefore yields ``False``).
    """
    distinct_values = set(mapping_dict.values())
    # One value per key: fewer distinct values than keys means a collision.
    return len(mapping_dict) != len(distinct_values)

def count_encode(df_train, df_test, target_col, col_list):
    """Count-encode columns using frequencies pooled over train and test.

    Each value in a listed column is replaced by the number of times it occurs
    in ``df_train`` and ``df_test`` combined. Values that happen to have the
    same pooled frequency receive the same code.

    The input frames are not modified; encoded copies are returned.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that both contain every column named in ``col_list``.
    target_col : str
        Not used by the encoding; kept so existing callers keep working.
    col_list : iterable of str
        Names of the columns to encode.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The encoded train and test frames, in that order.

    Raises
    ------
    KeyError
        If a column of ``col_list`` is missing from either frame.
    ValueError
        If an encoded column contains missing values: they are absent from the
        counts, map to NaN, and cannot be cast to ``int``.
    """
    train_encoded = df_train.copy()
    test_encoded = df_test.copy()

    for col in col_list:
        # value_counts() over the concatenation yields train count + test
        # count for every value seen in either frame.
        pooled_counts = pd.concat(
            [train_encoded[col], test_encoded[col]], ignore_index=True
        ).value_counts()

        train_encoded[col] = train_encoded[col].map(pooled_counts).astype(int)
        test_encoded[col] = test_encoded[col].map(pooled_counts).astype(int)

    return (train_encoded, test_encoded)

# Replace the podcast name and genre by their occurrence counts, then preview
# five random rows of the encoded training frame.
data_train, data_test = count_encode(
    df_train=data_train,
    df_test=data_test,
    target_col='Listening_Time_minutes',
    col_list=['Podcast_Name', 'Genre'],
)
data_train.sample(5)

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
127953,16711,9,61.11,80710,78.69,Friday,Evening,57.44,1,0,58.80115
307899,13479,53,112.08,81049,33.08,Saturday,Night,33.85,0,1,108.78
541732,12897,16,64.63,75722,67.41,Thursday,Night,94.57,3,1,44.1749
324573,15337,18,36.53,79430,24.29,Thursday,Evening,92.57,3,-1,29.9822
655663,19197,82,106.55,69654,39.63,Thursday,Night,5.29,1,0,88.55101


In [4]:
# Number of rows currently in the training frame.
data_train.shape[0]

539040