# Imports

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
from sklearn.impute import SimpleImputer

## Lendo o CSV

In [3]:
df = pd.read_csv('train.csv')

## Explorando os dados no CSV

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB


In [5]:
df.describe()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes
count,750000.0,662907.0,750000.0,603970.0,749999.0,750000.0
mean,374999.5,64.504738,59.859901,52.236449,1.348855,45.437406
std,216506.495284,32.969603,22.873098,28.451241,1.15113,27.138306
min,0.0,0.0,1.3,0.0,0.0,0.0
25%,187499.75,35.73,39.41,28.38,0.0,23.17835
50%,374999.5,63.84,60.05,53.58,1.0,43.37946
75%,562499.25,94.07,79.53,76.6,2.0,64.81158
max,749999.0,325.24,119.46,119.91,103.91,119.97


In [6]:
df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [7]:
df['Genre'].value_counts()

Genre
Sports        87606
Technology    86256
True Crime    85059
Lifestyle     82461
Comedy        81453
Business      80521
Health        71416
News          63385
Music         62743
Education     49100
Name: count, dtype: int64

In [8]:
df['Podcast_Name'].value_counts()

Podcast_Name
Tech Talks             22847
Sports Weekly          20053
Funny Folks            19635
Tech Trends            19549
Fitness First          19488
Business Insights      19480
Style Guide            19364
Game Day               19272
Melody Mix             18889
Criminal Minds         17735
Finance Focus          17628
Detective Diaries      17452
Crime Chronicles       17374
Athlete's Arena        17327
Fashion Forward        17280
Tune Time              17254
Business Briefs        17012
Lifestyle Lounge       16661
True Crime Stories     16373
Sports Central         16191
Digital Digest         16171
Humor Hub              16144
Mystery Matters        16002
Comedy Corner          15927
Joke Junction          15074
Wellness Wave          15009
Sport Spot             14778
Gadget Geek            14770
Home & Living          14686
Laugh Line             14673
Life Lessons           14464
World Watch            14043
Sound Waves            13928
Global News            13649
M

# Tratando os dados

In [9]:
# Pull the first run of digits out of each episode title and keep it as an
# integer feature; the free-text title itself is no longer needed afterwards.
episode_digits = df['Episode_Title'].str.extract(r'(\d+)')
df['Episode_Number'] = episode_digits.astype(int)
df = df.drop(columns=['Episode_Title'])

In [10]:
df.head()

Unnamed: 0,id,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Number
0,0,Mystery Matters,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998,98
1,1,Joke Junction,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241,26
2,2,Study Sessions,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531,16
3,3,Digital Digest,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824,45
4,4,Mind & Body,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031,86


## Aplicando One-Hot Encoding nas colunas necessárias

In [11]:
# Categorical columns that are replaced by one indicator column per category.
columns_to_one_hot = [
    'Genre',
    'Publication_Day',
    'Publication_Time',
    'Episode_Sentiment',
]
df = pd.get_dummies(data=df, columns=columns_to_one_hot)

In [12]:
df.head()

Unnamed: 0,id,Podcast_Name,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes,Episode_Number,Genre_Business,Genre_Comedy,...,Publication_Day_Thursday,Publication_Day_Tuesday,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_Negative,Episode_Sentiment_Neutral,Episode_Sentiment_Positive
0,0,Mystery Matters,,74.81,,0.0,31.41998,98,False,False,...,True,False,False,False,False,False,True,False,False,True
1,1,Joke Junction,119.8,66.95,75.95,2.0,88.01241,26,False,True,...,False,False,False,True,False,False,False,True,False,False
2,2,Study Sessions,73.9,69.97,8.97,0.0,44.92531,16,False,False,...,False,True,False,False,True,False,False,True,False,False
3,3,Digital Digest,67.17,57.22,78.7,2.0,46.27824,45,False,False,...,False,False,False,False,False,True,False,False,False,True
4,4,Mind & Body,110.51,80.07,58.68,3.0,75.61031,86,False,False,...,False,False,False,True,False,False,False,False,True,False


## Embedding do nome do Podcast

In [13]:
import numpy as np

# Path to the GloVe vectors file
GLOVE_PATH = "glove.6B/glove.6B.100d.txt"  # for example

# Load GloVe into memory
def load_glove_embeddings(glove_file_path):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-empty line is expected to hold a token followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    glove_file_path : str
        Path to the UTF-8 encoded GloVe text file.

    Returns
    -------
    dict
        Maps each token to a 1-D ``float32`` NumPy array.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_file_path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.split()
            # A blank or whitespace-only line (e.g. a trailing newline at the
            # end of the file) has no token; skip it instead of failing on
            # parts[0].
            if not parts:
                continue
            word = parts[0]
            embeddings[word] = np.array(parts[1:], dtype='float32')
    return embeddings

# Vectorize a title by averaging the embeddings of its words
def embed_title(title, embeddings, dim=100):
    """Return the mean embedding of the known words in ``title``.

    The title is lower-cased and split on whitespace; words missing from
    ``embeddings`` are ignored.

    Parameters
    ----------
    title : str
        Text to embed. A non-string value (e.g. ``None`` or a NaN from a
        missing cell) is treated as a title with no known words.
    embeddings : dict
        Maps a lower-case word to its vector.
    dim : int, default 100
        Length of the zero vector returned when no word is known.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the matched word vectors, or a ``float32`` zero
        vector of length ``dim`` when nothing matched.
    """
    # Missing values reach this function as None/NaN and have no .lower().
    if not isinstance(title, str):
        return np.zeros(dim, dtype='float32')
    vectors = [embeddings[word] for word in title.lower().split() if word in embeddings]
    if not vectors:
        # float32 keeps the fallback consistent with the float32 vectors
        # stored by load_glove_embeddings.
        return np.zeros(dim, dtype='float32')
    return np.mean(vectors, axis=0)

# Example usage: load the embeddings once; later cells reuse `glove`
glove = load_glove_embeddings(GLOVE_PATH)
# vetor = embed_title("Smartphone com câmera ultrawide", glove)
# print(vetor.shape)  # (100,)


In [14]:
# Podcast names repeat heavily across rows, so embed each distinct name once
# and look the vector up per row instead of re-averaging it for every row.
podcast_vectors = {
    name: embed_title(name, glove, dim=100)
    for name in df['Podcast_Name'].unique()
}
df['Podcast_Embedding'] = df['Podcast_Name'].map(podcast_vectors.get)

In [15]:
df.head()

Unnamed: 0,id,Podcast_Name,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes,Episode_Number,Genre_Business,Genre_Comedy,...,Publication_Day_Tuesday,Publication_Day_Wednesday,Publication_Time_Afternoon,Publication_Time_Evening,Publication_Time_Morning,Publication_Time_Night,Episode_Sentiment_Negative,Episode_Sentiment_Neutral,Episode_Sentiment_Positive,Podcast_Embedding
0,0,Mystery Matters,,74.81,,0.0,31.41998,98,False,False,...,False,False,False,False,False,True,False,False,True,"[-0.061355002, -0.052438498, 0.841655, 0.23137..."
1,1,Joke Junction,119.8,66.95,75.95,2.0,88.01241,26,False,True,...,False,False,True,False,False,False,True,False,False,"[-0.142665, -0.23665701, 0.3445005, -0.3208649..."
2,2,Study Sessions,73.9,69.97,8.97,0.0,44.92531,16,False,False,...,True,False,False,True,False,False,True,False,False,"[0.072463, 0.395992, -0.396025, 0.2693015, 0.0..."
3,3,Digital Digest,67.17,57.22,78.7,2.0,46.27824,45,False,False,...,False,False,False,False,True,False,False,False,True,"[-0.417915, -0.168632, 0.23964, -0.13462, 0.11..."
4,4,Mind & Body,110.51,80.07,58.68,3.0,75.61031,86,False,False,...,False,False,True,False,False,False,False,True,False,"[-0.06228867, -0.056680005, 0.062063333, -0.36..."


### Expandindo o embedding em colunas

In [17]:
# Turn the column of vectors into one numeric column per embedding dimension.
# The new frame reuses df's index so that the column-wise concat below lines
# rows up by label even if df no longer has a default 0..n-1 index.
embedding_cols = pd.DataFrame(df['Podcast_Embedding'].tolist(),
                              index=df.index,
                              columns=[f'Podcast_Embed_{i}' for i in range(100)])

# Join with the original DataFrame
df = pd.concat([df, embedding_cols], axis=1)

# Optional: remove the column holding the raw vector
df = df.drop(columns=['Podcast_Embedding'])


In [18]:
df.head()

Unnamed: 0,id,Podcast_Name,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes,Episode_Number,Genre_Business,Genre_Comedy,...,Podcast_Embed_90,Podcast_Embed_91,Podcast_Embed_92,Podcast_Embed_93,Podcast_Embed_94,Podcast_Embed_95,Podcast_Embed_96,Podcast_Embed_97,Podcast_Embed_98,Podcast_Embed_99
0,0,Mystery Matters,,74.81,,0.0,31.41998,98,False,False,...,-0.191959,0.259265,-0.212915,-0.08297,-0.056235,0.480065,0.12427,-0.42141,0.28618,0.225935
1,1,Joke Junction,119.8,66.95,75.95,2.0,88.01241,26,False,True,...,-0.35121,0.363415,-0.271596,0.415645,-0.2019,-0.19343,0.506583,-0.123942,0.56817,0.74714
2,2,Study Sessions,73.9,69.97,8.97,0.0,44.92531,16,False,False,...,-0.11657,-0.232258,-0.052785,0.449284,0.106903,0.496355,-0.25975,-0.29327,0.43791,-0.020877
3,3,Digital Digest,67.17,57.22,78.7,2.0,46.27824,45,False,False,...,0.649245,0.759525,0.154494,-0.60361,-0.345495,-0.295645,0.586545,-0.910115,1.01095,0.730029
4,4,Mind & Body,110.51,80.07,58.68,3.0,75.61031,86,False,False,...,0.046611,-0.249599,0.221633,0.57514,0.040458,0.28805,-0.376367,-0.713273,0.73138,0.116122


In [19]:
# The name is now represented by its embedding columns; drop the raw text.
df = df.drop(columns=['Podcast_Name'])

In [20]:
df.head()

Unnamed: 0,id,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes,Episode_Number,Genre_Business,Genre_Comedy,Genre_Education,...,Podcast_Embed_90,Podcast_Embed_91,Podcast_Embed_92,Podcast_Embed_93,Podcast_Embed_94,Podcast_Embed_95,Podcast_Embed_96,Podcast_Embed_97,Podcast_Embed_98,Podcast_Embed_99
0,0,,74.81,,0.0,31.41998,98,False,False,False,...,-0.191959,0.259265,-0.212915,-0.08297,-0.056235,0.480065,0.12427,-0.42141,0.28618,0.225935
1,1,119.8,66.95,75.95,2.0,88.01241,26,False,True,False,...,-0.35121,0.363415,-0.271596,0.415645,-0.2019,-0.19343,0.506583,-0.123942,0.56817,0.74714
2,2,73.9,69.97,8.97,0.0,44.92531,16,False,False,True,...,-0.11657,-0.232258,-0.052785,0.449284,0.106903,0.496355,-0.25975,-0.29327,0.43791,-0.020877
3,3,67.17,57.22,78.7,2.0,46.27824,45,False,False,False,...,0.649245,0.759525,0.154494,-0.60361,-0.345495,-0.295645,0.586545,-0.910115,1.01095,0.730029
4,4,110.51,80.07,58.68,3.0,75.61031,86,False,False,False,...,0.046611,-0.249599,0.221633,0.57514,0.040458,0.28805,-0.376367,-0.713273,0.73138,0.116122


## Tratando os dados NaN

### Removendo a coluna de IDs

In [21]:
# Set the identifier aside so it is not treated as a feature.
ids_col = df['id']
df = df.drop(columns=['id'])

In [22]:
ids_col.head()

0    0
1    1
2    2
3    3
4    4
Name: id, dtype: int64

In [23]:
df.head()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Listening_Time_minutes,Episode_Number,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,...,Podcast_Embed_90,Podcast_Embed_91,Podcast_Embed_92,Podcast_Embed_93,Podcast_Embed_94,Podcast_Embed_95,Podcast_Embed_96,Podcast_Embed_97,Podcast_Embed_98,Podcast_Embed_99
0,,74.81,,0.0,31.41998,98,False,False,False,False,...,-0.191959,0.259265,-0.212915,-0.08297,-0.056235,0.480065,0.12427,-0.42141,0.28618,0.225935
1,119.8,66.95,75.95,2.0,88.01241,26,False,True,False,False,...,-0.35121,0.363415,-0.271596,0.415645,-0.2019,-0.19343,0.506583,-0.123942,0.56817,0.74714
2,73.9,69.97,8.97,0.0,44.92531,16,False,False,True,False,...,-0.11657,-0.232258,-0.052785,0.449284,0.106903,0.496355,-0.25975,-0.29327,0.43791,-0.020877
3,67.17,57.22,78.7,2.0,46.27824,45,False,False,False,False,...,0.649245,0.759525,0.154494,-0.60361,-0.345495,-0.295645,0.586545,-0.910115,1.01095,0.730029
4,110.51,80.07,58.68,3.0,75.61031,86,False,False,False,True,...,0.046611,-0.249599,0.221633,0.57514,0.040458,0.28805,-0.376367,-0.713273,0.73138,0.116122


### Removendo a coluna objetivo

In [24]:
# Separate the target from the feature frame.
y_col = df['Listening_Time_minutes']
df = df.drop(columns=['Listening_Time_minutes'])

In [25]:
y_col.head()

0    31.41998
1    88.01241
2    44.92531
3    46.27824
4    75.61031
Name: Listening_Time_minutes, dtype: float64

In [26]:
df.head()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Number,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,...,Podcast_Embed_90,Podcast_Embed_91,Podcast_Embed_92,Podcast_Embed_93,Podcast_Embed_94,Podcast_Embed_95,Podcast_Embed_96,Podcast_Embed_97,Podcast_Embed_98,Podcast_Embed_99
0,,74.81,,0.0,98,False,False,False,False,False,...,-0.191959,0.259265,-0.212915,-0.08297,-0.056235,0.480065,0.12427,-0.42141,0.28618,0.225935
1,119.8,66.95,75.95,2.0,26,False,True,False,False,False,...,-0.35121,0.363415,-0.271596,0.415645,-0.2019,-0.19343,0.506583,-0.123942,0.56817,0.74714
2,73.9,69.97,8.97,0.0,16,False,False,True,False,False,...,-0.11657,-0.232258,-0.052785,0.449284,0.106903,0.496355,-0.25975,-0.29327,0.43791,-0.020877
3,67.17,57.22,78.7,2.0,45,False,False,False,False,False,...,0.649245,0.759525,0.154494,-0.60361,-0.345495,-0.295645,0.586545,-0.910115,1.01095,0.730029
4,110.51,80.07,58.68,3.0,86,False,False,False,True,False,...,0.046611,-0.249599,0.221633,0.57514,0.040458,0.28805,-0.376367,-0.713273,0.73138,0.116122


### Normalizando os dados

In [27]:
# Fit the feature scaler on df as it stands here: 'id' and the target
# 'Listening_Time_minutes' were removed from df in earlier cells, so only
# feature columns are seen. The frame still contains NaNs at this point.
scaler = StandardScaler()
scaler.fit(df)

In [28]:
dump(scaler, 'scaler.joblib')

['scaler.joblib']

In [29]:
df.columns[1:]

Index(['Host_Popularity_percentage', 'Guest_Popularity_percentage',
       'Number_of_Ads', 'Episode_Number', 'Genre_Business', 'Genre_Comedy',
       'Genre_Education', 'Genre_Health', 'Genre_Lifestyle', 'Genre_Music',
       ...
       'Podcast_Embed_90', 'Podcast_Embed_91', 'Podcast_Embed_92',
       'Podcast_Embed_93', 'Podcast_Embed_94', 'Podcast_Embed_95',
       'Podcast_Embed_96', 'Podcast_Embed_97', 'Podcast_Embed_98',
       'Podcast_Embed_99'],
      dtype='object', length=128)

In [30]:
# Apply the fitted scaler and wrap the resulting array back into a frame
# carrying the original column names.
scaled_values = scaler.transform(df)
df = pd.DataFrame(scaled_values, columns=df.columns)

In [31]:
df.head()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Number,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,...,Podcast_Embed_90,Podcast_Embed_91,Podcast_Embed_92,Podcast_Embed_93,Podcast_Embed_94,Podcast_Embed_95,Podcast_Embed_96,Podcast_Embed_97,Podcast_Embed_98,Podcast_Embed_99
0,,0.653611,,-1.171766,1.657582,-0.346806,-0.34905,-0.264675,-0.324411,-0.351468,...,-1.023713,0.418261,-0.776574,-0.088732,0.567832,1.608396,0.484065,-0.149683,-0.924076,-0.285117
1,1.67716,0.309976,0.833481,0.565657,-0.906009,-0.346806,2.86492,-0.264675,-0.324411,-0.351468,...,-1.514966,0.749341,-0.9351,1.541608,0.168647,-0.951429,1.436345,0.634674,-0.250853,1.070425
2,0.284968,0.442009,-1.520724,-1.171766,-1.262063,-0.346806,-0.34905,3.77822,-0.324411,-0.351468,...,-0.791153,-1.144233,-0.343991,1.651599,1.014901,1.670312,-0.472468,0.188194,-0.561836,-0.927021
3,0.08084,-0.115415,0.930138,0.565657,-0.229506,-0.346806,-0.34905,-0.264675,-0.324411,-0.351468,...,1.571219,2.008528,0.215965,-1.79109,-0.224865,-1.339929,1.635518,-1.438292,0.806239,1.025922
4,1.395385,0.883576,0.226477,1.434369,1.230317,-0.346806,-0.34905,-0.264675,3.082507,-0.351468,...,-0.287775,-1.199357,0.397337,2.063115,0.832813,0.878584,-0.762942,-0.919263,0.138794,-0.570716


### Normalizando a coluna alvo

In [32]:
# The target gets its own scaler; it is reshaped into a single-column 2-D
# array before fitting.
target_as_column = y_col.values.reshape(-1, 1)
scaler_y = StandardScaler()
scaler_y.fit(target_as_column)

In [33]:
dump(scaler_y, 'scaler_y.joblib')

['scaler_y.joblib']

In [36]:
# Scale the target with its dedicated scaler and keep it as a one-column frame.
scaled_target = scaler_y.transform(y_col.values.reshape(-1, 1))
df_y = pd.DataFrame(scaled_target, columns=['Listening_Time_minutes'])

In [37]:
df_y.head()

Unnamed: 0,Listening_Time_minutes
0,-0.516518
1,1.568817
2,-0.01887
3,0.030983
4,1.11182


### Juntando novamente com a coluna alvo

In [38]:
# Append the scaled target as the last column of the scaled feature frame.
df = pd.concat(
    [df, df_y],
    axis='columns',
)
df.head()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Number,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,...,Podcast_Embed_91,Podcast_Embed_92,Podcast_Embed_93,Podcast_Embed_94,Podcast_Embed_95,Podcast_Embed_96,Podcast_Embed_97,Podcast_Embed_98,Podcast_Embed_99,Listening_Time_minutes
0,,0.653611,,-1.171766,1.657582,-0.346806,-0.34905,-0.264675,-0.324411,-0.351468,...,0.418261,-0.776574,-0.088732,0.567832,1.608396,0.484065,-0.149683,-0.924076,-0.285117,-0.516518
1,1.67716,0.309976,0.833481,0.565657,-0.906009,-0.346806,2.86492,-0.264675,-0.324411,-0.351468,...,0.749341,-0.9351,1.541608,0.168647,-0.951429,1.436345,0.634674,-0.250853,1.070425,1.568817
2,0.284968,0.442009,-1.520724,-1.171766,-1.262063,-0.346806,-0.34905,3.77822,-0.324411,-0.351468,...,-1.144233,-0.343991,1.651599,1.014901,1.670312,-0.472468,0.188194,-0.561836,-0.927021,-0.01887
3,0.08084,-0.115415,0.930138,0.565657,-0.229506,-0.346806,-0.34905,-0.264675,-0.324411,-0.351468,...,2.008528,0.215965,-1.79109,-0.224865,-1.339929,1.635518,-1.438292,0.806239,1.025922,0.030983
4,1.395385,0.883576,0.226477,1.434369,1.230317,-0.346806,-0.34905,-0.264675,3.082507,-0.351468,...,-1.199357,0.397337,2.063115,0.832813,0.878584,-0.762942,-0.919263,0.138794,-0.570716,1.11182


### Usando o SimpleImputer (média) para preencher os valores NaN

In [39]:
imputer = SimpleImputer(strategy='mean')
imputer.fit(df)

In [40]:
dump(imputer, 'imputer.joblib')

['imputer.joblib']

In [41]:
df = pd.DataFrame(imputer.transform(df), columns=df.columns)

In [42]:
df.head()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Number,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,...,Podcast_Embed_91,Podcast_Embed_92,Podcast_Embed_93,Podcast_Embed_94,Podcast_Embed_95,Podcast_Embed_96,Podcast_Embed_97,Podcast_Embed_98,Podcast_Embed_99,Listening_Time_minutes
0,-3.477432e-16,0.653611,-3.004898e-16,-1.171766,1.657582,-0.346806,-0.34905,-0.264675,-0.324411,-0.351468,...,0.418261,-0.776574,-0.088732,0.567832,1.608396,0.484065,-0.149683,-0.924076,-0.285117,-0.516518
1,1.67716,0.309976,0.8334811,0.565657,-0.906009,-0.346806,2.86492,-0.264675,-0.324411,-0.351468,...,0.749341,-0.9351,1.541608,0.168647,-0.951429,1.436345,0.634674,-0.250853,1.070425,1.568817
2,0.2849676,0.442009,-1.520724,-1.171766,-1.262063,-0.346806,-0.34905,3.77822,-0.324411,-0.351468,...,-1.144233,-0.343991,1.651599,1.014901,1.670312,-0.472468,0.188194,-0.561836,-0.927021,-0.01887
3,0.08084003,-0.115415,0.9301377,0.565657,-0.229506,-0.346806,-0.34905,-0.264675,-0.324411,-0.351468,...,2.008528,0.215965,-1.79109,-0.224865,-1.339929,1.635518,-1.438292,0.806239,1.025922,0.030983
4,1.395385,0.883576,0.2264772,1.434369,1.230317,-0.346806,-0.34905,-0.264675,3.082507,-0.351468,...,-1.199357,0.397337,2.063115,0.832813,0.878584,-0.762942,-0.919263,0.138794,-0.570716,1.11182


### Juntando a coluna de ID novamente

In [43]:
# Put the identifier back as the last column (aligned on the row index).
df = df.assign(id=ids_col)
df.head()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Number,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,...,Podcast_Embed_92,Podcast_Embed_93,Podcast_Embed_94,Podcast_Embed_95,Podcast_Embed_96,Podcast_Embed_97,Podcast_Embed_98,Podcast_Embed_99,Listening_Time_minutes,id
0,-3.477432e-16,0.653611,-3.004898e-16,-1.171766,1.657582,-0.346806,-0.34905,-0.264675,-0.324411,-0.351468,...,-0.776574,-0.088732,0.567832,1.608396,0.484065,-0.149683,-0.924076,-0.285117,-0.516518,0
1,1.67716,0.309976,0.8334811,0.565657,-0.906009,-0.346806,2.86492,-0.264675,-0.324411,-0.351468,...,-0.9351,1.541608,0.168647,-0.951429,1.436345,0.634674,-0.250853,1.070425,1.568817,1
2,0.2849676,0.442009,-1.520724,-1.171766,-1.262063,-0.346806,-0.34905,3.77822,-0.324411,-0.351468,...,-0.343991,1.651599,1.014901,1.670312,-0.472468,0.188194,-0.561836,-0.927021,-0.01887,2
3,0.08084003,-0.115415,0.9301377,0.565657,-0.229506,-0.346806,-0.34905,-0.264675,-0.324411,-0.351468,...,0.215965,-1.79109,-0.224865,-1.339929,1.635518,-1.438292,0.806239,1.025922,0.030983,3
4,1.395385,0.883576,0.2264772,1.434369,1.230317,-0.346806,-0.34905,-0.264675,3.082507,-0.351468,...,0.397337,2.063115,0.832813,0.878584,-0.762942,-0.919263,0.138794,-0.570716,1.11182,4


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Columns: 131 entries, Episode_Length_minutes to id
dtypes: float64(130), int64(1)
memory usage: 749.6 MB


# Gravando o dataframe processado num novo arquivo CSV

In [45]:
# Persist the processed frame; index=False leaves the row index out of the file.
df.to_csv('train_processed.csv', index=False)