# Data Preprocessing

In [41]:
import numpy as np
import pandas as pd
import re
import nltk
import string

In [100]:
# Load the raw comments (first CSV column becomes the row index) and force the
# 'Comment' column to plain Python strings so later text processing sees one type.
# NOTE(review): astype(str) presumably turns any missing comment into the literal
# string "nan", which would hide it from null checks -- confirm this is intended.
df = (
    pd.read_csv("../data/raw/comments.csv", sep=',', index_col=0)
    .assign(Comment=lambda frame: frame["Comment"].astype(str))
)
df.head()

Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,wAZZ-UWGVHI,Let's not forget that Apple Pay in 2014 requir...,95.0,1.0
1,wAZZ-UWGVHI,Here in NZ 50% of retailers don’t even have co...,19.0,0.0
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161.0,2.0
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8.0,0.0
4,wAZZ-UWGVHI,"Apple Pay is so convenient, secure, and easy t...",34.0,2.0


In [101]:
# Print the column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18409 entries, 0 to 18408
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Video ID   18409 non-null  object 
 1   Comment    18409 non-null  object 
 2   Likes      18409 non-null  float64
 3   Sentiment  18409 non-null  float64
dtypes: float64(2), object(2)
memory usage: 719.1+ KB


In [102]:
# Project-local helper module (imported here rather than in the top import cell).
from data import make_dataset

# Clean the text in the 'Comment' column. Judging by the displayed result, this
# strips ASCII punctuation ("Let's" -> "Lets", "50%" -> "50") while leaving the
# typographic apostrophe (’) untouched -- TODO confirm against clean_data itself.
df_clean = make_dataset.clean_data(df, text_col='Comment')
df_clean

Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,wAZZ-UWGVHI,Lets not forget that Apple Pay in 2014 require...,95.0,1.0
1,wAZZ-UWGVHI,Here in NZ 50 of retailers don’t even have con...,19.0,0.0
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161.0,2.0
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8.0,0.0
4,wAZZ-UWGVHI,Apple Pay is so convenient secure and easy to ...,34.0,2.0
...,...,...,...,...
18404,cyLWtMSry58,I really like the point about engineering tool...,0.0,2.0
18405,cyLWtMSry58,I’ve just started exploring this field And thi...,20.0,2.0
18406,cyLWtMSry58,Excelente video con una pregunta filosófica pr...,1.0,1.0
18407,cyLWtMSry58,Hey Daniel just discovered your channel a coup...,35.0,2.0


There are 100 videos with fewer than 10 comments:

In [103]:
# One row per video: 'Video ID' plus the number of non-null comments it has
# (stored under the 'Comment' column name).
df_video = df_clean.groupby('Video ID')['Comment'].count().reset_index()
# How many videos have fewer than 10 comments.
int(df_video['Comment'].lt(10).sum())

100

Because the number of comments in those videos varies widely, we will remove those videos from the data set.

In [104]:
# List the videos that have fewer than 10 comments.
df_video.loc[df_video['Comment'].lt(10)]

Unnamed: 0,Video ID,Comment
9,-D4S6TpnO44,2
21,-VVLmBXHvL8,5
28,-o380SFxMrg,6
63,0vEdNPoeemQ,1
73,191wd4NscHo,7
...,...,...
1730,vcmv3tfuk9I,7
1734,vl9E6lND4V0,9
1736,voU9-39bicg,1
1763,wjhbTqGt5tc,1


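A few videos have more than 10 comments. We will keep the ones with exactly 20 comments (alongside the videos with exactly 10); videos with any other count, such as the one with 16, are dropped.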

In [105]:
# List the videos that have more than 10 comments.
df_video.loc[df_video['Comment'].gt(10)]

Unnamed: 0,Video ID,Comment
111,2FYvHn12pOQ,20
178,4mgePWWCAmA,20
203,5q87K1WaoFI,20
251,7eh4d6sabA0,20
283,96mrgd8-3yE,20
630,LeC5yJq4tsI,20
795,R-R0KrXvWbc,20
808,RFMi3v0TXP8,20
949,WBK2_lD7KGA,16
1051,ZgeorpjGJC0,20


In [106]:
# Lookup table: video id -> number of comments for that video.
video_comments = dict(zip(df_video['Video ID'], df_video['Comment']))

In [107]:
# Attach each row's per-video comment count using a vectorised lookup instead of
# a Python-level loop over every row.
df_clean['Num_Comments'] = df_clean['Video ID'].map(video_comments)
# Series.map yields NaN for ids missing from the lookup; fail loudly in that case,
# as the previous dict indexing did.
assert df_clean['Num_Comments'].notna().all(), "video id without a comment count"

In [108]:
# Inspect the frame now that every row carries its video's comment count.
df_clean

Unnamed: 0,Video ID,Comment,Likes,Sentiment,Num_Comments
0,wAZZ-UWGVHI,Lets not forget that Apple Pay in 2014 require...,95.0,1.0,10
1,wAZZ-UWGVHI,Here in NZ 50 of retailers don’t even have con...,19.0,0.0,10
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161.0,2.0,10
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8.0,0.0,10
4,wAZZ-UWGVHI,Apple Pay is so convenient secure and easy to ...,34.0,2.0,10
...,...,...,...,...,...
18404,cyLWtMSry58,I really like the point about engineering tool...,0.0,2.0,10
18405,cyLWtMSry58,I’ve just started exploring this field And thi...,20.0,2.0,10
18406,cyLWtMSry58,Excelente video con una pregunta filosófica pr...,1.0,1.0,10
18407,cyLWtMSry58,Hey Daniel just discovered your channel a coup...,35.0,2.0,10


In [111]:
# Keep only the rows whose video has exactly 10 or exactly 20 comments, renumber
# the rows from 0, and discard the helper count column once it has been used.
has_kept_count = df_clean['Num_Comments'].isin([10, 20])
df_clean = (
    df_clean[has_kept_count]
    .reset_index(drop=True)
    .drop(columns=['Num_Comments'])
)
df_clean

Unnamed: 0,Video ID,Comment,Likes,Sentiment
0,wAZZ-UWGVHI,Lets not forget that Apple Pay in 2014 require...,95.0,1.0
1,wAZZ-UWGVHI,Here in NZ 50 of retailers don’t even have con...,19.0,0.0
2,wAZZ-UWGVHI,I will forever acknowledge this channel with t...,161.0,2.0
3,wAZZ-UWGVHI,Whenever I go to a place that doesn’t take App...,8.0,0.0
4,wAZZ-UWGVHI,Apple Pay is so convenient secure and easy to ...,34.0,2.0
...,...,...,...,...
17785,cyLWtMSry58,I really like the point about engineering tool...,0.0,2.0
17786,cyLWtMSry58,I’ve just started exploring this field And thi...,20.0,2.0
17787,cyLWtMSry58,Excelente video con una pregunta filosófica pr...,1.0,1.0
17788,cyLWtMSry58,Hey Daniel just discovered your channel a coup...,35.0,2.0
