## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

sns.set_style("whitegrid")
sns.set_palette("pastel")

In [2]:
df = pd.read_csv("../data/english_data.csv")
df.head()

Unnamed: 0,CommentID,VideoID,VideoTitle,AuthorName,AuthorChannelID,CommentText,Sentiment,Likes,Replies,PublishedAt,CountryCode,CategoryID
0,UgyRjrEdJIPrf68uND14AaABAg,mcY4M9gjtsI,They killed my friend.#tales #movie #shorts,@OneWhoWandered,UC_-UEXaBL1dqqUPGkDll49A,Anyone know what movie this is?,Neutral,0,2,2025-01-15 00:54:55,NZ,1
1,UgxXxEIySAwnMNw8D7N4AaABAg,2vuXcw9SZbA,Man Utd conceding first penalty at home in yea...,@chiefvon3068,UCZ1LcZESjYqzaQRhjdZJFwg,The fact they're holding each other back while...,Positive,0,0,2025-01-13 23:51:46,AU,17
2,UgxB0jh2Ur41mcXr5IB4AaABAg,papg2tsoFzg,Welcome to Javascript Course,@Abdulla-ip8qr,UCWBK35w5Swy1iF5xIbEyw3A,waiting next video will be?,Neutral,1,0,2020-07-06 13:18:16,IN,27
3,UgwMOh95MfK0GuXLLrF4AaABAg,31KTdfRH6nY,Building web applications in Java with Spring ...,@finnianthehuman,UCwQ2Z03nOcMxWozBb_Cv66w,Thanks for the great video.\n\nI don't underst...,Neutral,0,1,2024-09-18 12:04:12,US,27
4,UgxJuUe5ysG8OSbABAl4AaABAg,-hV6aeyPHPA,After a new engine her car dies on her way hom...,@ryoutubeplaylistb6137,UCTTcJ0tsAKQokmHB2qVb1qQ,Good person helping good people.\nThis is how ...,Positive,3,1,2025-01-10 19:39:03,US,2


### Select required columns only

In [3]:
df = df[["CommentText", "Sentiment"]]
df.rename(columns={"CommentText": "Comment"}, inplace=True)

In [4]:
df.head()

Unnamed: 0,Comment,Sentiment
0,Anyone know what movie this is?,Neutral
1,The fact they're holding each other back while...,Positive
2,waiting next video will be?,Neutral
3,Thanks for the great video.\n\nI don't underst...,Neutral
4,Good person helping good people.\nThis is how ...,Positive


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032225 entries, 0 to 1032224
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   Comment    1032225 non-null  object
 1   Sentiment  1032225 non-null  object
dtypes: object(2)
memory usage: 15.8+ MB


### Handle Nulls

In [7]:
df["Comment"] = df["Comment"].str.strip()
null_comments = df[df["Comment"].isnull() | (df["Comment"] == "")]
null_comments.info()

<class 'pandas.core.frame.DataFrame'>
Index: 161 entries, 4604 to 1020885
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Comment    161 non-null    object
 1   Sentiment  161 non-null    object
dtypes: object(2)
memory usage: 3.8+ KB


In [8]:
df = df[df["Comment"].notna() & (df["Comment"] != "")]
df[df["Comment"].isnull() | (df["Comment"] == "")]

Unnamed: 0,Comment,Sentiment


### Handle duplicates

In [9]:
print(df.duplicated().sum())

40335


In [10]:
df.drop_duplicates(inplace=True)
print(df.duplicated().sum())

0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 991729 entries, 0 to 1032224
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Comment    991729 non-null  object
 1   Sentiment  991729 non-null  object
dtypes: object(2)
memory usage: 22.7+ MB


### Handle non english language 

In [12]:
df["Sentiment"].value_counts()

Sentiment
Negative    339843
Neutral     327086
Positive    324800
Name: count, dtype: int64

In [13]:
df.head()

Unnamed: 0,Comment,Sentiment
0,Anyone know what movie this is?,Neutral
1,The fact they're holding each other back while...,Positive
2,waiting next video will be?,Neutral
3,Thanks for the great video.\n\nI don't underst...,Neutral
4,Good person helping good people.\nThis is how ...,Positive


In [14]:
import langid

def detect_language(text):
    try:
        return langid.classify(text)[0]
    except Exception:
        return None

df["language"] = df["Comment"].apply(detect_language)

In [15]:
df.head()

Unnamed: 0,Comment,Sentiment,language
0,Anyone know what movie this is?,Neutral,en
1,The fact they're holding each other back while...,Positive,en
2,waiting next video will be?,Neutral,en
3,Thanks for the great video.\n\nI don't underst...,Neutral,en
4,Good person helping good people.\nThis is how ...,Positive,en


In [16]:
df.value_counts("language")

language
en    869421
zh     10061
tl      9209
es      9096
ru      8291
       ...  
ky        46
ps        34
te        33
gu        29
ug        14
Name: count, Length: 97, dtype: int64

In [17]:
df = df[df["language"] == "en"]
df.shape

(869421, 3)

In [18]:
df.value_counts("Sentiment")

Sentiment
Negative    308079
Neutral     282791
Positive    278551
Name: count, dtype: int64

In [19]:
# not using neutral comments
df = df[df["Sentiment"] != "Neutral"]
df["Sentiment"].value_counts()

Sentiment
Negative    308079
Positive    278551
Name: count, dtype: int64

In [20]:
# Using 220000 rows only
each_class = 220000

df_positive = df[df["Sentiment"] == "Positive"].iloc[:each_class]
df_negative = df[df["Sentiment"] == "Negative"].iloc[:each_class]

new_df = pd.concat([df_positive, df_negative])
df = new_df.sample(frac=1, random_state=42).reset_index(drop=True)

df["Sentiment"].value_counts()

Sentiment
Positive    220000
Negative    220000
Name: count, dtype: int64

In [21]:
df.drop(columns=["language"], inplace=True)
df.head()

Unnamed: 0,Comment,Sentiment
0,"What Asmon said is not contradictory, but 2 se...",Positive
1,"Sir, even with signed execution, when I verify...",Negative
2,"When I use 'git master', I receive: 'git: 'mas...",Negative
3,Russiagate was bullshit? Are you sure about th...,Negative
4,Slipper is the universal weapon of moms!!! ðŸ˜‚ðŸ˜‚ðŸ˜‚,Positive


In [22]:
df.to_csv("../data/english_only_data.csv", index=False)