## Data preprocessing

In [1]:
import pandas as pd
import emoji as emoji_lib
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

## Read dataset

In [2]:
# read the dataset
df = pd.read_csv('./youtube_comments_new.csv')
df

Unnamed: 0,Author,Comment,sentiment_label
0,Unhappy Bacon,My friends loved it and agreed with everything...,positive
1,Amadanman85,I vow never ever too see this Movie.,negative
2,A Chambers,The real world.... A thought experiment......🇬🇧,neutral
3,Irisdew,I really enjoyed the movie and didn&#39;t find...,positive
4,Ben,I hope she’s joking. It’s a movie about Barbie...,positive
...,...,...,...
17500,Daz Hatz,It makes you want to puke doesn&#39;t it.,negative
17501,4EVERSEEKING WISDOM,If Cleopatra was supposed to be so beautiful t...,negative
17502,Senni,what a time we live in when the &quot;Asterix ...,neutral
17503,Krystal Myth,Egypt deserves to win their case against corpo...,negative


## Handle missing data

In [3]:
na_rows = df[df.isna().any(axis=1)]
print(na_rows)

      Author                                            Comment   
943      NaN  I saw the movie and found the plot lacking so ...  \
12246    NaN  ANOTHER thing that disney screwed up in star w...   

      sentiment_label  
943           neutral  
12246        negative  


In [4]:
df.dropna(inplace=True)
na_rows = df[df.isna().any(axis=1)]
print(na_rows)
## no na values left

Empty DataFrame
Columns: [Author, Comment, sentiment_label]
Index: []


In [5]:
df['sentiment_label'].value_counts()

sentiment_label
negative    9140
positive    4602
neutral     3761
Name: count, dtype: int64

## Data Cleaning

In [6]:
import html
 
# remove HTML entity codes. 
def remove_html_codes(text):
    decoded_text = html.unescape(text)
    return decoded_text

df['Comment'] = df['Comment'].apply(remove_html_codes)

In [7]:
# convert text to lower case
df['Comment'] = df['Comment'].str.lower()
df

Unnamed: 0,Author,Comment,sentiment_label
0,Unhappy Bacon,my friends loved it and agreed with everything...,positive
1,Amadanman85,i vow never ever too see this movie.,negative
2,A Chambers,the real world.... a thought experiment......🇬🇧,neutral
3,Irisdew,i really enjoyed the movie and didn't find it ...,positive
4,Ben,i hope she’s joking. it’s a movie about barbie...,positive
...,...,...,...
17500,Daz Hatz,it makes you want to puke doesn't it.,negative
17501,4EVERSEEKING WISDOM,if cleopatra was supposed to be so beautiful t...,negative
17502,Senni,"what a time we live in when the ""asterix & obe...",neutral
17503,Krystal Myth,egypt deserves to win their case against corpo...,negative


By taking a look at the data, we found some special cases to deal with
1. url links
2. emojis
3. slangs


In [8]:
# remove url links
def remove_links(text):
    # match strings starting with 'http'
    text = re.sub(r'http\S+', '', text)
    # match strings start with '<' and end with '>'
    text = re.sub(r'<.*?>', '', text)
    return text

df['Comment'] = df['Comment'].apply(remove_links)
df

Unnamed: 0,Author,Comment,sentiment_label
0,Unhappy Bacon,my friends loved it and agreed with everything...,positive
1,Amadanman85,i vow never ever too see this movie.,negative
2,A Chambers,the real world.... a thought experiment......🇬🇧,neutral
3,Irisdew,i really enjoyed the movie and didn't find it ...,positive
4,Ben,i hope she’s joking. it’s a movie about barbie...,positive
...,...,...,...
17500,Daz Hatz,it makes you want to puke doesn't it.,negative
17501,4EVERSEEKING WISDOM,if cleopatra was supposed to be so beautiful t...,negative
17502,Senni,"what a time we live in when the ""asterix & obe...",neutral
17503,Krystal Myth,egypt deserves to win their case against corpo...,negative


In [9]:
# read the emojis.txt file
with open('emojis.txt', 'r', encoding='utf-8') as file:
    emoji_content = file.read()

# create emoji dictionary from emojis.txt file
emoji_dict = {}

for line in emoji_content.strip().split("\n"):
    emoji, meaning = line.split(": ", 1)
    emoji_dict[emoji] = meaning

print(emoji_dict)

{'😂': 'Extreme happiness, laughter', '❤️': 'Love', '🤣': 'Hysterical laughter', '👍': 'Well done, good job, approval', '😭': 'Uncontrollable sadness, joy', '🙏': 'Prayer, thank you, high five', '😘': 'Kiss, love', '🥰': 'Love, affection', '😍': 'Love, adoration', '😊': 'Positive, happy', '🎉': 'Celebration, congratulations', '😁': 'Glowing, beaming, happy', '💕': 'Love is in the air', '🥺': 'Adoration, bashful, pleading', '😅': 'Relief, nerves, excitement', '🔥': 'Hot, excellent', '☺️': 'Happy, positive', '🤦': 'Frustrated, dumbfounded', '♥️': 'Love', '🤷': 'Indifference, unknowing', '🙄': 'Sarcasm, boredom', '😆': 'Excitement, laughter, joy', '🤗': 'Hugging (love and support), jazz hands (enthusiasm)', '😉': 'Joking, cheeky', '🎂': 'Celebration, birthday', '🤔': 'Ponder, question', '👏': 'Applause', '🙂': 'Happy (sometimes ironic)', '😳': 'Embarrassed, surprise, flattered', '🥳': 'Celebration, joy', '😎': 'Cool, confident', '👌': 'Okay, approval, correct', '💜': 'Love', '😔': 'Reflective, remorseful', '💪': 'Streng

In [10]:
# use python emoji library to deal with emoji
# reference: https://carpedm20.github.io/emoji/docs/index.html
# the converted text is sth like :smiley_face:
# need to remove ':' and replace '_' with ' '

def convert_emojis(text):
    for em, meaning in emoji_dict.items():
        text = text.replace(em, meaning)
    return emoji_lib.demojize(text)

df['Comment'] = df['Comment'].apply(convert_emojis)
df


Unnamed: 0,Author,Comment,sentiment_label
0,Unhappy Bacon,my friends loved it and agreed with everything...,positive
1,Amadanman85,i vow never ever too see this movie.,negative
2,A Chambers,the real world.... a thought experiment......:...,neutral
3,Irisdew,i really enjoyed the movie and didn't find it ...,positive
4,Ben,i hope she’s joking. it’s a movie about barbie...,positive
...,...,...,...
17500,Daz Hatz,it makes you want to puke doesn't it.,negative
17501,4EVERSEEKING WISDOM,if cleopatra was supposed to be so beautiful t...,negative
17502,Senni,"what a time we live in when the ""asterix & obe...",neutral
17503,Krystal Myth,egypt deserves to win their case against corpo...,negative


## Data preprocessing

In [11]:
def preprocess(text):
    # tokenization
    tokens = word_tokenize(text)

    # stop words removal
    stop_words_removed = [word for word in tokens if word not in stopwords.words('english')]

    # remove punctuation
    punctuation_removed = [word for word in stop_words_removed if word not in list(string.punctuation)]

    # lemmatization
    lemmatized_text = [WordNetLemmatizer().lemmatize(word) for word in punctuation_removed]

    return ' '.join(lemmatized_text)

df['Comment'] = df['Comment'].apply(preprocess)
df

Unnamed: 0,Author,Comment,sentiment_label
0,Unhappy Bacon,friend loved agreed everything movie trying sa...,positive
1,Amadanman85,vow never ever see movie,negative
2,A Chambers,real world .... thought experiment ...... Unit...,neutral
3,Irisdew,really enjoyed movie n't find overly feminist ...,positive
4,Ben,hope ’ joking ’ movie barbie ’ woman highlight...,positive
...,...,...,...
17500,Daz Hatz,make want puke n't,negative
17501,4EVERSEEKING WISDOM,cleopatra supposed beautiful pick ugly actress,negative
17502,Senni,time live `` asterix obelix '' depicted histor...,neutral
17503,Krystal Myth,egypt deserves win case corporate netflix woul...,negative


In [12]:
# slangs dictionary
# reference: https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing
slang_dict = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [13]:
# deal with slangs
def convert_slang(text):
    words = text.split()
    corrected_words = [slang_dict.get(word, word) for word in words]
    return ' '.join(corrected_words)

df['Comment'] = df['Comment'].apply(convert_slang)
df

Unnamed: 0,Author,Comment,sentiment_label
0,Unhappy Bacon,friend loved agreed everything movie trying sa...,positive
1,Amadanman85,vow never ever see movie,negative
2,A Chambers,real world .... thought experiment ...... Unit...,neutral
3,Irisdew,really enjoyed movie n't find overly feminist ...,positive
4,Ben,hope ’ joking ’ movie barbie ’ woman highlight...,positive
...,...,...,...
17500,Daz Hatz,make want puke n't,negative
17501,4EVERSEEKING WISDOM,cleopatra supposed beautiful pick ugly actress,negative
17502,Senni,time live `` asterix obelix '' depicted histor...,neutral
17503,Krystal Myth,egypt deserves win case corporate netflix woul...,negative


In [14]:
# remove special characters, return only letters, numbers and whitespaces
def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
df['Comment'] = df['Comment'].apply(remove_special_characters)
df

Unnamed: 0,Author,Comment,sentiment_label
0,Unhappy Bacon,friend loved agreed everything movie trying sa...,positive
1,Amadanman85,vow never ever see movie,negative
2,A Chambers,real world thought experiment UnitedKingdom,neutral
3,Irisdew,really enjoyed movie nt find overly feminist ...,positive
4,Ben,hope joking movie barbie woman highlight is...,positive
...,...,...,...
17500,Daz Hatz,make want puke nt,negative
17501,4EVERSEEKING WISDOM,cleopatra supposed beautiful pick ugly actress,negative
17502,Senni,time live asterix obelix depicted history ac...,neutral
17503,Krystal Myth,egypt deserves win case corporate netflix woul...,negative


In [15]:
# filter comments between 10 and 500 chars in length
df['comment_length'] = df['Comment'].apply(len)

df.drop(df[(df['comment_length'] < 10) | (df['comment_length'] > 500)].index, inplace=True)

df.drop(columns=['comment_length'], inplace=True)

df


Unnamed: 0,Author,Comment,sentiment_label
0,Unhappy Bacon,friend loved agreed everything movie trying sa...,positive
1,Amadanman85,vow never ever see movie,negative
2,A Chambers,real world thought experiment UnitedKingdom,neutral
4,Ben,hope joking movie barbie woman highlight is...,positive
5,Ordinary Citizen,movie sound like social psyop s designed insta...,negative
...,...,...,...
17499,magnus horus,video entertaining well made cathartic listen...,positive
17500,Daz Hatz,make want puke nt,negative
17501,4EVERSEEKING WISDOM,cleopatra supposed beautiful pick ugly actress,negative
17502,Senni,time live asterix obelix depicted history ac...,neutral


In [16]:
df['sentiment_label'].value_counts()

sentiment_label
negative    8545
positive    4269
neutral     3388
Name: count, dtype: int64

In [17]:
df_temp = df.copy()

In [18]:
df = df_temp

In [19]:
# Assuming df is your dataframe

# Under-sampling negative class
negative_sampled = df[df['sentiment_label'] == 'negative'].sample(4000)

# Getting positive class
positive = df[df['sentiment_label'] == 'positive'].sample(3500)

# Getting neutral class
neutral_sampled = df[df['sentiment_label'] == 'neutral']

# Combining all
balanced_df = pd.concat([negative_sampled, neutral_sampled, positive])


In [20]:
df = balanced_df

In [21]:
df['sentiment_label'].value_counts()

sentiment_label
negative    4000
positive    3500
neutral     3388
Name: count, dtype: int64

# FEATURE ENGINEERING

In [22]:
df.to_csv('df_cleaned_new.csv', index=False)


In [23]:
df['processed_comment'] = df['Comment']
df['labels'] = df['sentiment_label']

In [24]:
from sklearn.model_selection import train_test_split

# Splitting the data into training and testing sets (80-20 split)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=11, stratify=df['labels'])


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['processed_comment'])
X_test_tfidf = tfidf_vectorizer.transform(test_data['processed_comment'])


In [26]:
import gensim.downloader as api
import numpy as np

glove_model = api.load("glove-wiki-gigaword-100")


In [27]:
def get_glove_embedding(comment):
    words = comment.split()
    embeddings = [glove_model[word] for word in words if word in glove_model.key_to_index]
    
    if not embeddings:
        return np.zeros(glove_model.vector_size)
    
    return np.mean(embeddings, axis=0)

X_train_glove = np.array([get_glove_embedding(comment) for comment in train_data['processed_comment']])
X_test_glove = np.array([get_glove_embedding(comment) for comment in test_data['processed_comment']])


In [28]:
# !pip install transformers

In [29]:
from transformers import BertTokenizer, BertModel
import torch



# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")


# Load pre-trained BERT model and tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True).to(device)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Using GPU: NVIDIA GeForce RTX 3060 Laptop GPU


In [30]:
def get_bert_embedding(comment):
    inputs = bert_tokenizer.encode_plus(
        comment,
        add_special_tokens=True,
        return_tensors='pt',
        max_length=128,
        pad_to_max_length=True,
        return_attention_mask=True
    )
    
    # Move input tensors to the GPU
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])

    # Extract the [CLS] token's embedding and move it back to the CPU
    cls_embedding = outputs['last_hidden_state'][:, 0, :].squeeze().cpu().numpy()
    return cls_embedding


X_train_bert = np.array([get_bert_embedding(comment) for comment in train_data['processed_comment']])
X_test_bert = np.array([get_bert_embedding(comment) for comment in test_data['processed_comment']])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [31]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear', probability=True)

# Train the classifier using the training data (TF-IDF representations)
svm_classifier.fit(X_train_tfidf, train_data['labels'])


In [32]:
# Predict sentiments for training and testing data
train_predictions = svm_classifier.predict(X_train_tfidf)
test_predictions = svm_classifier.predict(X_test_tfidf)

# Evaluate the classifier's performance
train_accuracy = accuracy_score(train_data['labels'], train_predictions)
test_accuracy = accuracy_score(test_data['labels'], test_predictions)
train_report = classification_report(train_data['labels'], train_predictions)
test_report = classification_report(test_data['labels'], test_predictions)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)
print("\nTraining Classification Report:\n", train_report)
print("\nTesting Classification Report:\n", test_report)


Training Accuracy: 0.8408725602755454
Testing Accuracy: 0.6515151515151515

Training Classification Report:
               precision    recall  f1-score   support

    negative       0.83      0.86      0.84      3200
     neutral       0.81      0.81      0.81      2710
    positive       0.89      0.85      0.87      2800

    accuracy                           0.84      8710
   macro avg       0.84      0.84      0.84      8710
weighted avg       0.84      0.84      0.84      8710


Testing Classification Report:
               precision    recall  f1-score   support

    negative       0.65      0.68      0.66       800
     neutral       0.57      0.60      0.58       678
    positive       0.74      0.68      0.71       700

    accuracy                           0.65      2178
   macro avg       0.65      0.65      0.65      2178
weighted avg       0.66      0.65      0.65      2178



In [33]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Initialize the SVM classifier
svm_classifier_glove = SVC(kernel='linear', probability=True)

# Train the classifier using the training data (GloVe representations)
svm_classifier_glove.fit(X_train_glove, train_data['labels'])


In [34]:
# Predict sentiments for training and testing data
train_predictions_glove = svm_classifier_glove.predict(X_train_glove)
test_predictions_glove = svm_classifier_glove.predict(X_test_glove)

# Evaluate the classifier's performance
train_accuracy_glove = accuracy_score(train_data['labels'], train_predictions_glove)
test_accuracy_glove = accuracy_score(test_data['labels'], test_predictions_glove)
train_report_glove = classification_report(train_data['labels'], train_predictions_glove)
test_report_glove = classification_report(test_data['labels'], test_predictions_glove)

print("Training Accuracy (GloVe):", train_accuracy_glove)
print("Testing Accuracy (GloVe):", test_accuracy_glove)
print("\nTraining Classification Report (GloVe):\n", train_report_glove)
print("\nTesting Classification Report (GloVe):\n", test_report_glove)


Training Accuracy (GloVe): 0.6420206659012629
Testing Accuracy (GloVe): 0.6221303948576676

Training Classification Report (GloVe):
               precision    recall  f1-score   support

    negative       0.65      0.71      0.68      3200
     neutral       0.60      0.55      0.57      2710
    positive       0.67      0.66      0.67      2800

    accuracy                           0.64      8710
   macro avg       0.64      0.64      0.64      8710
weighted avg       0.64      0.64      0.64      8710


Testing Classification Report (GloVe):
               precision    recall  f1-score   support

    negative       0.63      0.69      0.66       800
     neutral       0.58      0.52      0.55       678
    positive       0.65      0.64      0.65       700

    accuracy                           0.62      2178
   macro avg       0.62      0.62      0.62      2178
weighted avg       0.62      0.62      0.62      2178



In [35]:
# Initialize another SVM classifier for BERT
svm_classifier_bert = SVC(kernel='linear', probability=True)

# Train the classifier using the training data (BERT representations)
svm_classifier_bert.fit(X_train_bert, train_data['labels'])  # Note: Using train_data since we took a subset for BERT


In [36]:
# Predict sentiments for training and testing data
train_predictions_bert = svm_classifier_bert.predict(X_train_bert)
test_predictions_bert = svm_classifier_bert.predict(X_test_bert)

# Evaluate the classifier's performance
train_accuracy_bert = accuracy_score(train_data['labels'], train_predictions_bert)
test_accuracy_bert = accuracy_score(test_data['labels'], test_predictions_bert)
train_report_bert = classification_report(train_data['labels'], train_predictions_bert)
test_report_bert = classification_report(test_data['labels'], test_predictions_bert)

print("Training Accuracy (BERT):", train_accuracy_bert)
print("Testing Accuracy (BERT):", test_accuracy_bert)
print("\nTraining Classification Report (BERT):\n", train_report_bert)
print("\nTesting Classification Report (BERT):\n", test_report_bert)


Training Accuracy (BERT): 0.7451205510907003
Testing Accuracy (BERT): 0.6253443526170799

Training Classification Report (BERT):
               precision    recall  f1-score   support

    negative       0.74      0.78      0.76      3200
     neutral       0.73      0.69      0.71      2710
    positive       0.77      0.76      0.77      2800

    accuracy                           0.75      8710
   macro avg       0.75      0.74      0.74      8710
weighted avg       0.75      0.75      0.74      8710


Testing Classification Report (BERT):
               precision    recall  f1-score   support

    negative       0.65      0.69      0.67       800
     neutral       0.56      0.51      0.54       678
    positive       0.65      0.66      0.66       700

    accuracy                           0.63      2178
   macro avg       0.62      0.62      0.62      2178
weighted avg       0.62      0.63      0.62      2178



In [37]:
# from joblib import dump

# # Save the model to a file
# dump(svm_classifier_bert, 'svm_classifier_bert.joblib')


In [38]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier using the training data (TF-IDF representations)
nb_classifier.fit(X_train_tfidf, train_data['labels'])

# Predict sentiments for training and testing data
train_predictions_nb = nb_classifier.predict(X_train_tfidf)
test_predictions_nb = nb_classifier.predict(X_test_tfidf)

# Evaluate the classifier's performance
train_accuracy_nb = accuracy_score(train_data['labels'], train_predictions_nb)
test_accuracy_nb = accuracy_score(test_data['labels'], test_predictions_nb)
train_report_nb = classification_report(train_data['labels'], train_predictions_nb)
test_report_nb = classification_report(test_data['labels'], test_predictions_nb)

print("NB Training Accuracy:", train_accuracy_nb)
print("NB Testing Accuracy:", test_accuracy_nb)
print("\nNB Training Classification Report:\n", train_report_nb)
print("\nNB Testing Classification Report:\n", test_report_nb)


NB Training Accuracy: 0.7690011481056257
NB Testing Accuracy: 0.6345270890725436

NB Training Classification Report:
               precision    recall  f1-score   support

    negative       0.72      0.86      0.79      3200
     neutral       0.80      0.62      0.70      2710
    positive       0.81      0.81      0.81      2800

    accuracy                           0.77      8710
   macro avg       0.78      0.76      0.76      8710
weighted avg       0.78      0.77      0.77      8710


NB Testing Classification Report:
               precision    recall  f1-score   support

    negative       0.60      0.79      0.68       800
     neutral       0.63      0.40      0.49       678
    positive       0.68      0.69      0.68       700

    accuracy                           0.63      2178
   macro avg       0.64      0.62      0.62      2178
weighted avg       0.64      0.63      0.62      2178



In [39]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree classifier
dt_classifier = DecisionTreeClassifier()

# Train the classifier using the training data (TF-IDF representations)
dt_classifier.fit(X_train_tfidf, train_data['labels'])

# Predict sentiments for training and testing data
train_predictions_dt = dt_classifier.predict(X_train_tfidf)
test_predictions_dt = dt_classifier.predict(X_test_tfidf)

# Evaluate the classifier's performance
train_accuracy_dt = accuracy_score(train_data['labels'], train_predictions_dt)
test_accuracy_dt = accuracy_score(test_data['labels'], test_predictions_dt)
train_report_dt = classification_report(train_data['labels'], train_predictions_dt)
test_report_dt = classification_report(test_data['labels'], test_predictions_dt)

print("DT Training Accuracy:", train_accuracy_dt)
print("DT Testing Accuracy:", test_accuracy_dt)
print("\nDT Training Classification Report:\n", train_report_dt)
print("\nDT Testing Classification Report:\n", test_report_dt)


DT Training Accuracy: 0.9986222732491389
DT Testing Accuracy: 0.5532598714416896

DT Training Classification Report:
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00      3200
     neutral       1.00      1.00      1.00      2710
    positive       1.00      1.00      1.00      2800

    accuracy                           1.00      8710
   macro avg       1.00      1.00      1.00      8710
weighted avg       1.00      1.00      1.00      8710


DT Testing Classification Report:
               precision    recall  f1-score   support

    negative       0.56      0.53      0.54       800
     neutral       0.49      0.51      0.50       678
    positive       0.61      0.62      0.62       700

    accuracy                           0.55      2178
   macro avg       0.55      0.55      0.55      2178
weighted avg       0.55      0.55      0.55      2178



In [43]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree classifier with hyperparameters to reduce overfitting
dt_classifier = DecisionTreeClassifier(max_depth=15, min_samples_split=10, min_samples_leaf=5)

# Train the classifier using the training data (TF-IDF representations)
dt_classifier.fit(X_train_tfidf, train_data['labels'])

# Predict sentiments for training and testing data
train_predictions_dt = dt_classifier.predict(X_train_tfidf)
test_predictions_dt = dt_classifier.predict(X_test_tfidf)

# Evaluate the classifier's performance
train_accuracy_dt = accuracy_score(train_data['labels'], train_predictions_dt)
test_accuracy_dt = accuracy_score(test_data['labels'], test_predictions_dt)
train_report_dt = classification_report(train_data['labels'], train_predictions_dt)
test_report_dt = classification_report(test_data['labels'], test_predictions_dt)

print("DT Training Accuracy:", train_accuracy_dt)
print("DT Testing Accuracy:", test_accuracy_dt)
print("\nDT Training Classification Report:\n", train_report_dt)
print("\nDT Testing Classification Report:\n", test_report_dt)

DT Training Accuracy: 0.557060849598163
DT Testing Accuracy: 0.5009182736455464

DT Training Classification Report:
               precision    recall  f1-score   support

    negative       0.59      0.45      0.51      3200
     neutral       0.44      0.76      0.56      2710
    positive       0.88      0.48      0.62      2800

    accuracy                           0.56      8710
   macro avg       0.63      0.56      0.56      8710
weighted avg       0.63      0.56      0.56      8710


DT Testing Classification Report:
               precision    recall  f1-score   support

    negative       0.51      0.40      0.45       800
     neutral       0.42      0.73      0.53       678
    positive       0.74      0.40      0.52       700

    accuracy                           0.50      2178
   macro avg       0.56      0.51      0.50      2178
weighted avg       0.56      0.50      0.50      2178

