# Pip Install

In [1]:
# Use the %pip magic explicitly so the packages are installed into the environment
# of the running kernel, without relying on IPython's automagic for a bare `pip`.
# scikit-learn is listed because later cells import from `sklearn`.
%pip install numpy pandas torch matplotlib seaborn emoji nltk scikit-learn

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


# Loading Data and Packages


In [2]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns

# Never truncate cell contents when displaying frames, so full tweet text stays readable.
pd.options.display.max_colwidth = None


In [3]:
# Labelled tweets used for training; the first CSV row is the header.
TRAIN_URL = "https://raw.githubusercontent.com/rrandev03/NLP_Final/refs/heads/main/100_data/102_processed/full_cleaned_df.csv"
real_train = pd.read_csv(TRAIN_URL, header=0)
real_train.head(3)

Unnamed: 0.1,Unnamed: 0,Tweet_Content,Labels,Cleaned_Tweets,Sentiment
0,0,If you ask me $NIO ET7 is on elf the best looking cars all around the world! I really love this electric vehicle! https://t.co/5LH739LEWj,POSITIVE,if you ask me nio et7 is on elf the best looking cars all around the world i really love this electric vehicle,2
1,1,"Here are the annual UK vehicle registration % for electric vehicles. In December, 1 in 6 cars registered were fully electric - significantly higher than diesel. @CrowdCharge @DriveElectricUK #electricvehicles #BuildBackBetter \n [Thanks @SMMT for data] https://t.co/VZsp7Eusv5",POSITIVE,here are the annual uk vehicle registration for electric vehicles in december 1 in 6 cars registered were fully electric significantly higher than diesel electricvehicles buildbackbetter thanks for data,2
2,2,"Robert Downey Jr. turns his classic cars into electric vehicles as part of a new documentary series, ‘Downey's Dream Cars’ https://t.co/NGv3ekJb9r",NEUTRAL,robert downey jr turns his classic cars into electric vehicles as part of a new documentary series downeys dream cars,1


In [4]:
# Show any rows whose Sentiment label is missing (an empty frame means none are).
missing_label = real_train["Sentiment"].isna()
print(real_train[missing_label])

# Class distribution. Sentiment codes: 2 = positive, 1 = neutral, 0 = negative.
positive, neutral, negative = (
    (real_train["Sentiment"] == code).sum() for code in (2, 1, 0)
)
print(positive, neutral, negative)

Empty DataFrame
Columns: [Unnamed: 0, Tweet_Content, Labels, Cleaned_Tweets, Sentiment]
Index: []
171 60 72


# Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled tweets for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (171 / 60 / 72 in the counts printed above) — consider passing `stratify=`.
tweet_text = real_train['Cleaned_Tweets']
sentiment_codes = real_train['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    tweet_text,
    sentiment_codes,
    test_size=0.2,
    random_state=42,
)

# TF-IDF Vectorisation

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Bag-of-words counts. The vocabulary is learned from the training tweets only and
# then reused, unchanged, for the test tweets.
# Note: despite its name, `clf` is the CountVectorizer, not the classifier; the name
# is kept because the prediction cells further down refer to it.
clf = CountVectorizer()
X_train_cv, X_test_cv = clf.fit_transform(X_train), clf.transform(X_test)

# IDF weights are likewise estimated on the training counts only.
tf_transformer = TfidfTransformer(use_idf=True)
tf_transformer.fit(X_train_cv)
X_train_tf, X_test_tf = (
    tf_transformer.transform(counts) for counts in (X_train_cv, X_test_cv)
)

# Fit Complement NB

In [7]:
from sklearn.naive_bayes import ComplementNB

# Train Complement Naive Bayes on the TF-IDF training features, then score the held-out tweets.
# fit() returns the fitted estimator, so construction and training can be chained.
model = ComplementNB().fit(X_train_tf, y_train)
result = model.predict(X_test_tf)

# Results

In [8]:
from sklearn.metrics import classification_report

# Pin the label codes explicitly so each display name is always tied to the same
# code (0 = Negative, 1 = Neutral, 2 = Positive). Without `labels`, the classes are
# inferred from the data, and a split that happens to contain fewer than three
# classes makes `target_names` mismatch and raises a ValueError.
report = classification_report(
    y_test,
    result,
    labels=[0, 1, 2],
    target_names=['Negative', 'Neutral', 'Positive'],
)
print('\tClassification Report for (Complement) Naive Bayes:\n\n', report)

	Classification Report for (Complement) Naive Bayes:

               precision    recall  f1-score   support

    Negative       0.75      0.19      0.30        16
     Neutral       0.50      0.07      0.12        14
    Positive       0.55      0.97      0.70        31

    accuracy                           0.56        61
   macro avg       0.60      0.41      0.37        61
weighted avg       0.59      0.56      0.46        61



# Run on the Rest of the Tweets

In [15]:
# Raw, unlabelled tweets to be scored by the trained model.
# Rows with a missing value in any column are dropped straight after loading.
REST_URL = "https://raw.githubusercontent.com/rrandev03/NLP_Final/main/100_data/101_raw/combined_file.csv"
rest = pd.read_csv(REST_URL).dropna()
rest.head(5)

Unnamed: 0,UTC_Time,Tweet_Content
0,2023-01-01 01:56:24+00:00,"Free unlimited data, calls, text with an ID card and EBT card in America. What’s next, free electric vehicles for welfare recipients?\n\nAmerica is being dismantled from within. https://t.co/SQtcFlTBBM"
1,2023-01-01 17:06:22+00:00,"👉 10 years of EV growth\n\nIn 2011, around 55,000 electric vehicles (EVs) were sold around the world. 10 years later in 2021, that figure had grown close to 7 million vehicles. https://t.co/cKBgx3jW8r"
2,2023-01-01 13:42:00+00:00,The world’s first commercial solar electric vehicles are hitting the U.S. and European markets in the next few years. That's right: cars powered by the sun are coming. https://t.co/9Dz2IgXQhN https://t.co/vTKUtsE654
3,2023-01-01 15:04:00+00:00,Nickel is a key component in most lithium-ion batteries used in electric vehicles. Watch the video to learn where the U.S. sources its nickel. https://t.co/iBPHoWG5Iy https://t.co/u6Q43RQiRM
4,2023-01-01 19:40:30+00:00,This experimental road is one very long charger for electric vehicles https://t.co/rq1P2OFYmd


## Cleaning the Tweets

In [16]:
import re, string
import emoji
import nltk

def strip_emoji(text):
    """Return ``text`` with every emoji removed.

    Delegates to ``emoji.replace_emoji`` with an empty replacement string, so
    each emoji is deleted rather than substituted.
    """
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

# Built once at definition time rather than on every call: the function is applied
# to each tweet, and neither the patterns nor the table ever change.
_LINK_OR_MENTION_RE = re.compile(r"(?:\@|https?\://)\S+")
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def strip_all_entities(text):
    """Lower-case a tweet and strip line breaks, links, mentions and punctuation.

    Steps, in order:
      1. Carriage returns are deleted, newlines become a single space, and the
         text is lower-cased.
      2. Every token that starts with ``@`` or ``http://`` / ``https://`` is
         removed up to the next whitespace character.
      3. Every non-ASCII character is dropped.
      4. Every character in ``string.punctuation`` is deleted. This includes
         ``#``, ``_``, ``$`` and ``&``.

    Whitespace left behind by removed tokens is NOT collapsed here.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = _LINK_OR_MENTION_RE.sub('', text)
    # All non-ASCII characters go here, so the punctuation table below only
    # needs the ASCII punctuation set.
    text = _NON_ASCII_RE.sub('', text)
    return text.translate(_PUNCTUATION_TABLE)

def clean_hashtags(tweet):
    """Drop the trailing run of hashtags and un-tag hashtags inside the sentence.

    * Hashtags that form the tail of the tweet (one or more ``#tag`` tokens
      followed only by whitespace) are removed entirely, except the literal
      token ``#hashtag``, which is kept.
    * Any remaining ``#`` or ``_`` is treated as a separator: the text is split
      on it and re-joined with single spaces, so ``#ev`` becomes ``ev`` and
      ``model_y`` becomes ``model y``.

    The patterns are raw strings on purpose. In a normal string literal the
    word-boundary escape is read as a backspace character, which silently
    disabled the ``#hashtag`` exclusion.

    Note: when called from ``clean_text`` this runs after ``strip_all_entities``,
    which has already deleted every ``#`` and ``_``; there it only trims
    leading/trailing whitespace.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The text with hashtags cleaned.
    """
    trailing_hashtags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    # Remove the hashtags at the end of the tweet.
    without_trailing = " ".join(
        part.strip() for part in re.split(trailing_hashtags, tweet)
    )
    # Keep mid-sentence hashtag words, dropping only the '#' (and '_') symbols.
    return " ".join(part.strip() for part in re.split(r'#|_', without_trailing))

def filter_chars(a):
    """Blank out every space-separated word that contains ``$`` or ``&``.

    The text is split on single spaces. A word containing either character is
    replaced by an empty string (not removed), so the spaces around it are
    preserved when the words are joined back together.

    Parameters
    ----------
    a : str
        Text to filter.

    Returns
    -------
    str
        The text with offending words blanked out.
    """
    kept = [
        '' if ('$' in token or '&' in token) else token
        for token in a.split(' ')
    ]
    return ' '.join(kept)

def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (including a lone tab or newline) is left
    untouched; leading and trailing whitespace is not trimmed.

    The pattern is a raw string so the regex escapes reach the ``re`` module
    verbatim instead of being parsed as (invalid) string escapes.
    """
    return re.sub(r"\s\s+", " ", text)

def clean_text(text):
    """Run one tweet through the full cleaning pipeline and return the result.

    The steps are applied in a fixed order: emoji removal, entity/punctuation
    stripping, hashtag cleaning, ``$``/``&`` word filtering, and finally
    whitespace collapsing.
    """
    # Order is significant. strip_all_entities deletes all ASCII punctuation
    # (including '#', '_', '$' and '&') before clean_hashtags and filter_chars
    # see the text.
    # NOTE(review): the training column `Cleaned_Tweets` appears to have been
    # produced with this same order — confirm before reordering, otherwise the
    # text fed to the vectoriser will no longer match what it was fitted on.
    pipeline = (
        strip_emoji,
        strip_all_entities,
        clean_hashtags,
        filter_chars,
        remove_mult_spaces,
    )
    for step in pipeline:
        text = step(text)
    return text

# Clean every unlabelled tweet with the pipeline defined above.
raw_tweets = rest['Tweet_Content']
cleaned_tweets = raw_tweets.apply(clean_text)

## Prediction

In [17]:
# Reuse the vocabulary and IDF weights fitted on the training split.
# Only transform() is called here; nothing is refitted on the new tweets.
vectorized_tweets = clf.transform(cleaned_tweets)
tfidf_tweets = tf_transformer.transform(vectorized_tweets)

In [18]:
predicted_sentiments = model.predict(tfidf_tweets)

# Translate numeric codes into readable labels: 0 -> Negative, 1 -> Neutral,
# and any other code -> Positive.
sentiment_names = {0: 'Negative', 1: 'Neutral'}
rest['Predicted_Sentiment'] = [
    sentiment_names.get(code, 'Positive') for code in predicted_sentiments
]

In [21]:
# Tally how many tweets received each predicted label.
# NOTE(review): in the recorded run about 94% of tweets are labelled Positive, in line
# with the classifier's strong bias toward Positive on the test split (0.97 recall for
# Positive vs 0.19 / 0.07 for Negative / Neutral) — treat these labels with caution.
rest["Predicted_Sentiment"].value_counts()

Unnamed: 0_level_0,count
Predicted_Sentiment,Unnamed: 1_level_1
Positive,78385
Negative,3498
Neutral,1532


In [None]:
# Persist the tweets together with their predicted labels; the DataFrame index is not written.
# The path is relative to the notebook's working directory.
predictions_path = "../100_data/102_processed/naive_predictions.csv"
rest.to_csv(predictions_path, index=False)