In [1]:
#Import Necessary Modules
import nltk
import gensim
import pandas as pd
import re
import pickle
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS

In [2]:
#Load Dataset
Google_df = pd.read_csv('Dataset/Companies/google.csv')
Google_df.head()

Unnamed: 0,Datetime,Text,Username
0,Dec-2022,@YaYaOregon Google gets a lot of work with som...,Canaansdad1987
1,Dec-2022,SOCIALIZED MEDIA: HUNDREDS OF ISRAEL’S UNIT 82...,Tibou33969029
2,Dec-2022,@kennyraytheman @TimRunsHisMouth If you google...,kikjuicer9
3,Dec-2022,@jrozner at least she didn’t google it on her ...,oota
4,Dec-2022,@TAILZZZZZZ @stock_con_ @GeeScottSr @RNCResear...,ThotBoutit


In [3]:
Google_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2992 entries, 0 to 2991
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Datetime  2992 non-null   object
 1   Text      2992 non-null   object
 2   Username  2992 non-null   object
dtypes: object(3)
memory usage: 70.2+ KB


In [4]:
#Drop Duplicate Rows
Google_df = Google_df.drop_duplicates()
Google_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2985 entries, 0 to 2991
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Datetime  2985 non-null   object
 1   Text      2985 non-null   object
 2   Username  2985 non-null   object
dtypes: object(3)
memory usage: 93.3+ KB


In [5]:
#Filter Data by Different Time Periods
Google_Dec22 = Google_df[Google_df['Datetime'] == 'Dec-2022']
Google_Jan23 = Google_df[Google_df['Datetime'] == 'Jan-2023']
Google_Feb23 = Google_df[Google_df['Datetime'] == 'Feb-2023']

# Tweets PreProcessing

In [6]:
#Remove URLs
Google_Dec22 = Google_Dec22['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))
Google_Jan23 = Google_Jan23['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))
Google_Feb23 = Google_Feb23['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))

In [7]:
#Remove HTML Reference Characters
Google_Dec22 = Google_Dec22.apply(lambda x: re.sub(r'&[a-z]+;', '', x))
Google_Jan23 = Google_Jan23.apply(lambda x: re.sub(r'&[a-z]+;', '', x))
Google_Feb23 = Google_Feb23.apply(lambda x: re.sub(r'&[a-z]+;', '', x))

In [8]:
#Remove Twitter Handles
Google_Dec22 = Google_Dec22.apply(lambda x: re.sub(r'@[^ ]+', '', x))
Google_Jan23 = Google_Jan23.apply(lambda x: re.sub(r'@[^ ]+', '', x))
Google_Feb23 = Google_Feb23.apply(lambda x: re.sub(r'@[^ ]+', '', x))

In [9]:
#Replace Emojis with Words
with open('Libraries/Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

def convert_emojis_to_word(text):
    for emot in Emoji_Dict:
        text = re.sub(r'('+emot+')', "_".join(Emoji_Dict[emot].replace(",","").replace(":","").split()), text)
    return text

Google_Dec22 = Google_Dec22.apply(convert_emojis_to_word)
Google_Jan23 = Google_Jan23.apply(convert_emojis_to_word)
Google_Feb23 = Google_Feb23.apply(convert_emojis_to_word)

In [10]:
#Replace EMOTICONS with Words
with open('Libraries/Emoticon_Dict.p', 'rb') as fp:
    Emoticon_Dict = pickle.load(fp)

def convert_emoticons(text):
    for emot in Emoticon_Dict:
        text = re.sub(u'('+emot+')', "_".join(Emoticon_Dict[emot].replace(",","").split()), text)
    return text

Google_Dec22 = Google_Dec22.apply(convert_emoticons)
Google_Jan23 = Google_Jan23.apply(convert_emoticons)
Google_Feb23 = Google_Feb23.apply(convert_emoticons)

# Perform Naive Bayes Classification

In [11]:
#Load Training Dataset
training_dataset = pd.read_csv('Dataset/Training/training_dataset.csv', encoding='latin-1')

#Create Training Data
train_text = []
for index,row in training_dataset.iterrows():
    train_text.append((row['Text'], row['Label']))

#Feature Extraction Function
def extract_features(text):
    words = word_tokenize(text)
    return dict((word, True) for word in words)

#Create Feature Sets from Training Data
train_features = [(extract_features(text), label) for (text, label) in train_text]

#Train the Classifier
classifier = NaiveBayesClassifier.train(train_features)

#For Google December 2022
test_df1 = Google_Dec22.to_frame()
for index,row in test_df1.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df1.at[index,'sentiment'] = predicted_label
    
#For Google January 2023
test_df2 = Google_Jan23.to_frame()
for index,row in test_df2.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df2.at[index,'sentiment'] = predicted_label
    
#For Google Februrary 2023
test_df3 = Google_Feb23.to_frame()
for index,row in test_df3.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df3.at[index,'sentiment'] = predicted_label

#Export Sentiments Predicted Dataset
test_df1.to_csv("Dataset/Sentiments Predicted/GoogleDec22_Sentiments.csv")
test_df2.to_csv("Dataset/Sentiments Predicted/GoogleJan23_Sentiments.csv")
test_df3.to_csv("Dataset/Sentiments Predicted/GoogleFeb23_Sentiments.csv")

# Analysis of Output

In [12]:
#Load Dataset
Google_Dec22 = pd.read_csv("Dataset/Sentiments Predicted/GoogleDec22_Sentiments.csv", encoding='latin-1')
Google_Jan23 = pd.read_csv("Dataset/Sentiments Predicted/GoogleJan23_Sentiments.csv", encoding='latin-1')
Google_Feb23 = pd.read_csv("Dataset/Sentiments Predicted/GoogleFeb23_Sentiments.csv", encoding='latin-1')

#Used to Store the Labels & Total Number of Tweets
labels_Google_Dec22 = []
count_Google_Dec22 = 0
labels_Google_Jan23 = []
count_Google_Jan23 = 0
labels_Google_Feb23 = []
count_Google_Feb23 = 0

#For Google December 2022
for index,row in Google_Dec22.iterrows():
    labels_Google_Dec22.append(row['sentiment'])
    count_Google_Dec22 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Google_Dec22.count('pos')
num_negatives = labels_Google_Dec22.count('neg')
num_neutral = labels_Google_Dec22.count('neu')

#Print the Results
print('Google December 2022')
print('Positive Labels:', num_positives/count_Google_Dec22)
print('Negative Labels:', num_negatives/count_Google_Dec22)
print('Neutral Labels:', num_neutral/count_Google_Dec22)


#For Google January 2023
for index,row in Google_Jan23.iterrows():
    labels_Google_Jan23.append(row['sentiment'])
    count_Google_Jan23 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Google_Jan23.count('pos')
num_negatives = labels_Google_Jan23.count('neg')
num_neutral = labels_Google_Jan23.count('neu')

#Print the Results
print('--------------------')
print('Google January 2023')
print('Positive Labels:', num_positives/count_Google_Jan23)
print('Negative Labels:', num_negatives/count_Google_Jan23)
print('Neutral Labels:', num_neutral/count_Google_Jan23)

#For Google February 2023
for index,row in Google_Feb23.iterrows():
    labels_Google_Feb23.append(row['sentiment'])
    count_Google_Feb23 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Google_Feb23.count('pos')
num_negatives = labels_Google_Feb23.count('neg')
num_neutral = labels_Google_Feb23.count('neu')

#Print the Results
print('--------------------')
print('Google February 2023')
print('Positive Labels:', num_positives/count_Google_Feb23)
print('Negative Labels:', num_negatives/count_Google_Feb23)
print('Neutral Labels:', num_neutral/count_Google_Feb23)

Google December 2022
Positive Labels: 0.17386934673366833
Negative Labels: 0.7819095477386935
Neutral Labels: 0.044221105527638194
--------------------
Google January 2023
Positive Labels: 0.16933867735470942
Negative Labels: 0.7905811623246493
Neutral Labels: 0.04008016032064128
--------------------
Google February 2023
Positive Labels: 0.1774193548387097
Negative Labels: 0.78125
Neutral Labels: 0.04133064516129032
