In [1]:
#Import Necessary Modules
import nltk
import gensim
import pandas as pd
import re
import pickle
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS

In [2]:
#Load Dataset
IBM_df = pd.read_csv('Dataset/Companies/ibm.csv')
IBM_df.head()

Unnamed: 0,Datetime,Text,Username
0,Dec-2022,#IBMCorp $IBM Monthwise Daily High - Low Movem...,AbhipraGroup
1,Dec-2022,Technology wasn't advanced to help the people....,HealnLuvHistory
2,Dec-2022,@GilmanHill @Stephanie_Link You guys making a ...,christi10006880
3,Dec-2022,@GillianStaveley @CBD_COP15 @DenaKayeh 2/2..pr...,RebelPikeMike
4,Dec-2022,@Dave_M748 @PR0GRAMMERHUM0R The funny thing is...,nickoasdf1


In [3]:
IBM_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2347 entries, 0 to 2346
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Datetime  2347 non-null   object
 1   Text      2347 non-null   object
 2   Username  2347 non-null   object
dtypes: object(3)
memory usage: 55.1+ KB


In [4]:
#Drop Duplicate Rows
IBM_df = IBM_df.drop_duplicates()
IBM_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2322 entries, 0 to 2346
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Datetime  2322 non-null   object
 1   Text      2322 non-null   object
 2   Username  2322 non-null   object
dtypes: object(3)
memory usage: 72.6+ KB


In [5]:
#Filter Data by Different Time Periods
IBM_Dec22 = IBM_df[IBM_df['Datetime'] == 'Dec-2022']
IBM_Jan23 = IBM_df[IBM_df['Datetime'] == 'Jan-2023']
IBM_Feb23 = IBM_df[IBM_df['Datetime'] == 'Feb-2023']

# Tweets PreProcessing

In [6]:
#Remove URLs
IBM_Dec22 = IBM_Dec22['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))
IBM_Jan23 = IBM_Jan23['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))
IBM_Feb23 = IBM_Feb23['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))

In [7]:
#Remove HTML Reference Characters
IBM_Dec22 = IBM_Dec22.apply(lambda x: re.sub(r'&[a-z]+;', '', x))
IBM_Jan23 = IBM_Jan23.apply(lambda x: re.sub(r'&[a-z]+;', '', x))
IBM_Feb23 = IBM_Feb23.apply(lambda x: re.sub(r'&[a-z]+;', '', x))

In [8]:
#Remove Twitter Handles
IBM_Dec22 = IBM_Dec22.apply(lambda x: re.sub(r'@[^ ]+', '', x))
IBM_Jan23 = IBM_Jan23.apply(lambda x: re.sub(r'@[^ ]+', '', x))
IBM_Feb23 = IBM_Feb23.apply(lambda x: re.sub(r'@[^ ]+', '', x))

In [9]:
#Replace Emojis with Words
with open('Libraries/Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

def convert_emojis_to_word(text):
    for emot in Emoji_Dict:
        text = re.sub(r'('+emot+')', "_".join(Emoji_Dict[emot].replace(",","").replace(":","").split()), text)
    return text

IBM_Dec22 = IBM_Dec22.apply(convert_emojis_to_word)
IBM_Jan23 = IBM_Jan23.apply(convert_emojis_to_word)
IBM_Feb23 = IBM_Feb23.apply(convert_emojis_to_word)

In [10]:
#Replace EMOTICONS with Words
with open('Libraries/Emoticon_Dict.p', 'rb') as fp:
    Emoticon_Dict = pickle.load(fp)

def convert_emoticons(text):
    for emot in Emoticon_Dict:
        text = re.sub(u'('+emot+')', "_".join(Emoticon_Dict[emot].replace(",","").split()), text)
    return text

IBM_Dec22 = IBM_Dec22.apply(convert_emoticons)
IBM_Jan23 = IBM_Jan23.apply(convert_emoticons)
IBM_Feb23 = IBM_Feb23.apply(convert_emoticons)

# Perform Naive Bayes Classification

In [11]:
#Load Training Dataset
training_dataset = pd.read_csv('Dataset/Training/training_dataset.csv', encoding='latin-1')

#Create Training Data
train_text = []
for index,row in training_dataset.iterrows():
    train_text.append((row['Text'], row['Label']))

#Feature Extraction Function
def extract_features(text):
    words = word_tokenize(text)
    return dict((word, True) for word in words)

#Create Feature Sets from Training Data
train_features = [(extract_features(text), label) for (text, label) in train_text]

#Train the Classifier
classifier = NaiveBayesClassifier.train(train_features)

#For IBM December 2022
test_df1 = IBM_Dec22.to_frame()
for index,row in test_df1.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df1.at[index,'sentiment'] = predicted_label
    
#For IBM January 2023
test_df2 = IBM_Jan23.to_frame()
for index,row in test_df2.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df2.at[index,'sentiment'] = predicted_label
    
#For IBM Februrary 2023
test_df3 = IBM_Feb23.to_frame()
for index,row in test_df3.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df3.at[index,'sentiment'] = predicted_label

#Export Sentiments Predicted Dataset
test_df1.to_csv("Dataset/Sentiments Predicted/IBMDec22_Sentiments.csv")
test_df2.to_csv("Dataset/Sentiments Predicted/IBMJan23_Sentiments.csv")
test_df3.to_csv("Dataset/Sentiments Predicted/IBMFeb23_Sentiments.csv")

# Analysis of Output

In [12]:
#Load Dataset
IBM_Dec22 = pd.read_csv("Dataset/Sentiments Predicted/IBMDec22_Sentiments.csv", encoding='latin-1')
IBM_Jan23 = pd.read_csv("Dataset/Sentiments Predicted/IBMJan23_Sentiments.csv", encoding='latin-1')
IBM_Feb23 = pd.read_csv("Dataset/Sentiments Predicted/IBMFeb23_Sentiments.csv", encoding='latin-1')

#Used to Store the Labels & Total Number of Tweets
labels_IBM_Dec22 = []
count_IBM_Dec22 = 0
labels_IBM_Jan23 = []
count_IBM_Jan23 = 0
labels_IBM_Feb23 = []
count_IBM_Feb23 = 0

#For IBM December 2022
for index,row in IBM_Dec22.iterrows():
    labels_IBM_Dec22.append(row['sentiment'])
    count_IBM_Dec22 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_IBM_Dec22.count('pos')
num_negatives = labels_IBM_Dec22.count('neg')
num_neutral = labels_IBM_Dec22.count('neu')

#Print the Results
print('IBM December 2022')
print('Positive Labels:', num_positives/count_IBM_Dec22)
print('Negative Labels:', num_negatives/count_IBM_Dec22)
print('Neutral Labels:', num_neutral/count_IBM_Dec22)


#For IBM January 2023
for index,row in IBM_Jan23.iterrows():
    labels_IBM_Jan23.append(row['sentiment'])
    count_IBM_Jan23 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_IBM_Jan23.count('pos')
num_negatives = labels_IBM_Jan23.count('neg')
num_neutral = labels_IBM_Jan23.count('neu')

#Print the Results
print('--------------------')
print('IBM January 2023')
print('Positive Labels:', num_positives/count_IBM_Jan23)
print('Negative Labels:', num_negatives/count_IBM_Jan23)
print('Neutral Labels:', num_neutral/count_IBM_Jan23)

#For IBM February 2023
for index,row in IBM_Feb23.iterrows():
    labels_IBM_Feb23.append(row['sentiment'])
    count_IBM_Feb23 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_IBM_Feb23.count('pos')
num_negatives = labels_IBM_Feb23.count('neg')
num_neutral = labels_IBM_Feb23.count('neu')

#Print the Results
print('--------------------')
print('IBM February 2023')
print('Positive Labels:', num_positives/count_IBM_Feb23)
print('Negative Labels:', num_negatives/count_IBM_Feb23)
print('Neutral Labels:', num_neutral/count_IBM_Feb23)

IBM December 2022
Positive Labels: 0.23129251700680273
Negative Labels: 0.7108843537414966
Neutral Labels: 0.05782312925170068
--------------------
IBM January 2023
Positive Labels: 0.21428571428571427
Negative Labels: 0.648910411622276
Neutral Labels: 0.1368038740920097
--------------------
IBM February 2023
Positive Labels: 0.26762114537444937
Negative Labels: 0.6773127753303965
Neutral Labels: 0.05506607929515418
