In [1]:
#Import Necessary Modules
import nltk
import gensim
import pandas as pd
import re
import pickle
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS

In [2]:
#Load Dataset
Apple_df = pd.read_csv('Dataset/Companies/apple.csv')
Apple_df.head()

Unnamed: 0,Datetime,Text,Username
0,Dec-2022,@dhoroiloh does this work with red apples,killizerk
1,Dec-2022,@Nothennyfr Buy Apple stock,Tone_Fusion
2,Dec-2022,@Nothennyfr Buy Apple Stock,benedictm
3,Dec-2022,Bro I just wanted my pickle and chocolate cove...,BreskiDaDon
4,Dec-2022,4. BLACK BIRD\n\nA complex crime drama that ne...,howatdk


In [3]:
Apple_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2982 entries, 0 to 2981
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Datetime  2982 non-null   object
 1   Text      2982 non-null   object
 2   Username  2982 non-null   object
dtypes: object(3)
memory usage: 70.0+ KB


In [4]:
#Drop Duplicate Rows
Apple_df = Apple_df.drop_duplicates()
Apple_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2972 entries, 0 to 2981
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Datetime  2972 non-null   object
 1   Text      2972 non-null   object
 2   Username  2972 non-null   object
dtypes: object(3)
memory usage: 92.9+ KB


In [5]:
#Filter Data by Different Time Periods
Apple_Dec22 = Apple_df[Apple_df['Datetime'] == 'Dec-2022']
Apple_Jan23 = Apple_df[Apple_df['Datetime'] == 'Jan-2023']
Apple_Feb23 = Apple_df[Apple_df['Datetime'] == 'Feb-2023']

# Tweets PreProcessing

In [6]:
#Remove URLs
Apple_Dec22 = Apple_Dec22['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))
Apple_Jan23 = Apple_Jan23['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))
Apple_Feb23 = Apple_Feb23['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))

In [7]:
#Remove HTML Reference Characters
Apple_Dec22 = Apple_Dec22.apply(lambda x: re.sub(r'&[a-z]+;', '', x))
Apple_Jan23 = Apple_Jan23.apply(lambda x: re.sub(r'&[a-z]+;', '', x))
Apple_Feb23 = Apple_Feb23.apply(lambda x: re.sub(r'&[a-z]+;', '', x))

In [8]:
#Remove Twitter Handles
Apple_Dec22 = Apple_Dec22.apply(lambda x: re.sub(r'@[^ ]+', '', x))
Apple_Jan23 = Apple_Jan23.apply(lambda x: re.sub(r'@[^ ]+', '', x))
Apple_Feb23 = Apple_Feb23.apply(lambda x: re.sub(r'@[^ ]+', '', x))

In [9]:
#Replace Emojis with Words
with open('Libraries/Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

def convert_emojis_to_word(text):
    for emot in Emoji_Dict:
        text = re.sub(r'('+emot+')', "_".join(Emoji_Dict[emot].replace(",","").replace(":","").split()), text)
    return text

Apple_Dec22 = Apple_Dec22.apply(convert_emojis_to_word)
Apple_Jan23 = Apple_Jan23.apply(convert_emojis_to_word)
Apple_Feb23 = Apple_Feb23.apply(convert_emojis_to_word)

In [10]:
#Replace EMOTICONS with Words
with open('Libraries/Emoticon_Dict.p', 'rb') as fp:
    Emoticon_Dict = pickle.load(fp)

def convert_emoticons(text):
    for emot in Emoticon_Dict:
        text = re.sub(u'('+emot+')', "_".join(Emoticon_Dict[emot].replace(",","").split()), text)
    return text

Apple_Dec22 = Apple_Dec22.apply(convert_emoticons)
Apple_Jan23 = Apple_Jan23.apply(convert_emoticons)
Apple_Feb23 = Apple_Feb23.apply(convert_emoticons)

# Training Dataset PreProcessing

In [11]:
#Load Training Dataset
training_dataset = pd.read_csv('Dataset/Training/labelled_dataset.csv', encoding='latin-1')

#Remove URLs
training_dataset = training_dataset['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))

#Remove HTML Reference Characters
training_dataset = training_dataset.apply(lambda x: re.sub(r'&[a-z]+;', '', x))

#Remove Twitter Handles
training_dataset = training_dataset.apply(lambda x: re.sub(r'@[^ ]+', '', x))

#Replace Emojis with Words
with open('Libraries/Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

def convert_emojis_to_word(text):
    for emot in Emoji_Dict:
        text = re.sub(r'('+emot+')', "_".join(Emoji_Dict[emot].replace(",","").replace(":","").split()), text)
    return text

training_dataset = training_dataset.apply(convert_emojis_to_word)

#Replace EMOTICONS with Words
with open('Libraries/Emoticon_Dict.p', 'rb') as fp:
    Emoticon_Dict = pickle.load(fp)

def convert_emoticons(text):
    for emot in Emoticon_Dict:
        text = re.sub(u'('+emot+')', "_".join(Emoticon_Dict[emot].replace(",","").split()), text)
    return text

training_dataset = training_dataset.apply(convert_emoticons)

#Export PreProcessed Dataset
training_dataset.to_csv('Dataset/Training/training_dataset.csv')

In [12]:
#Load Training Dataset
training_dataset = pd.read_csv('Dataset/Training/training_dataset.csv', encoding='latin-1')
training_dataset.head()

Unnamed: 0.1,Unnamed: 0,Text,Label
0,0,"Mr Brody, this is the Internet community, ful...",pos
1,1,. Infrastructure grew 125% in India in the fir...,pos
2,2,Labor board decision could force Google to neg...,neg
3,3,"Precisely. Equally and alternatively, is the...",neg
4,4,I clock that tea yesterday cardi was listenin...,neg


In [13]:
training_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  301 non-null    int64 
 1   Text        301 non-null    object
 2   Label       301 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.2+ KB


In [14]:
#Drop Duplicate Rows
training_dataset = training_dataset.drop_duplicates()
training_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 301 entries, 0 to 300
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  301 non-null    int64 
 1   Text        301 non-null    object
 2   Label       301 non-null    object
dtypes: int64(1), object(2)
memory usage: 9.4+ KB


# Perform Naive Bayes Classification

In [15]:
#Create Training Data
train_text = []
for index,row in training_dataset.iterrows():
    train_text.append((row['Text'], row['Label']))

#Feature Extraction Function
def extract_features(text):
    words = word_tokenize(text)
    return dict((word, True) for word in words)

#Create Feature Sets from Training Data
train_features = [(extract_features(text), label) for (text, label) in train_text]

#Train the Classifier
classifier = NaiveBayesClassifier.train(train_features)

#For Apple December 2022
test_df1 = Apple_Dec22.to_frame()
for index,row in test_df1.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df1.at[index,'sentiment'] = predicted_label
    
#For Apple January 2023
test_df2 = Apple_Jan23.to_frame()
for index,row in test_df2.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df2.at[index,'sentiment'] = predicted_label
    
#For Apple Februrary 2023
test_df3 = Apple_Feb23.to_frame()
for index,row in test_df3.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df3.at[index,'sentiment'] = predicted_label

#Export Sentiments Predicted Dataset
test_df1.to_csv("Dataset/Sentiments Predicted/AppleDec22_Sentiments.csv")
test_df2.to_csv("Dataset/Sentiments Predicted/AppleJan23_Sentiments.csv")
test_df3.to_csv("Dataset/Sentiments Predicted/AppleFeb23_Sentiments.csv")

# Analysis of Output

In [16]:
#Load Dataset
Apple_Dec22 = pd.read_csv("Dataset/Sentiments Predicted/AppleDec22_Sentiments.csv", encoding='latin-1')
Apple_Jan23 = pd.read_csv("Dataset/Sentiments Predicted/AppleJan23_Sentiments.csv", encoding='latin-1')
Apple_Feb23 = pd.read_csv("Dataset/Sentiments Predicted/AppleFeb23_Sentiments.csv", encoding='latin-1')

#Used to Store the Labels & Total Number of Tweets
labels_Apple_Dec22 = []
count_Apple_Dec22 = 0
labels_Apple_Jan23 = []
count_Apple_Jan23 = 0
labels_Apple_Feb23 = []
count_Apple_Feb23 = 0

#For Apple December 2022
for index,row in Apple_Dec22.iterrows():
    labels_Apple_Dec22.append(row['sentiment'])
    count_Apple_Dec22 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Apple_Dec22.count('pos')
num_negatives = labels_Apple_Dec22.count('neg')
num_neutral = labels_Apple_Dec22.count('neu')

#Print the Results
print('Apple December 2022')
print('Positive Labels:', num_positives/count_Apple_Dec22)
print('Negative Labels:', num_negatives/count_Apple_Dec22)
print('Neutral Labels:', num_neutral/count_Apple_Dec22)


#For Apple January 2023
for index,row in Apple_Jan23.iterrows():
    labels_Apple_Jan23.append(row['sentiment'])
    count_Apple_Jan23 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Apple_Jan23.count('pos')
num_negatives = labels_Apple_Jan23.count('neg')
num_neutral = labels_Apple_Jan23.count('neu')

#Print the Results
print('--------------------')
print('Apple January 2023')
print('Positive Labels:', num_positives/count_Apple_Jan23)
print('Negative Labels:', num_negatives/count_Apple_Jan23)
print('Neutral Labels:', num_neutral/count_Apple_Jan23)

#For Apple February 2023
for index,row in Apple_Feb23.iterrows():
    labels_Apple_Feb23.append(row['sentiment'])
    count_Apple_Feb23 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Apple_Feb23.count('pos')
num_negatives = labels_Apple_Feb23.count('neg')
num_neutral = labels_Apple_Feb23.count('neu')

#Print the Results
print('--------------------')
print('Apple February 2023')
print('Positive Labels:', num_positives/count_Apple_Feb23)
print('Negative Labels:', num_negatives/count_Apple_Feb23)
print('Neutral Labels:', num_neutral/count_Apple_Feb23)

Apple December 2022
Positive Labels: 0.3259109311740891
Negative Labels: 0.604251012145749
Neutral Labels: 0.06983805668016195
--------------------
Apple January 2023
Positive Labels: 0.3474320241691843
Negative Labels: 0.5800604229607251
Neutral Labels: 0.07250755287009064
--------------------
Apple February 2023
Positive Labels: 0.33097880928355194
Negative Labels: 0.6316851664984864
Neutral Labels: 0.03733602421796166
