# MICROSOFT

In [1]:
#Import Necessary Modules
import nltk
import gensim
import pandas as pd
import re
import pickle
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS

In [2]:
#Load Dataset
Microsoft_df = pd.read_csv('Dataset/Companies/microsoft.csv')
Microsoft_df.head()

Unnamed: 0,Datetime,Text,Username
0,Dec-2022,SOCIALIZED MEDIA: HUNDREDS OF ISRAEL’S UNIT 82...,Tibou33969029
1,Dec-2022,RT HuffPostWomen: The Microsoft mogul gave $5 ...,GrowGirlathon
2,Dec-2022,The Microsoft mogul gave $5 billion to the Bil...,LinusAlso
3,Dec-2022,Microsoft SQL/C Developer at RBC\nCome Work wi...,zobjobsCA
4,Dec-2022,Large NYC taxi fleet looking to hire a SECRETA...,radio_rusrek


In [3]:
Microsoft_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2953 entries, 0 to 2952
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Datetime  2953 non-null   object
 1   Text      2953 non-null   object
 2   Username  2953 non-null   object
dtypes: object(3)
memory usage: 69.3+ KB


In [4]:
#Drop Duplicate Rows
Microsoft_df = Microsoft_df.drop_duplicates()
Microsoft_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2937 entries, 0 to 2952
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Datetime  2937 non-null   object
 1   Text      2937 non-null   object
 2   Username  2937 non-null   object
dtypes: object(3)
memory usage: 91.8+ KB


In [5]:
#Filter Data by Different Time Periods
Microsoft_Dec22 = Microsoft_df[Microsoft_df['Datetime'] == 'Dec-2022']
Microsoft_Jan23 = Microsoft_df[Microsoft_df['Datetime'] == 'Jan-2023']
Microsoft_Feb23 = Microsoft_df[Microsoft_df['Datetime'] == 'Feb-2023']

# Tweets PreProcessing

In [6]:
#Remove URLs
Microsoft_Dec22 = Microsoft_Dec22['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))
Microsoft_Jan23 = Microsoft_Jan23['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))
Microsoft_Feb23 = Microsoft_Feb23['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))

In [7]:
#Remove HTML Reference Characters
Microsoft_Dec22 = Microsoft_Dec22.apply(lambda x: re.sub(r'&[a-z]+;', '', x))
Microsoft_Jan23 = Microsoft_Jan23.apply(lambda x: re.sub(r'&[a-z]+;', '', x))
Microsoft_Feb23 = Microsoft_Feb23.apply(lambda x: re.sub(r'&[a-z]+;', '', x))

In [8]:
#Remove Twitter Handles
Microsoft_Dec22 = Microsoft_Dec22.apply(lambda x: re.sub(r'@[^ ]+', '', x))
Microsoft_Jan23 = Microsoft_Jan23.apply(lambda x: re.sub(r'@[^ ]+', '', x))
Microsoft_Feb23 = Microsoft_Feb23.apply(lambda x: re.sub(r'@[^ ]+', '', x))

In [9]:
#Replace Emojis with Words
with open('Libraries/Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

def convert_emojis_to_word(text):
    for emot in Emoji_Dict:
        text = re.sub(r'('+emot+')', "_".join(Emoji_Dict[emot].replace(",","").replace(":","").split()), text)
    return text

Microsoft_Dec22 = Microsoft_Dec22.apply(convert_emojis_to_word)
Microsoft_Jan23 = Microsoft_Jan23.apply(convert_emojis_to_word)
Microsoft_Feb23 = Microsoft_Feb23.apply(convert_emojis_to_word)

In [10]:
#Replace EMOTICONS with Words
with open('Libraries/Emoticon_Dict.p', 'rb') as fp:
    Emoticon_Dict = pickle.load(fp)

def convert_emoticons(text):
    for emot in Emoticon_Dict:
        text = re.sub(u'('+emot+')', "_".join(Emoticon_Dict[emot].replace(",","").split()), text)
    return text

Microsoft_Dec22 = Microsoft_Dec22.apply(convert_emoticons)
Microsoft_Jan23 = Microsoft_Jan23.apply(convert_emoticons)
Microsoft_Feb23 = Microsoft_Feb23.apply(convert_emoticons)

# Perform Naive Bayes Classification

In [11]:
#Load Training Dataset
training_dataset = pd.read_csv('Dataset/Training/training_dataset.csv', encoding='latin-1')

#Create Training Data
train_text = []
for index,row in training_dataset.iterrows():
    train_text.append((row['Text'], row['Label']))

#Feature Extraction Function
def extract_features(text):
    words = word_tokenize(text)
    return dict((word, True) for word in words)

#Create Feature Sets from Training Data
train_features = [(extract_features(text), label) for (text, label) in train_text]

#Train the Classifier
classifier = NaiveBayesClassifier.train(train_features)

#For Microsoft December 2022
test_df1 = Microsoft_Dec22.to_frame()
for index,row in test_df1.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df1.at[index,'sentiment'] = predicted_label
    
#For Microsoft January 2023
test_df2 = Microsoft_Jan23.to_frame()
for index,row in test_df2.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df2.at[index,'sentiment'] = predicted_label
    
#For Microsoft Februrary 2023
test_df3 = Microsoft_Feb23.to_frame()
for index,row in test_df3.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df3.at[index,'sentiment'] = predicted_label

#Export Sentiments Predicted Dataset
test_df1.to_csv("Dataset/Sentiments Predicted/MicrosoftDec22_Sentiments.csv")
test_df2.to_csv("Dataset/Sentiments Predicted/MicrosoftJan23_Sentiments.csv")
test_df3.to_csv("Dataset/Sentiments Predicted/MicrosoftFeb23_Sentiments.csv")

# Analysis of Output

In [12]:
#Load Dataset
Microsoft_Dec22 = pd.read_csv("Dataset/Sentiments Predicted/MicrosoftDec22_Sentiments.csv", encoding='latin-1')
Microsoft_Jan23 = pd.read_csv("Dataset/Sentiments Predicted/MicrosoftJan23_Sentiments.csv", encoding='latin-1')
Microsoft_Feb23 = pd.read_csv("Dataset/Sentiments Predicted/MicrosoftFeb23_Sentiments.csv", encoding='latin-1')

#Used to Store the Labels & Total Number of Tweets
labels_Microsoft_Dec22 = []
count_Microsoft_Dec22 = 0
labels_Microsoft_Jan23 = []
count_Microsoft_Jan23 = 0
labels_Microsoft_Feb23 = []
count_Microsoft_Feb23 = 0

#For Microsoft December 2022
for index,row in Microsoft_Dec22.iterrows():
    labels_Microsoft_Dec22.append(row['sentiment'])
    count_Microsoft_Dec22 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Microsoft_Dec22.count('pos')
num_negatives = labels_Microsoft_Dec22.count('neg')
num_neutral = labels_Microsoft_Dec22.count('neu')

#Print the Results
print('Microsoft December 2022')
print('Positive Labels:', num_positives/count_Microsoft_Dec22)
print('Negative Labels:', num_negatives/count_Microsoft_Dec22)
print('Neutral Labels:', num_neutral/count_Microsoft_Dec22)


#For Microsoft January 2023
for index,row in Microsoft_Jan23.iterrows():
    labels_Microsoft_Jan23.append(row['sentiment'])
    count_Microsoft_Jan23 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Microsoft_Jan23.count('pos')
num_negatives = labels_Microsoft_Jan23.count('neg')
num_neutral = labels_Microsoft_Jan23.count('neu')

#Print the Results
print('--------------------')
print('Microsoft January 2023')
print('Positive Labels:', num_positives/count_Microsoft_Jan23)
print('Negative Labels:', num_negatives/count_Microsoft_Jan23)
print('Neutral Labels:', num_neutral/count_Microsoft_Jan23)

#For Microsoft February 2023
for index,row in Microsoft_Feb23.iterrows():
    labels_Microsoft_Feb23.append(row['sentiment'])
    count_Microsoft_Feb23 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Microsoft_Feb23.count('pos')
num_negatives = labels_Microsoft_Feb23.count('neg')
num_neutral = labels_Microsoft_Feb23.count('neu')

#Print the Results
print('--------------------')
print('Microsoft February 2023')
print('Positive Labels:', num_positives/count_Microsoft_Feb23)
print('Negative Labels:', num_negatives/count_Microsoft_Feb23)
print('Neutral Labels:', num_neutral/count_Microsoft_Feb23)

Microsoft December 2022
Positive Labels: 0.24163179916317992
Negative Labels: 0.694560669456067
Neutral Labels: 0.06380753138075314
--------------------
Microsoft January 2023
Positive Labels: 0.22121212121212122
Negative Labels: 0.7070707070707071
Neutral Labels: 0.07171717171717172
--------------------
Microsoft February 2023
Positive Labels: 0.26639757820383453
Negative Labels: 0.693239152371342
Neutral Labels: 0.04036326942482341


# Naive Bayes Classification - Calculate Accuracy of Model

In [13]:
#Load Training Dataset
training_dataset = pd.read_csv('Dataset/Training/training_dataset.csv', encoding='latin-1')

#Create Training Data
train_text = []
for index,row in training_dataset.iterrows():
    train_text.append((row['Text'], row['Label']))

#Feature Extraction Function
def extract_features(text):
    words = word_tokenize(text)
    return dict((word, True) for word in words)

#Create Feature Sets from Training Data
train_features = [(extract_features(text), label) for (text, label) in train_text]

#Train the Classifier
classifier = NaiveBayesClassifier.train(train_features)

#Test on Testing Dataset
test_df = pd.read_csv('Dataset/Comparing Accuracy/microsoft_goldtruth_cleaned.csv', encoding='latin-1')
for index,row in test_df.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df.at[index,'predicted sentiment'] = predicted_label

#Export Sentiments Predicted Dataset
test_df.to_csv("Dataset/Comparing Accuracy/Microsoft_NBClassifier_CompareAccuracy.csv")

# Calculate Accuracy of Naive Bayes Classification Model

In [14]:
#Load dataset
df = pd.read_csv('Dataset/Comparing Accuracy/Microsoft_NBClassifier_CompareAccuracy.csv', encoding='latin-1')

#Calculate Accuracy
microsoft_accuracycount = 0
for i in range(len(df)):
    if df['goldtruth'][i] == df['predicted sentiment'][i]:
        microsoft_accuracycount += 1
        
print(microsoft_accuracycount)

37


# TP/TN/FP/FN Calculation - Microsoft

In [None]:
#33 negative
#36 neutral 
#31 positive in goldtruth of microsoft

In [15]:
## for the 'positive' class, count number of TP, TN, FP and Fn

tp_microsoft_pos = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'pos' and df['goldtruth'][i] == 'pos':
        tp_microsoft_pos += 1
        
        
tn_microsoft_pos = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'pos' and df['goldtruth'][i] != 'pos':
        tn_microsoft_pos += 1
        
fp_microsoft_pos = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'pos' and df['goldtruth'][i] != 'pos':
        fp_microsoft_pos += 1
        
fn_microsoft_pos = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'pos' and df['goldtruth'][i] == 'pos':
        fn_microsoft_pos += 1
        
print(tp_microsoft_pos)
print(tn_microsoft_pos)
print(fp_microsoft_pos)
print(fn_microsoft_pos)

9
55
14
22


In [16]:
## for the 'negative' class, count number of TP, TN, FP and Fn

tp_microsoft_neg = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'neg' and df['goldtruth'][i] == 'neg':
        tp_microsoft_neg += 1
        
        
tn_microsoft_neg = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'neg' and df['goldtruth'][i] != 'neg':
        tn_microsoft_neg += 1
        
fp_microsoft_neg = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'neg' and df['goldtruth'][i] != 'neg':
        fp_microsoft_neg += 1
        
fn_microsoft_neg = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'neg' and df['goldtruth'][i] == 'neg':
        fn_microsoft_neg += 1
        
print(tp_microsoft_neg)
print(tn_microsoft_neg)
print(fp_microsoft_neg)
print(fn_microsoft_neg)

26
21
46
7


In [17]:
## for the 'neutral' class, count number of TP, TN, FP and Fn

tp_microsoft_neu = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'neu' and df['goldtruth'][i] == 'neu':
        tp_microsoft_neu += 1
        
        
tn_microsoft_neu = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'neu' and df['goldtruth'][i] != 'neu':
        tn_microsoft_neu += 1
        
fp_microsoft_neu = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'neu' and df['goldtruth'][i] != 'neu':
        fp_microsoft_neu += 1
        
fn_microsoft_neu = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'neu' and df['goldtruth'][i] == 'neu':
        fn_microsoft_neu += 1
        
print(tp_microsoft_neu)
print(tn_microsoft_neu)
print(fp_microsoft_neu)
print(fn_microsoft_neu)

2
61
3
34


# ORACLE

In [18]:
#Load Dataset
Oracle_df = pd.read_csv('Dataset/Companies/oracle.csv')
Oracle_df.head()

Unnamed: 0,Datetime,Text,Username
0,Dec-2022,"”Drywall's ""Work The Dumb Oracle"" album's warm...",Stan_Ridgway
1,Dec-2022,@ORACLE_ECHO @snakeeyes828 @CPD1617Scanner Wha...,GrizzledTexan
2,Dec-2022,"My copy of FANTASTIC FRIGHTS arrived, marking ...",lomakescomics
3,Dec-2022,"Look at any university in the US, they have cl...",NullRSJ
4,Dec-2022,"@MsJoyceTarot Ms Joyce, I’ve followed your wor...",KLVNKBRWN


In [19]:
Oracle_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2842 entries, 0 to 2841
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Datetime  2842 non-null   object
 1   Text      2842 non-null   object
 2   Username  2842 non-null   object
dtypes: object(3)
memory usage: 66.7+ KB


In [20]:
#Drop Duplicate Rows
Oracle_df = Oracle_df.drop_duplicates()
Oracle_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2780 entries, 0 to 2841
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Datetime  2780 non-null   object
 1   Text      2780 non-null   object
 2   Username  2780 non-null   object
dtypes: object(3)
memory usage: 86.9+ KB


In [21]:
#Filter Data by Different Time Periods
Oracle_Dec22 = Oracle_df[Oracle_df['Datetime'] == 'Dec-2022']
Oracle_Jan23 = Oracle_df[Oracle_df['Datetime'] == 'Jan-2023']
Oracle_Feb23 = Oracle_df[Oracle_df['Datetime'] == 'Feb-2023']

# Tweets PreProcessing

In [22]:
#Remove URLs
Oracle_Dec22 = Oracle_Dec22['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))
Oracle_Jan23 = Oracle_Jan23['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))
Oracle_Feb23 = Oracle_Feb23['Text'].apply(lambda x: re.sub(r'https?:\/\/\S+', '', x))

In [23]:
#Remove HTML Reference Characters
Oracle_Dec22 = Oracle_Dec22.apply(lambda x: re.sub(r'&[a-z]+;', '', x))
Oracle_Jan23 = Oracle_Jan23.apply(lambda x: re.sub(r'&[a-z]+;', '', x))
Oracle_Feb23 = Oracle_Feb23.apply(lambda x: re.sub(r'&[a-z]+;', '', x))

In [24]:
#Remove Twitter Handles
Oracle_Dec22 = Oracle_Dec22.apply(lambda x: re.sub(r'@[^ ]+', '', x))
Oracle_Jan23 = Oracle_Jan23.apply(lambda x: re.sub(r'@[^ ]+', '', x))
Oracle_Feb23 = Oracle_Feb23.apply(lambda x: re.sub(r'@[^ ]+', '', x))

In [25]:
#Replace Emojis with Words
with open('Libraries/Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

def convert_emojis_to_word(text):
    for emot in Emoji_Dict:
        text = re.sub(r'('+emot+')', "_".join(Emoji_Dict[emot].replace(",","").replace(":","").split()), text)
    return text

Oracle_Dec22 = Oracle_Dec22.apply(convert_emojis_to_word)
Oracle_Jan23 = Oracle_Jan23.apply(convert_emojis_to_word)
Oracle_Feb23 = Oracle_Feb23.apply(convert_emojis_to_word)

In [26]:
#Replace EMOTICONS with Words
with open('Libraries/Emoticon_Dict.p', 'rb') as fp:
    Emoticon_Dict = pickle.load(fp)

def convert_emoticons(text):
    for emot in Emoticon_Dict:
        text = re.sub(u'('+emot+')', "_".join(Emoticon_Dict[emot].replace(",","").split()), text)
    return text

Oracle_Dec22 = Oracle_Dec22.apply(convert_emoticons)
Oracle_Jan23 = Oracle_Jan23.apply(convert_emoticons)
Oracle_Feb23 = Oracle_Feb23.apply(convert_emoticons)

# Perform Naive Bayes Classification

In [27]:
#Load Training Dataset
training_dataset = pd.read_csv('Dataset/Training/training_dataset.csv', encoding='latin-1')

#Create Training Data
train_text = []
for index,row in training_dataset.iterrows():
    train_text.append((row['Text'], row['Label']))

#Feature Extraction Function
def extract_features(text):
    words = word_tokenize(text)
    return dict((word, True) for word in words)

#Create Feature Sets from Training Data
train_features = [(extract_features(text), label) for (text, label) in train_text]

#Train the Classifier
classifier = NaiveBayesClassifier.train(train_features)

#For Oracle December 2022
test_df1 = Oracle_Dec22.to_frame()
for index,row in test_df1.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df1.at[index,'sentiment'] = predicted_label
    
#For Oracle January 2023
test_df2 = Oracle_Jan23.to_frame()
for index,row in test_df2.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df2.at[index,'sentiment'] = predicted_label
    
#For Oracle Februrary 2023
test_df3 = Oracle_Feb23.to_frame()
for index,row in test_df3.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df3.at[index,'sentiment'] = predicted_label

#Export Sentiments Predicted Dataset
test_df1.to_csv("Dataset/Sentiments Predicted/OracleDec22_Sentiments.csv")
test_df2.to_csv("Dataset/Sentiments Predicted/OracleJan23_Sentiments.csv")
test_df3.to_csv("Dataset/Sentiments Predicted/OracleFeb23_Sentiments.csv")

# Analysis of Output

In [28]:
#Load Dataset
Oracle_Dec22 = pd.read_csv("Dataset/Sentiments Predicted/OracleDec22_Sentiments.csv", encoding='latin-1')
Oracle_Jan23 = pd.read_csv("Dataset/Sentiments Predicted/OracleJan23_Sentiments.csv", encoding='latin-1')
Oracle_Feb23 = pd.read_csv("Dataset/Sentiments Predicted/OracleFeb23_Sentiments.csv", encoding='latin-1')

#Used to Store the Labels & Total Number of Tweets
labels_Oracle_Dec22 = []
count_Oracle_Dec22 = 0
labels_Oracle_Jan23 = []
count_Oracle_Jan23 = 0
labels_Oracle_Feb23 = []
count_Oracle_Feb23 = 0

#For Oracle December 2022
for index,row in Oracle_Dec22.iterrows():
    labels_Oracle_Dec22.append(row['sentiment'])
    count_Oracle_Dec22 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Oracle_Dec22.count('pos')
num_negatives = labels_Oracle_Dec22.count('neg')
num_neutral = labels_Oracle_Dec22.count('neu')

#Print the Results
print('Oracle December 2022')
print('Positive Labels:', num_positives/count_Oracle_Dec22)
print('Negative Labels:', num_negatives/count_Oracle_Dec22)
print('Neutral Labels:', num_neutral/count_Oracle_Dec22)


#For Oracle January 2023
for index,row in Oracle_Jan23.iterrows():
    labels_Oracle_Jan23.append(row['sentiment'])
    count_Oracle_Jan23 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Oracle_Jan23.count('pos')
num_negatives = labels_Oracle_Jan23.count('neg')
num_neutral = labels_Oracle_Jan23.count('neu')

#Print the Results
print('--------------------')
print('Oracle January 2023')
print('Positive Labels:', num_positives/count_Oracle_Jan23)
print('Negative Labels:', num_negatives/count_Oracle_Jan23)
print('Neutral Labels:', num_neutral/count_Oracle_Jan23)

#For Oracle February 2023
for index,row in Oracle_Feb23.iterrows():
    labels_Oracle_Feb23.append(row['sentiment'])
    count_Oracle_Feb23 += 1

#Count the Number of Positive, Negative and Neutral Labels
num_positives = labels_Oracle_Feb23.count('pos')
num_negatives = labels_Oracle_Feb23.count('neg')
num_neutral = labels_Oracle_Feb23.count('neu')

#Print the Results
print('--------------------')
print('Oracle February 2023')
print('Positive Labels:', num_positives/count_Oracle_Feb23)
print('Negative Labels:', num_negatives/count_Oracle_Feb23)
print('Neutral Labels:', num_neutral/count_Oracle_Feb23)

Oracle December 2022
Positive Labels: 0.4957983193277311
Negative Labels: 0.35084033613445376
Neutral Labels: 0.15336134453781514
--------------------
Oracle January 2023
Positive Labels: 0.5264976958525346
Negative Labels: 0.3352534562211982
Neutral Labels: 0.1382488479262673
--------------------
Oracle February 2023
Positive Labels: 0.48541666666666666
Negative Labels: 0.371875
Neutral Labels: 0.14270833333333333


# Naive Bayes Classification - Calculate Accuracy of Model

In [29]:
#Load Training Dataset
training_dataset = pd.read_csv('Dataset/Training/training_dataset.csv', encoding='latin-1')

#Create Training Data
train_text = []
for index,row in training_dataset.iterrows():
    train_text.append((row['Text'], row['Label']))

#Feature Extraction Function
def extract_features(text):
    words = word_tokenize(text)
    return dict((word, True) for word in words)

#Create Feature Sets from Training Data
train_features = [(extract_features(text), label) for (text, label) in train_text]

#Train the Classifier
classifier = NaiveBayesClassifier.train(train_features)

#Test on Testing Dataset
test_df = pd.read_csv('Dataset/Comparing Accuracy/oracle_goldtruth_cleaned.csv', encoding='latin-1')
for index,row in test_df.iterrows():
    test_features = extract_features(row['Text'])
    predicted_label = classifier.classify(test_features)
    test_df.at[index,'predicted sentiment'] = predicted_label

#Export Sentiments Predicted Dataset
test_df.to_csv("Dataset/Comparing Accuracy/Oracle_NBClassifier_CompareAccuracy.csv")

# Calculate Accuracy of Naive Bayes Classification Model

In [30]:
#Load dataset
df = pd.read_csv('Dataset/Comparing Accuracy/Oracle_NBClassifier_CompareAccuracy.csv', encoding='latin-1')

#Calculate Accuracy
oracle_accuracycount = 0
for i in range(len(df)):
    if df['goldtruth'][i] == df['predicted sentiment'][i]:
        oracle_accuracycount += 1
        
print(oracle_accuracycount)

49


# Calculate Overall Accuracy of Naive Bayes Model

In [31]:
overall_accuracy = (oracle_accuracycount + microsoft_accuracycount) / 200
print(overall_accuracy)

0.43


# TP/TN/FP/FN Calculation - Oracle

In [None]:
#28 negative
#43 neutral
#29 positive 
# true labels in oracle goldtruth 

# adding up total labels across microsoft/oracle 
# negative : 28 + 33 =61

# neutral : 43 + 36 = 79

# positive: 29+ 31 = 60

# not balanced dataset so thats why need weighted. 

In [32]:
## for the 'positive' class, count number of TP, TN, FP and Fn

tp_oracle_pos = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'pos' and df['goldtruth'][i] == 'pos':
        tp_oracle_pos += 1
        
        
tn_oracle_pos = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'pos' and df['goldtruth'][i] != 'pos':
        tn_oracle_pos += 1
        
fp_oracle_pos = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'pos' and df['goldtruth'][i] != 'pos':
        fp_oracle_pos += 1
        
fn_oracle_pos = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'pos' and df['goldtruth'][i] == 'pos':
        fn_oracle_pos += 1
        
print(tp_oracle_pos)
print(tn_oracle_pos)
print(fp_oracle_pos)
print(fn_oracle_pos)

18
45
26
11


In [33]:
## for the 'negative' class, count number of TP, TN, FP and Fn

tp_oracle_neg = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'neg' and df['goldtruth'][i] == 'neg':
        tp_oracle_neg += 1
        
        
tn_oracle_neg = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'neg' and df['goldtruth'][i] != 'neg':
        tn_oracle_neg += 1
        
fp_oracle_neg = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'neg' and df['goldtruth'][i] != 'neg':
        fp_oracle_neg += 1
        
fn_oracle_neg = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'neg' and df['goldtruth'][i] == 'neg':
        fn_oracle_neg += 1
        
print(tp_oracle_neg)
print(tn_oracle_neg)
print(fp_oracle_neg)
print(fn_oracle_neg)

24
47
25
4


In [34]:
## for the 'neutral' class, count number of TP, TN, FP and Fn

tp_oracle_neu = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'neu' and df['goldtruth'][i] == 'neu':
        tp_oracle_neu += 1
        
        
tn_oracle_neu = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'neu' and df['goldtruth'][i] != 'neu':
        tn_oracle_neu += 1
        
fp_oracle_neu = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] == 'neu' and df['goldtruth'][i] != 'neu':
        fp_oracle_neu += 1
        
fn_oracle_neu = 0
for i in range(len(df)):
    if df['predicted sentiment'][i] != 'neu' and df['goldtruth'][i] == 'neu':
        fn_oracle_neu += 1
        
print(tp_oracle_neu)
print(tn_oracle_neu)
print(fp_oracle_neu)
print(fn_oracle_neu)

7
57
0
36


# Weighted Precision/Recall/F1 Score Computation - Microsoft & Oracle

In [35]:
# positive class
tp_pos = tp_oracle_pos + tp_microsoft_neg
tn_pos = tn_oracle_pos + tn_microsoft_neg
fp_pos = fp_oracle_pos + fp_microsoft_neg
fn_pos = fn_oracle_pos + fn_microsoft_neg

#negative class

tp_neg = tp_oracle_neg + tp_microsoft_neg
tn_neg = tn_oracle_neg + tn_microsoft_neg
fp_neg = fp_oracle_neg + fp_microsoft_neg
fn_neg = fn_oracle_neg + fn_microsoft_neg

#neutral class

tp_neu = tp_oracle_neu + tp_microsoft_neu
tn_neu = tn_oracle_neu + tn_microsoft_neu
fp_neu = fp_oracle_neu + fp_microsoft_neu
fn_neu = fn_oracle_neu + fn_microsoft_neu

In [36]:
# positive class

precision_pos = tp_pos / (tp_pos + fp_pos)
recall_pos = tp_pos / (tp_pos + fn_pos)

# negative class

precision_neg = tp_neg / (tp_neg + fp_neg)
recall_neg = tp_neg / (tp_neg + fn_neg)

# neutral class

precision_neu = tp_neu / (tp_neu + fp_neu)
recall_neu = tp_neu / (tp_neu + fn_neu)

In [37]:
#weights for each class since we did not have a labelled dataset 

w_Positive = 200 / (3 * 60)

w_Negative = 200 / (3 * 61)

w_Neutral = 200 / (3 * 79)

In [38]:
Macroaveraged_Precision = (w_Positive*precision_pos + w_Negative*precision_neg + w_Neutral * precision_neu) / 3

Macroaveraged_Recall = (w_Positive*recall_pos + w_Negative*recall_neg + w_Neutral*recall_neu) / 3

f1 = 2 * ( ( (Macroaveraged_Precision) * (Macroaveraged_Recall) ) / ( (Macroaveraged_Precision) + (Macroaveraged_Recall) ) )

In [39]:
print('Macroaveraged_Precision:',Macroaveraged_Precision)
print('Macroaveraged_Recall:',Macroaveraged_Recall)
print('F1 Score:', f1)

Macroaveraged_Precision: 0.5019924402384506
Macroaveraged_Recall: 0.5934951473585859
F1 Score: 0.5439223240232746
