In [1]:
import pandas as pd
import numpy as np
import math
import time
import random
import re
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

In [2]:
df = pd.read_csv('Corona_train.csv')
print(df)

          ID Sentiment                                        CoronaTweet
0      22979  Positive  I see all kinds of academics already whipping ...
1       9880  Negative  @HenrySmithUK can you raise with Boris please ...
2      35761  Negative  It s a confusing odd time for the shopping pub...
3      37968  Positive  Blog Summary: The Impact of COVID-19 on the Ca...
4      19709   Neutral  ??????? ??????? ???\r\r\nWaiting in a long Que...
...      ...       ...                                                ...
37859  20253   Neutral  DOUBLE TAP ??IF YOU REMEMBER BEING EMPLOYED?\r...
37860  38926  Positive  Struggling to understand why supermarkets or f...
37861  35889  Positive  For anyone interested in the checklist on cons...
37862  40641  Positive  Do you agree with the hypothesis that Indian s...
37863  26063   Neutral  In Richmond, local boutiques and thrift stores...

[37864 rows x 3 columns]


In [6]:
def stopword_stemming(tweet_list):
    stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're", "you've", "you'll", "you'd", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "she's", "her", "hers", "herself", "it", "it's", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "that'll", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't", "should", "should've", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't", "weren", "weren't", "won", "won't", "wouldn", "wouldn't"]
    tweet_list_stop = [w for w in tweet_list if not w.lower() in stopwords]
    for i in range(0, len(tweet_list_stop)):
        if(tweet_list_stop[i].endswith("ing")):
            str_name = tweet_list_stop[i]
            tweet_list_stop[i] = str_name[ :len(str_name)-len("ing") ]
        if(tweet_list_stop[i].endswith("ed")):
            str_name = tweet_list_stop[i]
            tweet_list_stop[i] = str_name[ :len(str_name)-len("ed") ]

    return tweet_list_stop

def vocabulary(x, mode):
    if mode==0:
        vocab = {}
        for tweet in x:
            tweet_list = list()
            tweet_list = re.split('\r|\n|\s|\t|\!|\?|\.|\#|\,|\:', tweet.lower())
            tweet_list_stop = stopword_stemming(tweet_list)
            for words in tweet_list_stop:
                if(words != "" and words in vocab.keys()):
                    vocab.update({words:vocab[words]+1})
                elif(words != ""):
                    vocab[words] = 1
                else:
                    continue
            tweet_bigram = list()
            for i in range(0, len(tweet_list_stop)-1):
                tweet_bigram.append(tweet_list_stop[i]+" "+tweet_list_stop[i+1])
            for words in tweet_bigram:
                if(words != "" and words in vocab.keys()):
                    vocab.update({words:vocab[words]+1})
                elif(words != ""):
                    vocab[words] = 1
                else:
                    continue

    return vocab

vocab_pos = vocabulary(df.loc[df['Sentiment'] == 'Positive']['CoronaTweet'], 0)
vocab_neg = vocabulary(df.loc[df['Sentiment'] == 'Negative']['CoronaTweet'], 0)
vocab_neu = vocabulary(df.loc[df['Sentiment'] == 'Neutral']['CoronaTweet'], 0)
v = vocabulary(df['CoronaTweet'], 0)
print(vocab_pos)



In [7]:
no_of_words = {}

for label in ['Positive', 'Negative', 'Neutral']:
    df_label = df.loc[df['Sentiment'] == label]['CoronaTweet']
    c = 0
    for tweet in df_label:
        lst = []
        lst = stopword_stemming(re.split('\r|\n|\s|\t|\!|\?|\.|\#|\,|\:', tweet))
        if '' in lst: lst.remove('')
        c += (2*len(lst) -1)
    no_of_words[label] = c
    
print(no_of_words)

{'Positive': 929138, 'Negative': 763622, 'Neutral': 329462}


In [8]:
def log_prob_priori(y):
    positive = math.log(y.loc[y == 'Positive'].shape[0]/y.shape[0])
    negative = math.log(y.loc[y == 'Negative'].shape[0]/y.shape[0])
    neutral = math.log(y.loc[y == 'Neutral'].shape[0]/y.shape[0])
    return [positive, negative, neutral]

def log_prob_posteriori(test_tweet, label, alpha, df):
    df_label = df.loc[df['Sentiment'] == label]
    vocab = {}
    if(label == 'Positive'):
        vocab = vocab_pos
    elif(label == 'Negative'):
        vocab = vocab_neg
    else:
        vocab = vocab_neu
    final_sum = 0

    lst = stopword_stemming(re.split('\r|\n|\s|\t|\!|\?|\.|\#|\,|\:', test_tweet))
    if '' in lst: lst.remove('')
    test_tweet_bigram = list()
    for i in range(0, len(lst)-1):
        test_tweet_bigram.append(lst[i]+" "+lst[i+1])
    
    for word in test_tweet_bigram:
        if(word != '' and word in vocab.keys()):
            sum_label = alpha + vocab[word]
        elif(word != ''):
            sum_label = alpha
        else:
            continue
        #final_sum += math.log(sum_label/(df_label.shape[0] + alpha*3))
        final_sum += math.log(sum_label/(no_of_words[label] + len(v)))
    return final_sum

def NaiveBayes(df, df_test, alpha):
    #df_test = pd.read_csv('Corona_validation.csv')
    labels = ['Positive', 'Negative', 'Neutral']
    predicted_label = list()
    log_priori = log_prob_priori(df['Sentiment'])
    prob_y = list()
    for tweet in df_test['CoronaTweet']:
        max_predict = list()
        for label in labels:
            max_predict.append(math.exp(log_prob_posteriori(tweet.lower(), label, alpha, df) + log_priori[labels.index(label)]))
        prob_y.append(max_predict)
        predicted_label.append(labels[max_predict.index(max(max_predict))])
    return prob_y, predicted_label

df_test = pd.read_csv('Corona_validation.csv')
prob_y_given_x_train, prediction_train = NaiveBayes(df, df, 1)
prob_y_given_x_validation, prediction_validation = NaiveBayes(df, df_test, 1)

In [9]:
def accuracy(prediction, real):
    correct = 0
    for i in range(0, len(prediction)):
        if(prediction[i] == real[i]):
            correct += 1
    return correct/len(prediction)

acc_train = accuracy(prediction_train, df['Sentiment'].values.tolist())
acc_validation = accuracy(prediction_validation, df_test['Sentiment'].values.tolist())
print(acc_train, acc_validation)

0.9771550813437566 0.519890677194048
