In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pkg_resources
import nltk
import math
from nltk.probability import FreqDist
from nltk.util import ngrams
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
import re

In [2]:
#checking the list of installed libraries
installed_packages = pkg_resources.working_set
installed_packages_list = sorted(["%s==%s" % (i.key, i.version)
   for i in installed_packages])
print(installed_packages_list)

['absl-py==0.10.0', 'argon2-cffi==20.1.0', 'asn1crypto==0.24.0', 'astunparse==1.6.3', 'async-generator==1.10', 'attrs==20.3.0', 'babel==2.7.0', 'backcall==0.1.0', 'bleach==3.1.0', 'blinker==1.4', 'blis==0.4.1', 'boto3==1.10.44', 'botocore==1.13.44', 'bottleneck==1.2.1', 'cachetools==3.1.1', 'catalogue==1.0.0', 'certifi==2020.6.20', 'cffi==1.14.3', 'chardet==3.0.4', 'click==7.1.2', 'cryptography==2.7', 'cycler==0.10.0', 'cymem==2.0.3', 'cython==0.29.21', 'datasets==1.8.0', 'decorator==4.4.2', 'defusedxml==0.6.0', 'dill==0.3.1.dev0', 'docutils==0.15.2', 'en-core-web-sm==2.3.1', 'en-vectors-web-lg==2.3.0', 'entrypoints==0.3', 'filelock==3.0.12', 'flatbuffers==20210625202600', 'fsspec==2021.4.0', 'funcsigs==1.0.2', 'future==0.18.2', 'gast==0.3.3', 'gensim==3.8.3', 'gevent==1.3a2', 'google-api-core==1.14.2', 'google-auth-oauthlib==0.4.1', 'google-auth==1.6.3', 'google-cloud-core==1.0.3', 'google-cloud-storage==1.18.0', 'google-pasta==0.2.0', 'google-resumable-media==0.3.2', 'googleapis-comm

## Part A and C

In [3]:
"""
filename : The training/ testing data filename can be passed
Stopwords: Thie argument can be set to False/ True depending upon the users value
"""
def extracting_data(filename, Stopwords):
    mylines = [] 
    with open (filename, 'rt') as myfile:  # Open filename for reading
        for myline in myfile:              # For each line, read to a string,
            mylines.append(myline.split('\t',3))
    extracted_list = [l[1:3] for l in mylines] # slicing the text and label from the text data
    print("\033[1m Please find the extracted list below:")
    print(extracted_list[0])
    df = pd.DataFrame(extracted_list, columns = ['text', 'sentiment']) # Dataframe of sentiment and labels
    df['text'] = df['text'].str.lower() # converting into lowercase
    df['text'] = df['text'].str.replace(r'[^\w\s]+', '') # removing punctuations
    pruned_df = df[~(df.sentiment == 'NEUTRAL')] # pruning the neutral sentiment
    #The below lines help to remove the stopwords
    if Stopwords:
        removed_text = []
        for text in pruned_df['text']:
            words = [word for word in text.split() if word not in stopwords.words('english')]
            new_text = " ".join(words)
            removed_text.append(new_text)
        pruned_df['text'] = removed_text
    return pruned_df

In [22]:
#Training data is passed to the extracting_data fucntion to get the dataframe
training_data = extracting_data("PS1.1A_training_data.txt", False)
print("\033[1;32m The class distrbution for the training data is \n",training_data['sentiment'].value_counts())
training_data.head()

[1m Please find the extracted list below:
['This is definitely a must have if your state does not allow cell phone usage while driving.', 'POSITIVE']
[1;32m The class distrbution for the training data is 
 NEGATIVE    1282
POSITIVE    1077
Name: sentiment, dtype: int64


  df['text'] = df['text'].str.replace(r'[^\w\s]+', '')


Unnamed: 0,text,sentiment
0,this is definitely a must have if your state d...,POSITIVE
1,its a great place and i highly recommend it,POSITIVE
3,i can tell you about having my phone and elect...,NEGATIVE
4,their steaks are 100 recommended,POSITIVE
5,i was billed for thousands of dollars he said ...,NEGATIVE


In [23]:
#Testing data is passed to the extracting_data fucntion to get the dataframe
testing_data = extracting_data("PS1.1A_test_data.txt", False)
print("\033[1;32m The class distrbution for the testing data is \n",training_data['sentiment'].value_counts())
testing_data.head()

[1m Please find the extracted list below:
['The reception through this headset is excellent.', 'POSITIVE']
[1;32m The class distrbution for the testing data is 
 NEGATIVE    1282
POSITIVE    1077
Name: sentiment, dtype: int64


  df['text'] = df['text'].str.replace(r'[^\w\s]+', '')


Unnamed: 0,text,sentiment
0,the reception through this headset is excellent,POSITIVE
1,hands down my favorite italian restaurant,POSITIVE
2,the bathrooms are clean and the place itself i...,POSITIVE
3,if you havent gone here go now,POSITIVE
4,host staff were for lack of a better word bitches,NEGATIVE


## Part B

In [24]:
def train_naive_bayes_classifier(data, alpha):
    #positive and negative sentiment data are splitted
    positive_sentiment_data = data[data['sentiment'] == "POSITIVE"]
    negative_sentiment_data = data[data['sentiment'] == "NEGATIVE"]
    positive_count = len(positive_sentiment_data)
    negative_count = len(negative_sentiment_data)
    #calculating the prior probability of positive and negative sentiment
    prior_positive_proba = np.log(positive_count / (positive_count+negative_count))
    prior_negative_proba = np.log(negative_count / (positive_count+negative_count))
    #The data is converted into list
    combined_list = data.text.tolist()
    positive_list = data[data.sentiment == "POSITIVE"].text.tolist()
    negative_list = data[data.sentiment == "NEGATIVE"].text.tolist()
    # From list of list it is converted to list
    flat_combined_list = [item for item in combined_list]
    flat_positive_list = [item for item in positive_list]
    flat_negative_list = [item for item in negative_list]
    #from list it is converted to string
    combined_class = ' '.join(flat_combined_list)
    positive_class = ' '.join(flat_positive_list)
    negative_class = ' '.join(flat_negative_list)
    #The strings are tokenized
    combined_tokens = combined_class.split()
    positive_tokens = positive_class.split()
    negative_tokens = negative_class.split()
    #Unique set of vocabulary is created 
    vocabulary = set(combined_tokens)
    
    positive_dictionary = {}
    negative_dictionary = {}
    # Dictionary is created with words as the keys and their count as the values
    for word in positive_tokens:
        positive_dictionary[word] = positive_tokens.count(word)
    for word in negative_tokens:
        negative_dictionary[word] = negative_tokens.count(word)
    positive_liklihood = {}
    negative_liklihood = {}
    #The below loop is used to calculate the liklihoods of all the positive and negative sentiment for all the words in the vocabulary
    for word in vocabulary:
        pos_value = positive_dictionary.get(word)
        neg_value = negative_dictionary.get(word)
        if pos_value == None:
            pos_value = 0
        positive_liklihood[word] = np.log((pos_value+alpha)/(len(positive_tokens)+len(vocabulary)))
        if neg_value == None:
            neg_value = 0
        negative_liklihood[word] = np.log((neg_value+alpha)/(len(negative_tokens)+len(vocabulary)))
    
    return positive_liklihood, negative_liklihood, prior_positive_proba, prior_negative_proba, vocabulary

In [25]:
pos_lik, neg_lik, pos_prior, neg_prior, voc=train_naive_bayes_classifier(training_data,1)

6575 4265


In [26]:
#The below function is used for test the naive bayes
def test_naive_bayes(data,pos_lik, neg_lik, pos_prior, neg_prior, voc):
    predicted = []
    
    #The below loop traverses every row on the test data
    for doc in data['text']:
        result = np.zeros((2,1))
        text = nltk.word_tokenize(doc) #tokenized
        result[0] = pos_prior 
        result[1] = neg_prior
        # The below loop is used to calcualte the posterior probability of positive and negative senitment
        for word in text:
            if word in voc:
                result[0] = pos_lik.get(word) + result[0]
                result[1] = neg_lik.get(word) + result[1]
        # The greatest posterior probabiltiy is appended to the predicted variable
        if result[0] > result[1]:
            predicted.append("POSITIVE")
        else:
            predicted.append("NEGATIVE")
    #Calculating the metrics
    print(classification_report(data['sentiment'], predicted, target_names=["POSITIVE", "NEGATIVE"]))
    cf_matrix = confusion_matrix(data['sentiment'], predicted)
    print(cf_matrix)

In [27]:
#Calling the test_naive_Bayes function
test_naive_bayes(testing_data, pos_lik, neg_lik, pos_prior, neg_prior, voc)

              precision    recall  f1-score   support

    POSITIVE       0.75      0.80      0.78      1013
    NEGATIVE       0.79      0.73      0.76      1002

    accuracy                           0.77      2015
   macro avg       0.77      0.77      0.77      2015
weighted avg       0.77      0.77      0.77      2015

[[813 200]
 [269 733]]


## Part D

If we are **not removing** the stopwords from the text, the accuracy and F1 score are better compared to the text where the stopwords are **removed**. Why does this happen? When the stopwords have been removed the likelihood of the other words in the training data remains the same because the stopwords are independent of the other words in the training data. Now, during the testing, since there are no stopwords in the vocabulary the logarithmic addition will result in less posterior value compared to the posterior value when stopwords are present. This is happening because of stopwords appearing in the respective classes are removed.

## Part E

The fundamental drawback of the naive Bayes classification algorithm is that negation phrases like "doesn't" and "haven't" are ignored by the algorithm. To make our algorithm understand the difference between does and doesn't we can concatenate not with the doesn't (e.g.: don’t will be not_don’t), then our algorithm will take account of these words when we are calculating the likelihood, which will result in improving recall and precision.

## Bonus