# Naïve Bayes Classification and Sentiment Analysis
Dartmouth College, LING48, Spring 2024<br>
Samuel Peter (samuel.peter.25@dartmouth.edu)

In [1]:
# Import libraries
import itertools
import collections
from nltk import word_tokenize
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.metrics.scores import precision, recall, f_measure
from nltk.collocations import BigramCollocationFinder
import gdown

In [2]:
# Download NLTK's 'punkt' data package.
# NOTE(review): presumably this is the tokenizer data that word_tokenize
# (called in runNBTest) relies on — confirm for the installed NLTK version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Download the zipped review files from Google Drive and unpack them.
url = "https://drive.google.com/uc?id=1aQRJ5htEHZMmajz-HAcyJLVaYw_X-yOw"
output = 'hw4-nb-files.zip'
gdown.download(url, output, quiet=False)
# IPython shell escape: $output expands to the Python variable defined above.
# unzip flags: -j drops directory paths stored in the archive,
#              -o overwrites existing files without prompting.
!unzip -jo $output

Downloading...
From: https://drive.google.com/uc?id=1aQRJ5htEHZMmajz-HAcyJLVaYw_X-yOw
To: /content/hw4-nb-files.zip
100%|██████████| 430k/430k [00:00<00:00, 18.3MB/s]

Archive:  hw4-nb-files.zip
  inflating: amazon-neg.txt          
  inflating: amazon-pos.txt          
  inflating: google-neg.txt          
  inflating: google-pos.txt          
  inflating: mini-movie-reviews.txt  





In [4]:
# Function to construct a bag of words with both unigrams and bigrams
# https://streamhacker.com/2010/05/24/
# text-classification-sentiment-analysis-stopwords-collocations/
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
  """Build a bag-of-ngrams feature dictionary from a list of tokens.

  Every token becomes a 1-tuple key, and the `n` best bigrams of `words`
  (ranked by `score_fn`) become 2-tuple keys. Every key maps to True.

  Parameters:
    words    : sequence of tokens for one document
    score_fn : bigram association measure passed to `nbest`
    n        : maximum number of bigrams to keep

  Returns:
    dict mapping each unigram/bigram tuple to True
  """
  # Best-scoring bigrams found in this token sequence.
  finder = BigramCollocationFinder.from_words(words)
  topBigrams = finder.nbest(score_fn, n)

  # Wrap each token in a 1-tuple so unigram and bigram keys share one shape.
  unigrams = [(token,) for token in words]

  return {ngram: True for ngram in itertools.chain(unigrams, topBigrams)}

In [5]:
#Method: runNBTest - method to run a Naive Bayes Sentiment Analysis
#Parameters:  filenamePos : The file that has the positive reviews
#             filenameNeg : The file that has the negative reviews
#             cutoff      : The percentage of reviews that should be used for the training set (e.g.0.8,0.7,0.9)
#             numFeats    : The number of "most informative features" that should be presented at the end of the analysis
def _read_reviews(filename):
  """Return the reviews stored one per line in `filename`, newlines removed.

  The file is decoded as UTF-8 and is closed as soon as it has been read.
  Lines containing only whitespace are skipped so they do not turn into
  labelled instances that carry no features.
  """
  with open(filename, "r", encoding="utf-8") as reviewFile:
    return [line.rstrip("\n") for line in reviewFile if line.strip()]


def _label_reviews(reviewsText, label):
  """Tokenize each review and pair its bag-of-words features with `label`."""
  return [(bigram_word_feats(word_tokenize(text)), label) for text in reviewsText]


def runNBTest(filenamePos, filenameNeg, cutoff, numFeats):
  """Train and evaluate a Naive Bayes sentiment classifier, printing the results.

  Parameters:
    filenamePos : path of the file with the positive reviews (one per line)
    filenameNeg : path of the file with the negative reviews (one per line)
    cutoff      : fraction of each class used for training (e.g. 0.8);
                  the remainder of each class forms the test set
    numFeats    : how many "most informative features" to print at the end

  Prints the train/test sizes, accuracy, per-class precision, recall and
  F-measure, and the most informative features. Returns None.
  """
  # Read the raw review text for each class.
  posReviewsText = _read_reviews(filenamePos)
  negReviewsText = _read_reviews(filenameNeg)

  # Turn every review into a (features, label) pair:
  # (1) tokenize it, (2) extract the bag-of-words, (3) attach the class label.
  posfeats = _label_reviews(posReviewsText, 'pos')
  negfeats = _label_reviews(negReviewsText, 'neg')

  # Number of items of each class that go into the training set.
  # int() truncates so the value can be used as a slice index.
  negcutoff = int(len(negfeats) * cutoff)
  poscutoff = int(len(posfeats) * cutoff)

  # Make the training and testing sets (negative items first, then positive).
  trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
  testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
  print('train on ' + str(len(trainfeats)) + ' instances, test on ' + str(len(testfeats)) + ' instances')

  # Make a classifier based on the training features.
  classifier = NaiveBayesClassifier.train(trainfeats)

  # Map each label to the set of test-item indices that carry it:
  # goldLabels holds the true labels, predictedLabels the model's output.
  goldLabels = collections.defaultdict(set)
  predictedLabels = collections.defaultdict(set)

  for i, (feats, label) in enumerate(testfeats):
    goldLabels[label].add(i)
    observed = classifier.classify(feats)
    predictedLabels[observed].add(i)

  # Precision, recall and F-measure per class. The gold set is passed first
  # (as the reference) and the predicted set second.
  posPrecision = precision(goldLabels['pos'], predictedLabels['pos'])
  posRecall    = recall(goldLabels['pos'], predictedLabels['pos'])
  negPrecision = precision(goldLabels['neg'], predictedLabels['neg'])
  negRecall    = recall(goldLabels['neg'], predictedLabels['neg'])
  negF         = f_measure(goldLabels['neg'], predictedLabels['neg'])
  posF         = f_measure(goldLabels['pos'], predictedLabels['pos'])

  # Print the accuracy, precisions, recalls and F values.
  print('accuracy:      ' + str(nltk.classify.util.accuracy(classifier, testfeats)))
  print('pos precision: ' + str(posPrecision))
  print('pos recall:    ' + str(posRecall))
  print('neg precision: ' + str(negPrecision))
  print('neg recall:    ' + str(negRecall) )
  print('neg F-measure: ' + str(negF))
  print('pos F-measure: ' + str(posF))

  # Print the most informative features.
  classifier.show_most_informative_features(n=numFeats)

In [6]:
# Amazon reviews: train on 80% of each class, list the top 25 features.
print("=== AMAZON ===")
runNBTest(
  filenamePos="amazon-pos.txt",
  filenameNeg="amazon-neg.txt",
  cutoff=0.8,
  numFeats=25,
)

=== AMAZON ===
train on 800 instances, test on 200 instances
accuracy:      0.89
pos precision: 0.90625
pos recall:    0.87
neg precision: 0.875
neg recall:    0.91
neg F-measure: 0.892156862745098
pos F-measure: 0.8877551020408163
Most Informative Features
              ('Great',) = True              pos : neg    =     40.3 : 1.0
               ('nice',) = True              pos : neg    =     13.0 : 1.0
              ('smart',) = True              pos : neg    =     12.3 : 1.0
         ('people', ',') = True              pos : neg    =     11.7 : 1.0
              ('learn',) = True              pos : neg    =     11.0 : 1.0
      ('opportunities',) = True              pos : neg    =      9.8 : 1.0
           ('benefits',) = True              pos : neg    =      9.7 : 1.0
         ('to', 'learn') = True              pos : neg    =      9.0 : 1.0
            ('balance',) = True              neg : pos    =      8.8 : 1.0
                ('Not',) = True              neg : pos    =      7.

In [7]:
# Google reviews: train on 80% of each class, list the top 25 features.
print("=== GOOGLE ===")
runNBTest(
  filenamePos="google-pos.txt",
  filenameNeg="google-neg.txt",
  cutoff=0.8,
  numFeats=25,
)

=== GOOGLE ===
train on 800 instances, test on 200 instances
accuracy:      0.885
pos precision: 0.9230769230769231
pos recall:    0.84
neg precision: 0.8532110091743119
neg recall:    0.93
neg F-measure: 0.8899521531100479
pos F-measure: 0.8795811518324608
Most Informative Features
              ('Great',) = True              pos : neg    =     29.8 : 1.0
              ('perks',) = True              pos : neg    =     25.4 : 1.0
               ('free',) = True              pos : neg    =     21.0 : 1.0
            ('amazing',) = True              pos : neg    =     17.7 : 1.0
          ('hard', 'to') = True              neg : pos    =     15.7 : 1.0
               ('Good',) = True              pos : neg    =     15.0 : 1.0
           ('can', 'be') = True              neg : pos    =     14.2 : 1.0
          ('sometimes',) = True              neg : pos    =     13.7 : 1.0
        ('interesting',) = True              pos : neg    =     13.0 : 1.0
           ('food', ',') = True          