# Natural Language Processing with classification and vector spaces

# Sentiment Analysis using Naive Bayes

Import Libraries and functions

In [None]:
import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords

import numpy as np

In [None]:
# Fetch the NLTK corpora used below: the sample tweets and the stopword lists
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# NOTE(review): 'stopwords' is already requested by the download cell above,
# so this call is redundant.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Get the sets of positive and negative tweets

In [None]:
# Load the raw tweet texts of each sentiment class from the twitter_samples corpus
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

Get some raw tweets

In [None]:
# Fix the seed so the same sample is drawn on every run
np.random.seed(1)
# Five random indices in [0, 5000), used below to pick example tweets
rand = np.random.randint(0, 5000, 5)

In [None]:
print('Positive Tweets ::')
# ANSI escape code: switch the text colour to bright green.
# NOTE: the colour is not reset afterwards.
print('\033[92m')
# Show the positive tweets at the sampled indices
for i in rand:
  print(all_positive_tweets[i])

Positive Tweets ::
[92m
@ZarlashtFaisal @Tabinda_Samar Sethi was HIGH ??? :)
Fav if awake fam :)
Just smile even your in Pain :) http://t.co/AxTiqf0xek
camillus pleaseee? :)
Why have i just painted my nails pink :) ???


In [None]:
print('Negative Tweets ::')
# ANSI escape code: switch the text colour to bright red.
# NOTE: the colour is not reset afterwards.
print('\033[91m')
# Show the negative tweets at the same sampled indices
for i in rand:
  print(all_negative_tweets[i])

Negative Tweets ::
[91m
I need a big cuddle from Lew and kisses on my face :(((( I don't want to go through this again
@kaiality too late now :(
traffic :-(
Soft defence by the best defensive team there :( #NRLTigersRoosters
@LukeBryanOnline Yayyyy!!! I hope it's not while I am knocked out by anesthesia. I will be so sad if I miss it :(


## Implementing the helper functions

**Process tweet**

In [None]:
import re
import string

from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

def process_tweet(tweet):
  """Clean, tokenize and stem a raw tweet.

  Steps, in order: strip stock tickers (``$XYZ``), a leading old-style
  ``RT`` marker, hyperlinks and ``#`` signs; tokenize with NLTK's
  ``TweetTokenizer`` (``preserve_case=False``, ``reduce_len=True``,
  ``strip_handles=True``); drop English stopwords and punctuation tokens;
  Porter-stem whatever is left.

  Args:
    tweet: The raw tweet text.

  Returns:
    A list of stemmed tokens, in the order they appear in the tweet.
  """
  stemmer = PorterStemmer()
  # A set gives O(1) membership tests in the filter below.
  stopwords_english = set(stopwords.words('english'))

  # remove the stock market tickers
  tweet = re.sub(r'\$\w*', '', tweet)

  # remove the old styles retweet text 'RT'
  tweet = re.sub(r'^RT[\s]+', '', tweet)

  # remove the hyperlinks -- only the URL itself. The earlier pattern used
  # `.*`, which also discarded every word following the link on that line.
  tweet = re.sub(r'https?://\S*', '', tweet)

  # remove the # symbol
  tweet = re.sub(r'#', '', tweet)

  # Tokenize the tweet
  tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  # Drop stopwords and punctuation, then stem the remaining tokens.
  # `string.punctuation` is a str, so `in` is a substring test: single
  # punctuation marks (and adjacent runs such as '()') are filtered out,
  # while emoticons like ':)' are kept.
  return [
      stemmer.stem(word)
      for word in tweet_tokens
      if word not in stopwords_english and word not in string.punctuation
  ]

Let's see how process_tweet performs

In [None]:
# Pick one positive tweet at random and show it before and after cleaning
tweet = all_positive_tweets[np.random.randint(0, 5000)]
print('Raw tweet : \n', tweet)
# `tweet` is rebound from the raw string to its list of processed tokens
tweet = process_tweet(tweet)
print('After processing the tweet : \n', tweet)

Raw tweet : 
 @WforWoman 5. Over 20 W kurtas! And my Mom has about half the number I have :D #WSaleLove
After processing the tweet : 
 ['5', '20', 'w', 'kurta', 'mom', 'half', 'number', ':D', 'wsalelov']


**Count tweets**

In [None]:
def count_tweets(tweets, ys):
  """Count how often each (word, label) pair occurs in a labelled corpus.

  Args:
    tweets: Iterable of raw tweet strings.
    ys: Labels aligned with ``tweets``; any array-like NumPy can flatten
      (for example shape (m,) or (m, 1)).

  Returns:
    A dict mapping ``(word, label)`` to the number of times
    ``process_tweet`` produced ``word`` for a tweet carrying ``label``.
  """
  # ravel always yields a 1-D array, so a single label still becomes a
  # one-element list (np.squeeze would collapse it to a bare scalar that
  # zip cannot iterate over).
  labels = np.ravel(ys).tolist()
  freqs = {}

  for label, tweet in zip(labels, tweets):
    for word in process_tweet(tweet):
      key = (word, label)
      freqs[key] = freqs.get(key, 0) + 1

  return freqs

**Lookup**
 

In [None]:
# Look up how many times `word` was counted under `label` in `freqs`
def lookup(freqs, word, label):
  """Return the count stored for ``(word, label)``, or 0 if the pair is absent."""
  return freqs.get((word, label), 0)

## Prepare the data

In [None]:
# splitting the data for training and testing:
# the first 4000 tweets of each class train the model, the rest test it
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]

train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[4000:]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# numpy arrays of labels, in the same order as train_x / test_x:
# 1 for every positive tweet followed by 0 for every negative tweet
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
# The positive block is sized from test_pos (it was test_neg), so the labels
# stay aligned with test_x even when the two classes differ in size.
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

## Train the model using Naive Bayes

In [None]:
# Build the (word, label) -> count dictionary from the training tweets
freqs = count_tweets(train_x, train_y)

In [None]:
def train_naive_bayes(freqs, train_x, train_y):
  """Fit a binary Naive Bayes sentiment model from (word, label) counts.

  Args:
    freqs: dict mapping ``(word, label)`` to a count. When totalling words
      per class, a label > 0 counts as positive and anything else as
      negative; per-word counts are read from the keys ``(word, 1)`` and
      ``(word, 0)``.
    train_x: Unused; kept so existing callers keep working.
    train_y: Array-like of training labels (> 0 positive, <= 0 negative).

  Returns:
    A tuple ``(logprior, loglikelihood)``. ``logprior`` is
    ``log(D_pos) - log(D_neg)``, the log ratio of positive to negative
    documents. ``loglikelihood`` maps every vocabulary word to
    ``log(P(word | pos) / P(word | neg))``, with add-one smoothing.
  """
  # V: number of unique words in the vocabulary
  vocab = {word for word, _ in freqs}
  V = len(vocab)

  # N_pos / N_neg: total number of word occurrences in each class
  N_pos = N_neg = 0
  for (_, label), count in freqs.items():
    if label > 0:
      N_pos += count
    else:
      N_neg += count

  # D_pos / D_neg: number of positive / negative documents (tweets)
  labels = np.asarray(train_y)
  D_pos = np.count_nonzero(labels > 0)
  D_neg = np.count_nonzero(labels <= 0)

  # calculate the logprior
  logprior = np.log(D_pos) - np.log(D_neg)

  loglikelihood = {}
  for word in vocab:
    freq_pos = freqs.get((word, 1), 0)
    freq_neg = freqs.get((word, 0), 0)

    # Smoothed probability of the word in each class; the +1 keeps a word
    # seen in only one class from producing a zero probability.
    p_w_pos = (freq_pos + 1) / (N_pos + V)
    p_w_neg = (freq_neg + 1) / (N_neg + V)

    loglikelihood[word] = np.log(p_w_pos / p_w_neg)

  return logprior, loglikelihood

In [None]:
# Fit the model, then print the log prior and the number of words that
# received a log-likelihood score
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9089


## Test the Naive Bayes model

In [None]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
  """Score a tweet with the trained Naive Bayes model.

  Args:
    tweet: Raw tweet text; it is cleaned with ``process_tweet`` first.
    logprior: The model's log prior.
    loglikelihood: dict mapping a word to its log-likelihood ratio.

  Returns:
    ``logprior`` plus the log-likelihood of every processed token that is
    present in ``loglikelihood``.
  """
  # Start from the prior.
  score = 0 + logprior

  for token in process_tweet(tweet):
    # Tokens missing from the model contribute nothing to the score.
    score += loglikelihood.get(token, 0)

  return score

In [None]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
  """Return the accuracy of the Naive Bayes classifier on a labelled test set.

  Args:
    test_x: Iterable of raw tweet strings.
    test_y: 0/1 labels aligned with ``test_x``; any array-like NumPy can
      flatten (a list, shape (m,) or shape (m, 1)).
    logprior: The model's log prior.
    loglikelihood: dict mapping a word to its log-likelihood ratio.

  Returns:
    The fraction of tweets whose predicted label matches ``test_y``.
  """
  # A tweet is predicted positive (1) when its score is above zero.
  y_hats = np.array([
      1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
      for tweet in test_x
  ])

  # Flatten the labels so that an (m, 1) column or a plain list is compared
  # element-wise instead of broadcasting to (m, m) or raising TypeError.
  labels = np.ravel(test_y)

  # With 0/1 labels the mean absolute difference is the error rate.
  error = np.mean(np.absolute(labels - y_hats))
  return 1 - error

In [None]:
# Evaluate on the held-out tweets and print the accuracy to four decimals
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9940


## Check the model on our own tweets 

In [None]:
# Hand-written examples to sanity-check the trained model
tweets = ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great',
          'great great great', 'great great great great', 'I am not happy :(']
for tweet in tweets:
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    # Same decision rule as test_naive_bayes: a score above 0 is positive.
    # (The previous threshold of 1 reported scores in (0, 1] as negative.)
    if p > 0:
      print('\033[92m')  # ANSI escape code: bright green
      print(f'{tweet} :: Positive sentiment ({p:.2f})')
    else:
      print('\033[91m')  # ANSI escape code: bright red
      print(f'{tweet} :: Negative Sentiment ({p:.2f})')

[92m
I am happy :: Positive sentiment (2.15)
[91m
I am bad :: Negative Sentiment (-1.29)
[92m
this movie should have been great. :: Positive sentiment (2.14)
[92m
great :: Positive sentiment (2.14)
[92m
great great :: Positive sentiment (4.28)
[92m
great great great :: Positive sentiment (6.41)
[92m
great great great great :: Positive sentiment (8.55)
[91m
I am not happy :( :: Negative Sentiment (-5.36)
