<a href="https://colab.research.google.com/github/sanghaimuskan/Text-Analysis/blob/master/Text_Analysis_with_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [20]:
all_positive_tweet = twitter_samples.strings('positive_tweets.json')
all_negative_tweet = twitter_samples.strings('negative_tweets.json')

test_pos = all_positive_tweet[4000:]
train_pos = all_positive_tweet[:4000]
test_neg = all_negative_tweet[4000:]
train_neg = all_negative_tweet[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))



In [5]:
def process_tweet(tweet):
  import re  #for regular expression operations
  import string     #for string operations

  from nltk.corpus import stopwords
  from nltk.stem import PorterStemmer #for stemming
  from nltk.tokenize import TweetTokenizer # for tokenizing

  process_tweet = re.sub(r'^RT[/s]','',tweet) #removes retweet
  process_tweet = re.sub(r'https?:\/\/.*[/r]*','',process_tweet)  #removes url and links
  process_tweet = re.sub(r'#','',process_tweet)   #removes #

  tokenizer = TweetTokenizer(preserve_case = False, reduce_len = True, strip_handles=True)
  tokens = tokenizer.tokenize(process_tweet)

  stopwords_english = stopwords.words('english')

  stop_pun_tweet = []

  for i in tokens:
    if(i not in stopwords_english and i not in string.punctuation):
      stop_pun_tweet.append(i)

  stem_tweets = []
  stemmer = PorterStemmer()

  for i in stop_pun_tweet :
    stem_word = stemmer.stem(i)
    stem_tweets.append(i)
  
  return stem_tweets

In [6]:
def build_freqs(tweets, ys):
  yslist = np.squeeze(ys).tolist()

  freqs={}

  for y, tweet in zip(yslist, tweets):
    for word in process_tweet(tweet):
      pair = (word, y)
      if pair in freqs:
        freqs[pair]+=1
      else:
        freqs[pair]=1
  return freqs

In [7]:
def count_tweets(result, tweets, ys): # shows count of each word in each class 
  for y, tweet in zip(ys, tweets):
    for word in process_tweet(tweet):
      pair = (word, y)

      if pair in result:
        result[pair]+=1
      else:
        result[pair]=1
  return result

## **Training model using Naive Bayes **


In [8]:
freqs = count_tweets({}, train_x, train_y)
print(freqs)



In [9]:
def test_lookup(func):
    freqs = {('sad', 0): 4,
             ('happy', 1): 12,
             ('oppressed', 0): 7}
    word = 'happy'
    label = 1
    if func(freqs, word, label) == 12:
        return 'SUCCESS!!'
    return 'Failed Sanity Check!'


def lookup(freqs, word, label):
    '''
    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears.
    '''
    n = 0  # freqs.get((word, label), 0)

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

In [10]:
def train_naive_bayes(freqs, train_x, train_y):
  loglikelihood={}
  logpriori=0

  vocab = set([pair[0] for pair in freqs.keys()])
  v = len(vocab)

  N_pos = N_neg = 0
  for pair in freqs.keys():
    if pair[1]==1.0 :
      N_pos += freqs[pair]
    else:
      N_neg += freqs[pair]
  
  D = len(train_y)
  D_pos = len(list(filter(lambda x: x>0 , train_y)))
  D_neg = len(list(filter(lambda x: x<=0, train_y)))

  logpriori = np.log(D_pos) - np.log(D_neg)

  for word in vocab:
    freq_pos = lookup(freqs, word, 1)
    freq_neg = lookup(freqs, word, 0)

    p_w_pos = (freq_pos + 1)/ (N_pos + v)
    p_w_neg = (freq_neg + 1)/ (N_neg + v)

    loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)

  return logpriori, loglikelihood
  

In [11]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)

In [12]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
   
    word_l = process_tweet(tweet)

    # initialize probability to zero
    p = 0

    p += logprior

    for word in word_l:

        if word in loglikelihood:
            
            p += loglikelihood[word]

    ### END CODE HERE ###

    return p

In [13]:
my_tweet = 'She is perfect.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
if p>0 :
  print("The tweet is positive")
elif p<0 :
  print("The tweet is negative")
else:
  print("The tweet is neutal")

The tweet is positive


### ** Testing Naive Bayes model**

In [18]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    
    accuracy = 0  
    y_hats = []

    for tweet in test_x:
       
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            
            y_hats.append(1)
        else:
           
            y_hats.append(0)

    error = np.mean(np.absolute(y_hats - test_y))

    accuracy = 1 - error


    return accuracy


In [21]:
print('accuracy of Naive Bayes is ', test_naive_bayes(test_x, test_y, logprior, loglikelihood)*100,'%')

accuracy of Naive Bayes is  99.45 %
