# Import libraries

In [None]:
import re
import string
import math

In [None]:
#from utils import process_tweet, lookup
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd
from nltk.stem import PorterStemmer
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
#import w2_unittest

nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Define process_tweet(tweet)

In [None]:
def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    '''
    stemmer = PorterStemmer()
    # A set makes the per-token stopword test O(1) instead of a scan of the
    # whole stopword list.
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # Keep tokens that are neither stopwords nor punctuation, then stem them.
    # string.punctuation is a str, so the second test is a substring check.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

In [None]:
def test_lookup(func):
    freqs = {('sad', 0): 4,
             ('happy', 1): 12,
             ('oppressed', 0): 7}
    word = 'happy'
    label = 1
    if func(freqs, word, label) == 12:
        return 'SUCCESS!!'
    return 'Failed Sanity Check!'

In [None]:
def lookup(freqs, word, label):
    '''
    Fetch the count stored for a (word, label) pair.

    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears,
           or 0 when the pair is not in freqs.
    '''
    # dict.get supplies the 0 fallback for pairs that were never counted
    return freqs.get((word, label), 0)

# Load data and split into training and testing

In [None]:
# load the positive and negative tweet corpora shipped with NLTK
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# the first 4000 tweets of each class are used for training,
# everything after that is held out for testing (validation set)
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# positives come first, negatives second, in both splits
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# labels follow the same ordering (1 = positive, 0 = negative) and are sized
# from the actual slices rather than from an assumed corpus length
train_y = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
test_y = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))

In [None]:
# Sample tweet containing a retweet marker, user handles, an emoticon,
# hashtags and a URL, used to eyeball what process_tweet keeps
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


# Define count_tweets(result, tweets, ys)
It counts how many times each (word, label) pair appears in the corpus.

In [None]:
# UNQ_C1 GRADED FUNCTION: count_tweets

def count_tweets(result, tweets, ys):
    '''
    Tally how often each (word, label) pair occurs across the given tweets.

    Input:
        result: a dictionary that accumulates the counts; it is updated in place
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: the same dictionary, mapping each (word, label) pair to its frequency
    '''
    for tweet, label in zip(tweets, ys):
        for token in process_tweet(tweet):
            key = (token, label)
            # unseen pairs start from 0, then this occurrence is counted
            result[key] = result.get(key, 0) + 1

    return result

In [None]:
# Testing your function on a tiny hand-labelled corpus

res = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
# bare expression: the notebook displays the returned dictionary
count_tweets(res, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [None]:
# Build the (word, label) frequency table over the whole training set;
# count_tweets fills `result` in place and returns it, so `freqs` is that same dict
result = {}
freqs = count_tweets(result, train_x, train_y)

# Define train_naive_bayes(freqs, train_x, train_y)

In [None]:
# Collect every distinct processed word of the training tweets, in first-seen order.
vocab = []
# Companion set: membership tests on it are O(1), whereas `word not in vocab`
# rescans the whole list for every token.
seen_words = set()
for tweet in train_x:
    for word in process_tweet(tweet):
        if word not in seen_words:
            seen_words.add(word)
            vocab.append(word)
# V is the vocabulary size
V = len(vocab)
print(vocab)

['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)', 'hey', 'jame', 'odd', ':/', 'pleas', 'call', 'contact', 'centr', '02392441234', 'abl', 'assist', 'mani', 'thank', 'listen', 'last', 'night', 'bleed', 'amaz', 'track', 'scotland', 'congrat', 'yeaaah', 'yipppi', 'accnt', 'verifi', 'rqst', 'succeed', 'got', 'blue', 'tick', 'mark', 'fb', 'profil', '15', 'day', 'one', 'irresist', 'flipkartfashionfriday', 'like', 'keep', 'love', 'custom', 'wait', 'long', 'hope', 'enjoy', 'happi', 'friday', 'lwwf', 'second', 'thought', '’', 'enough', 'time', 'dd', 'new', 'short', 'enter', 'system', 'sheep', 'must', 'buy', 'jgh', 'go', 'bayan', ':d', 'bye', 'act', 'mischiev', 'etl', 'layer', 'in-hous', 'wareh', 'app', 'katamari', 'well', '…', 'name', 'impli', ':p', 'influenc', 'big', '...', 'juici', 'selfi', 'follow', 'u', 'back', 'perfect', 'alreadi', 'know', "what'", 'great', 'opportun', 'junior', 'triathlet', 'age', '12', '13', 'gatorad', 'seri', 'get', 'entri', 'lay', 'greet', 'card', 'rang

In [None]:
# UNQ_C2 GRADED FUNCTION: train_naive_bayes

def train_naive_bayes(freqs, train_x, train_y):
    '''
    Compute the Laplace-smoothed log-likelihood ratio of every word in freqs.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets. Not read: the vocabulary is taken from
                 freqs, which already holds every counted word, so the tweets
                 are not tokenized a second time. Kept so existing calls work.
        train_y: a list of labels corresponding to the tweets (0,1). Not read;
                 kept so existing calls work.
    Output:
        loglikelihood: dictionary mapping each word to
                       log10(P(word | positive) / P(word | negative)).
                       No log prior is computed or returned.
    '''
    # Distinct words in first-seen order (dict.fromkeys de-duplicates while
    # preserving insertion order).
    vocab = list(dict.fromkeys(word for word, _ in freqs))
    V = len(vocab)

    # Total number of word occurrences per class.
    N_pos = N_neg = 0
    for (_, label), count in freqs.items():
        # a label greater than zero is positive, anything else is negative
        if label > 0:
            N_pos += count
        else:
            N_neg += count

    loglikelihood = {}
    for word in vocab:
        # Both counts are looked up independently for every word, defaulting
        # to 0, so a count from a previously visited word can never leak into
        # this one and a word seen in both classes gets both of its counts.
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        # Add-one smoothing keeps both probabilities strictly positive.
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = math.log10(p_w_pos / p_w_neg)

    return loglikelihood

# Predict

In [None]:
#make prediction
def predict(tweet):
    '''
    Print the predicted sentiment of a tweet together with its score.

    Reads the module-level `loglikelihood` dictionary, which must have been
    assigned before this function is called.

    Input:
        tweet: a string containing a tweet
    Output:
        None; the verdict and the summed log-likelihood are printed.
    '''
    word_l = process_tweet(tweet)
    p = 0
    for word in word_l:
        # words missing from the trained table contribute nothing
        if word in loglikelihood:
            p += loglikelihood[word]
    # p is a sum of log ratios, so "positive is more likely than negative"
    # (likelihood ratio > 1) corresponds to p > 0, not p > 1.
    if p > 0:
        print(f'The sentence is positive\n{p}')
    else:
        print(f'The sentence is negative\n{p}')

In [None]:
# UNQ_C3 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# Train on the full training set; predict() reads this module-level dictionary
loglikelihood = train_naive_bayes(freqs, train_x, train_y)
# number of entries in the trained table
print(len(loglikelihood))

9162


In [None]:
# Try the classifier on a single-word input
tweet = "idiot"
predict(tweet)

The sentence is negative
0.29618759622856994
