In [1]:
import pandas as pd 
import numpy as np 
from utils import process_tweet,lookup
from nltk.corpus import stopwords, twitter_samples
import nltk
import string
from nltk.tokenize import TweetTokenizer



In [2]:
# Read the positive and negative tweet collections from the NLTK twitter_samples corpus.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Hold-out split per class: the first 4000 tweets train the model,
# everything after index 4000 is kept for testing (validation set).
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Positive tweets always come first, so the label vectors below line up with them.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Labels: 1 for every positive tweet, 0 for every negative tweet.
train_y = np.concatenate([np.ones(len(train_pos)), np.zeros(len(train_neg))])
test_y = np.concatenate([np.ones(len(test_pos)), np.zeros(len(test_neg))])

# Part 1: Process the data

In [11]:
# Hand-written tweet containing a retweet marker, handles, hashtags, an emoticon
# and a URL, used to eyeball what the preprocessing helper keeps.
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# Clean the tweet first, then print the resulting token list.
cleaned_tweet = process_tweet(custom_tweet)
print(cleaned_tweet)

['hello', 'great', 'day', ':)', 'good', 'morn']


## Part 1.1: Implementing your helper functions

#### Instructions
Create a function `count_tweets()` that takes a result dictionary, a list of tweets, and the matching list of labels as input, cleans every tweet, and returns the updated dictionary.
- Each key in the dictionary is a tuple containing the stemmed word and its class label, e.g. ("happi", 1).
- Each value is the number of times that word appears with that label in the given collection of tweets (an integer).

In [14]:
# UNQ_C1
def count_tweets(result, tweets, ys):
    '''
    Accumulate (word, label) occurrence counts over a collection of tweets.

    Input:
        result: dictionary to update in place; maps (word, label) pairs to counts
        tweets: a list of tweets
        ys: a list with the sentiment label of each tweet (either 0 or 1)
    Output:
        result: the same dictionary object, with the counts from `tweets` added
    '''
    for label, text in zip(ys, tweets):
        # process_tweet turns the raw tweet into the tokens that get counted.
        for token in process_tweet(text):
            key = (token, label)
            # A pair seen for the first time starts from zero.
            result[key] = result.get(key, 0) + 1

    return result


In [15]:
# Smoke test for count_tweets: five tiny tweets, the first labelled positive (1)
# and the remaining four labelled negative (0).
tweets = [
    'i am happy',
    'i am tricked',
    'i am sad',
    'i am tired',
    'i am tired',
]
ys = [1, 0, 0, 0, 0]

# Start from an empty dictionary; the call fills it in place and, as the last
# expression of the cell, its return value is displayed.
result = {}
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

# Part 2: Train your model using Naive Bayes

In [16]:
# Frequency table of (word, label) pairs over the whole training split,
# built from a fresh dictionary; reused by the cells below.
freqs = count_tweets(result={}, tweets=train_x, ys=train_y)


In [None]:
# UNQ_C2
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Compute the Naive Bayes log prior and per-word log likelihood ratios.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets (not read by this function; kept so the
            signature stays the same for existing callers)
        train_y: a list or numpy array of labels corresponding to the tweets (0,1)
    Output:
        logprior: log(D_pos) - log(D_neg), where D_pos / D_neg are the numbers
            of positive / negative training documents
        loglikelihood: dictionary mapping every vocabulary word to
            log(P(word | pos)) - log(P(word | neg)), with add-one smoothing
    Raises:
        ValueError: if train_y does not contain at least one positive and one
            negative label (the log prior would be infinite or undefined)
    '''
    # v: number of unique words in the vocabulary
    vocab = {word for word, _ in freqs}
    v = len(vocab)

    # N_pos / N_neg: total number of word occurrences in each class
    N_pos = N_neg = 0
    for (_, label), count in freqs.items():
        if label > 0:
            N_pos += count
        else:
            N_neg += count

    # Convert to an array so the element-wise comparison below also works when
    # train_y is a plain Python list (list == 1 would just be False).
    labels = np.asarray(train_y)

    # D: number of documents, split into positive and negative ones
    D = len(labels)
    D_pos = (labels == 1).sum()
    D_neg = D - D_pos

    if D_pos == 0 or D_neg == 0:
        raise ValueError(
            "train_y must contain at least one positive and one negative label"
        )

    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # positive and negative frequency of the word
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        # add-one smoothing keeps both probabilities strictly positive, so the
        # logarithms below are always finite
        p_w_pos = (freq_pos + 1) / (N_pos + v)
        p_w_neg = (freq_neg + 1) / (N_neg + v)

        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)

    return logprior, loglikelihood



In [26]:
# Query the frequency table for the pair ('sad', 1) through the imported `lookup` helper.
# NOTE(review): `lookup` is defined in utils and not shown here; presumably it
# returns the stored count for (word, label) and 0 when the pair is missing. Confirm.
lookup(freqs,'sad',1)

5

In [27]:
# Read the same count straight from the dictionary using the explicit
# (word, label) tuple key.
freqs[('sad', 1)]

5