In [23]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
import nltk

In [4]:
# Fetch every NLTK resource this notebook reads so that it runs on a fresh
# environment: the labelled tweet corpus, plus the English stop-word list that
# process_tweet() loads through stopwords.words('english').
nltk.download('twitter_samples')
nltk.download('stopwords')
twitter_samples.fileids()

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [5]:
# Read the tweet texts of both labelled corpora, one list of strings per sentiment.
positive_tweets, negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

In [6]:
#Prepare train and test data
# Labels: 1.0 for every positive tweet followed by 0.0 for every negative tweet.
# The label vectors are sized from the loaded corpora instead of assuming exactly
# 5000 tweets per class, so tweets and labels always stay aligned.
all_tweets = positive_tweets + negative_tweets
all_sentiments = np.ones(len(positive_tweets)).tolist() + np.zeros(len(negative_tweets)).tolist()
from sklearn.model_selection import train_test_split
# A fixed random_state keeps the 80/20 split identical across runs.
train_x,test_x,train_y, test_y = train_test_split(all_tweets,all_sentiments,random_state=101, test_size=0.2)

In [7]:
import string
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
def process_tweet(tweet):
    """
    Generate cleaned, stemmed tokens from the given tweet
    input:
        tweet: A string containing a tweet
    output:
        clean_tokens: list of stemmed tokens of the tweet, with retweet markers,
            $-prefixed tokens, hyperlinks, '#' signs, English stop words and
            single punctuation characters removed
    """
    # Set membership is O(1) per token, unlike scanning the stop-word list.
    stopword_set = set(stopwords.words('english'))
    # Compare against individual punctuation characters. A substring test against
    # string.punctuation would also discard multi-character runs such as "<=".
    punctuation_chars = set(string.punctuation)
    stemmer = PorterStemmer()
    # Remove the retweet marker only when "RT" is a standalone word, so words that
    # merely contain those letters (e.g. "SMART") are left intact.
    tweet = re.sub(r'\bRT\b\s*','',tweet)
    # Remove $-prefixed tokens.
    tweet = re.sub(r'\$\S*','',tweet)
    # Remove hyperlinks.
    tweet = re.sub(r'https?:\/\/\S*','',tweet)
    # Drop only the '#' sign; the hashtag text itself is kept.
    tweet = re.sub(r'#','',tweet)
    tokenizer = TweetTokenizer(reduce_len=True,preserve_case=False, strip_handles=True)
    clean_tokens = [
        stemmer.stem(word)
        for word in tokenizer.tokenize(tweet)
        if word not in stopword_set and word not in punctuation_chars
    ]
    return clean_tokens

In [8]:
#Frequency dictionary
def build_freqs(tweets,sentiments):
    """
    Count how often each processed word appears under each sentiment label
    Input:
        tweets: A list of tweets
        sentiments: a list with the sentiment of each tweet, in the same order
    Output:
        freqs: A dictionary mapping (word,sentiment) --> number of occurrences
    """
    counts = {}
    for text, label in zip(tweets, sentiments):
        for token in process_tweet(text):
            pair = (token, label)
            if pair in counts:
                counts[pair] += 1
            else:
                counts[pair] = 1
    return counts
            

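Calculate $\log\left(P(W_{pos})/P(W_{neg})\right)$ for each word $W$ in the vocabulary, using add-one (Laplace) smoothed probabilities:

$$ P(W_{pos}) = \frac{freq_{pos} + 1}{N_{pos} + V} $$

$$ P(W_{neg}) = \frac{freq_{neg} + 1}{N_{neg} + V} $$

Here $freq_{pos}$ and $freq_{neg}$ are the number of times the word occurs in positive and negative tweets, $N_{pos}$ and $N_{neg}$ are the total number of word occurrences in positive and negative tweets, and $V$ is the number of unique words in the corpus.

$$\text{loglikelihood} = \log \left(\frac{P(W_{pos})}{P(W_{neg})} \right)$$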

In [16]:
def train_naive_bayes(freqs, train_x,train_y):
    """
    Train the model given the freqs dict, tweets and respective sentiments
    Input:
    freqs: dictionary mapping (word, sentiment) to the frequency of that pair in the corpus
    train_x: tweets to be trained on (not read here; the word counts come from freqs)
    train_y: labels of the tweets; a label equal to 1 is positive, anything else negative
    Output (in this order):
    logprior: log of (number of positive tweets / number of negative tweets); offsets
        the score when the dataset is unbalanced. It is +inf when train_y holds only
        positive labels and -inf when it holds only negative labels.
    loglikelihood: A dictionary containing the log likelihood ratio of each word
    Raises:
    ValueError: if train_y is empty, because no class prior can be estimated
    """
    # Total number of word occurrences observed in each class.
    Npos = 0
    Nneg = 0
    for key, count in freqs.items():
        if key[1] == 1:
            Npos += count
        else:
            Nneg += count
    unique_words = set(key[0] for key in freqs)
    V = len(unique_words)

    # Number of training tweets in each class.
    Dpos = sum(1 for y in train_y if y == 1)
    Dneg = len(train_y) - Dpos
    if Dpos == 0 and Dneg == 0:
        raise ValueError("train_y is empty; cannot estimate the class prior")
    if Dneg == 0:
        # Mirror the -inf produced when there are no positive tweets instead of
        # failing with a ZeroDivisionError.
        logprior = np.float64(np.inf)
    else:
        logprior = np.log(Dpos/Dneg)

    loglikelihood = {}
    for word in unique_words:
        # Add-one smoothing keeps both probabilities non-zero for words that were
        # seen in only one class.
        P_Wpos = (freqs.get((word,1),0) + 1)/(Npos+V)
        P_Wneg = (freqs.get((word,0),0) + 1)/(Nneg+V)
        loglikelihood[word] = np.log(P_Wpos/P_Wneg)
    return logprior, loglikelihood

In [13]:
# Count how often each processed word occurs under each sentiment label in the training tweets.
freqs = build_freqs(train_x, train_y)

In [17]:
# Fit the model: the class log prior plus one log-likelihood ratio per word.
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)

Score a tweet by adding the log prior to the sum of the log-likelihoods of all words in the tweet. The result is a log-odds score rather than a probability: if the score is greater than 0 the tweet is classified as positive, otherwise as negative.

In [18]:
def predict_naive_bayes(tweet,logprior,loglikelihood):
    """
    Score a tweet with the trained naive bayes model
    Input:
    tweet: raw text of the tweet to score
    logprior: trained logprior
    loglikelihood: dictionary of trained per-word log likelihoods
    Output:
    score: logprior plus the log likelihoods of the tweet's processed tokens
    """
    score = logprior
    tokens = process_tweet(tweet)
    for token in tokens:
        # Tokens that are missing from the trained dictionary contribute 0.
        score += loglikelihood.get(token,0)
    return score

In [28]:
def test_naive_bayes(test_x,test_y,logprior,loglikelihood):
    """
    test the model
    Input:
    test_x: unseen tweets for validation
    test_y: sentiments of tweets
    logprior: trained logprior
    loglikelihood: trained likelihood
    Output:
    accuracy: accuracy of the model on test set
    """
    pred = []
    for tweet in test_x:
        p = predict_naive_bayes(tweet,logprior,loglikelihood)
        if p > 0:
            pred.append(1)
        else:
            pred.append(0)
    accuracy = sum(np.squeeze(pred)==np.squeeze(test_y))/len(test_y)
    accuracy = accuracy * 100
    return accuracy

In [27]:
# Report the percentage of held-out test tweets whose predicted sentiment matches the label.
print("Accuracy of the naive bayes model is {}%".format(test_naive_bayes(test_x,test_y,logprior, loglikelihood)))

Accuracy of the naive bayes model is 99.7%
