In [1]:
#Import Modules
import numpy as np
import nltk
from nltk.corpus import twitter_samples
from package import process_tweets,lookup
from sklearn.metrics import accuracy_score

print("All Modules Imported Successfully!")

All Modules Imported Successfully!


In [2]:
#Load dataset
#Fetch the twitter_samples corpus on first use so the notebook also runs on a fresh environment
try:
    nltk.data.find("corpora/twitter_samples")
except LookupError:
    nltk.download("twitter_samples")

all_positive_tweets = twitter_samples.strings("positive_tweets.json")
all_negative_tweets = twitter_samples.strings("negative_tweets.json")

In [3]:
#Splitting the dataset into training and test sets
#TRAIN_SIZE tweets of each class go to training; the remainder of each class is held out for testing
TRAIN_SIZE = 4000

train_pos = all_positive_tweets[:TRAIN_SIZE]
train_neg = all_negative_tweets[:TRAIN_SIZE]
test_pos = all_positive_tweets[TRAIN_SIZE:]
test_neg = all_negative_tweets[TRAIN_SIZE:]

#Positive tweets come first in both lists; the label arrays are built in the same order
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [4]:
#Building the label columns: 1.0 for every positive tweet followed by 0.0 for every negative tweet
#Both results are (n, 1) column arrays, in the same order as train_x / test_x
train_y = np.concatenate(
    (np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))),
    axis=0,
)
test_y = np.concatenate(
    (np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))),
    axis=0,
)

In [5]:
#Creating a function to count frequencies
def count_tweets(tweets, ys):
    """Count how often each (token, label) pair occurs.

    Args:
        tweets: Iterable of raw tweet strings.
        ys: Iterable of labels, paired positionally with ``tweets``.

    Returns:
        dict mapping ``(token, label)`` to the number of times that token was
        produced by ``process_tweets`` for tweets carrying that label.
    """
    counts = {}
    for label, text in zip(ys, tweets):
        for token in process_tweets(text):
            key = (token, label)
            # First sighting starts from 0, later sightings increment.
            counts[key] = counts.get(key, 0) + 1
    return counts

In [6]:
#Checking our count_tweets function on a tiny hand-labelled sample
#(one tweet labelled 1, four labelled 0); the cell displays the resulting counts
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [7]:
#Getting the frequency distribution of our dataset
#train_y is an (n, 1) column array, so ravel() flattens it to give count_tweets one scalar label per tweet
freqs = count_tweets(train_x,train_y.ravel())

In [8]:
#Checking our frequency table
#NOTE(review): this displays the entire dictionary; consider showing only a small slice to keep the output short
freqs

{('followfriday', 1.0): 23,
 ('top', 1.0): 30,
 ('engag', 1.0): 7,
 ('member', 1.0): 14,
 ('commun', 1.0): 27,
 ('week', 1.0): 72,
 (':)', 1.0): 2960,
 ('hey', 1.0): 60,
 ('jame', 1.0): 7,
 ('odd', 1.0): 2,
 (':/', 1.0): 5,
 ('pleas', 1.0): 81,
 ('call', 1.0): 27,
 ('contact', 1.0): 4,
 ('centr', 1.0): 1,
 ('02392441234', 1.0): 1,
 ('abl', 1.0): 6,
 ('assist', 1.0): 1,
 ('mani', 1.0): 28,
 ('thank', 1.0): 522,
 ('listen', 1.0): 15,
 ('last', 1.0): 39,
 ('night', 1.0): 55,
 ('bleed', 1.0): 2,
 ('amaz', 1.0): 41,
 ('track', 1.0): 5,
 ('scotland', 1.0): 2,
 ('congrat', 1.0): 15,
 ('yeaaah', 1.0): 1,
 ('yipppi', 1.0): 1,
 ('accnt', 1.0): 2,
 ('verifi', 1.0): 2,
 ('rqst', 1.0): 1,
 ('succeed', 1.0): 1,
 ('got', 1.0): 57,
 ('blue', 1.0): 8,
 ('tick', 1.0): 1,
 ('mark', 1.0): 1,
 ('fb', 1.0): 4,
 ('profil', 1.0): 2,
 ('15', 1.0): 4,
 ('day', 1.0): 187,
 ('one', 1.0): 92,
 ('irresist', 1.0): 2,
 ('flipkartfashionfriday', 1.0): 16,
 ('like', 1.0): 187,
 ('keep', 1.0): 55,
 ('love', 1.0): 336,
 

In [9]:
#Training our naive bayes classifier
def naive_bayes_classifier(freqs, train_x, train_y):
    """Fit the Naive Bayes parameters from a (word, label) frequency table.

    Args:
        freqs: dict mapping ``(word, label)`` to an occurrence count; a label
            greater than 0 is counted as positive, anything else as negative.
        train_x: Training tweets. Not used by the computation; kept in the
            signature for existing callers.
        train_y: Iterable of training labels; entries > 0 count as positive
            documents, entries <= 0 as negative documents.

    Returns:
        ``(logprior, loglikelihood)`` where ``logprior`` is
        ``log(#positive docs / #negative docs)`` and ``loglikelihood`` maps each
        vocabulary word to ``log(P(word | pos) / P(word | neg))`` computed with
        add-one smoothing over the vocabulary size.
    """
    vocabulary = {key[0] for key in freqs}
    vocab_size = len(vocabulary)

    # Total word occurrences per class.
    n_pos = n_neg = 0
    for key, count in freqs.items():
        if key[1] > 0:
            n_pos += count
        else:
            n_neg += count

    # Document counts per class give the prior ratio.
    docs_pos = sum(1 for label in train_y if label > 0)
    docs_neg = sum(1 for label in train_y if label <= 0)
    logprior = np.log(docs_pos / docs_neg)

    loglikelihood = {}
    for word in vocabulary:
        count_pos = lookup(freqs, word, 1)
        count_neg = lookup(freqs, word, 0)

        # The +1 / +vocab_size smoothing keeps unseen (word, class) pairs from producing log(0).
        smoothed_pos = (count_pos + 1) / (n_pos + vocab_size)
        smoothed_neg = (count_neg + 1) / (n_neg + vocab_size)

        loglikelihood[word] = np.log(smoothed_pos / smoothed_neg)

    return logprior, loglikelihood

In [10]:
#Fitting the classifier: keep the log prior and the per-word log-likelihood table for the cells below
logprior, loglikelihood = naive_bayes_classifier(freqs, train_x, train_y)

In [11]:
#Creating a prediction function for our classifier
def naive_bayes_predict(tweet, logprior, loglikehood):
    """Score a tweet with a trained Naive Bayes model.

    The score is ``logprior`` plus the log-likelihood ratio of every token of
    the processed tweet that is present in the table; tokens missing from the
    table contribute nothing.

    Args:
        tweet: Raw tweet text; it is tokenized with ``process_tweets``.
        logprior: Log prior ratio of the model.
        loglikehood: Mapping from token to log-likelihood ratio. The parameter
            keeps its original spelling so existing keyword callers still work.

    Returns:
        The accumulated score.
    """
    tokens = process_tweets(tweet)
    score = logprior
    for token in tokens:
        # Read the table that was passed in rather than a global named
        # `loglikelihood`, so the function works with any model and on a fresh kernel.
        if token in loglikehood:
            score = score + loglikehood[token]
    return score

In [12]:
#Checking prediction function
#A score above 0 is what naive_bayes_test turns into label 1
tweet= "she smiled."
print(naive_bayes_predict(tweet, logprior, loglikelihood))

1.5574658811595445


In [13]:
#Creating the test function
def naive_bayes_test(test_x, test_y, logprior, loglikelihood):
    """Predict a 0/1 label for every tweet in ``test_x``.

    Args:
        test_x: Iterable of raw tweets to classify.
        test_y: True labels. Not used here; kept in the signature for
            existing callers.
        logprior: Log prior ratio passed through to ``naive_bayes_predict``.
        loglikelihood: Log-likelihood table passed through to
            ``naive_bayes_predict``.

    Returns:
        list with 1 where the tweet's score is greater than 0 and 0 otherwise,
        in the order of ``test_x``.
    """
    return [
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ]

In [14]:
#Prediction by model: one 0/1 label per tweet in test_x
y_pred = naive_bayes_test(test_x, test_y, logprior, loglikelihood)

In [15]:
#Comparing the model's predictions with the actual labels (accuracy in percent)
#accuracy_score expects (y_true, y_pred); test_y is flattened to match the 1-D list of predictions
print(round(accuracy_score(test_y.ravel(), y_pred)*100, 2))

99.55
