In [3]:
# Fetch the NLTK resources this notebook needs; nltk.download reports
# "already up-to-date" when the package is already present locally.
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples

# Run the helper notebooks so the names they define are usable in this kernel.
# NOTE(review): presumably process.ipynb provides preprocess_tweet and
# frequency.ipynb provides create_freq_dict (both are called below but are
# not defined in this notebook) — confirm.
%run process.ipynb
%run frequency.ipynb

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Sidharth\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sidharth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# The corpus holds 5000 positive and 5000 negative tweets. The first 4000 of
# each class are used for training (80%) and the remainder for testing (20%).
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

train_pos, test_pos = pos_tweets[:4000], pos_tweets[4000:]
train_neg, test_neg = neg_tweets[:4000], neg_tweets[4000:]

# Positives are placed before negatives in both splits ...
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# ... so each label column is a run of ones followed by a run of zeros.
train_y = np.vstack((np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))))
test_y = np.vstack((np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))))

print(f"No. of training tweets: {len(train_x)}")
print(f"No. of test tweets: {len(test_x)}")
print(f"train_y: {train_y.shape}")
print(f"test_y: {test_y.shape}")

No. of training tweets: 8000
No. of test tweets: 2000
train_y: (8000, 1)
test_y: (2000, 1)


In [5]:
# Build the lookup that feature_values reads with (word, 1.0) and (word, 0.0) keys.
# NOTE(review): presumably maps (word, label) -> number of occurrences in the
# training tweets carrying that label — confirm against frequency.ipynb.
freq = create_freq_dict(train_x, train_y)

In [6]:
# Show one raw training tweet followed by the output of preprocess_tweet for it.
print(train_x[5])
new_tweet = preprocess_tweet(train_x[5])
print(f"\nProcessed tweet: {new_tweet}")

@BhaktisBanter @PallaviRuhail This one is irresistible :)
#FlipkartFashionFriday http://t.co/EbZ0L2VENM

Processed tweet: ['one', 'irresist', ':)', 'flipkartfashionfriday']


In [13]:
def sigmoid(x):
    """Apply the logistic function 1 / (1 + e^(-x)) element-wise.

    Args:
        x: A scalar or NumPy array of real values.

    Returns:
        The logistic of ``x``, between 0 and 1, with the same shape as ``x``.
    """
    return 1 / (1 + np.exp(-x))


def gradient_descent(x, y, alpha, no_of_iter):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: Feature matrix of shape (m, n), one row per example.
        y: Label column of shape (m, 1).
        alpha: Learning rate applied to every update.
        no_of_iter: Number of weight updates to perform.

    Returns:
        A tuple ``(J, w)``. ``w`` is the (n, 1) weight vector after all
        updates. ``J`` is the cross-entropy cost as a Python float, evaluated
        with the weights in effect at the start of the last iteration (i.e.
        before the final update). When ``no_of_iter`` is 0 no update happens
        and ``J`` is the cost at the initial all-zero weights.
    """
    m = x.shape[0]
    # Size the weights from the data instead of assuming exactly 3 features.
    w = np.zeros((x.shape[1], 1))

    def cost(h):
        # Mean cross-entropy between labels y and predicted probabilities h.
        return -1/m * (np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1 - h)))

    J = None
    for _ in range(no_of_iter):
        h = sigmoid(np.dot(x, w))
        J = cost(h)
        w = w - (alpha/m) * np.dot(x.T, h - y)

    if J is None:
        # No iteration ran: report the cost of the untouched initial weights
        # rather than failing on an unbound name.
        J = cost(sigmoid(np.dot(x, w)))

    # The cost is a 1x1 array; .item() extracts the scalar without relying on
    # float() of a non-0-d array, which NumPy has deprecated.
    return np.asarray(J).item(), w


def feature_values(tweet):
    """Convert a raw tweet into a 1x3 feature row.

    The tweet is passed through ``preprocess_tweet``; for every resulting
    word the module-level ``freq`` lookup is read with the keys
    ``(word, 1.0)`` and ``(word, 0.0)`` (missing keys count as 0).

    Args:
        tweet: The raw tweet text.

    Returns:
        A (1, 3) float array ``[1, sum of (word, 1.0) entries,
        sum of (word, 0.0) entries]``; the leading 1 is the bias term.
    """
    row = np.zeros(3)
    row[0] = 1

    for token in preprocess_tweet(tweet):
        row[1] += freq.get((token, 1.0), 0)
        row[2] += freq.get((token, 0.0), 0)

    x = row.reshape(1, 3)
    assert x.shape == (1, 3)
    return x

In [43]:
# Training design matrix: one feature_values row per training tweet.
X = np.zeros((len(train_x), 3))
for i, tweet in enumerate(train_x):
    X[i, :] = feature_values(tweet)

# Learning rate 1e-9, 1500 iterations.
# NOTE(review): the very small learning rate presumably compensates for the
# unscaled frequency features — confirm before changing it.
J, W = gradient_descent(X, train_y, 1e-9, 1500)
print(f"Cost= {J:.6f}")
print(f"Weights: {np.squeeze(W)}")

# Accuracy on the held-out tweets. The class is decided here directly from the
# sigmoid score instead of through predict(): predict() is defined in a later
# cell, so calling it here fails with NameError when the notebook is run
# top to bottom on a fresh kernel.
y_pred = []
for tweet in test_x:
    score = sigmoid(np.dot(feature_values(tweet), W))
    y_pred.append(1.0 if score > 0.5 else 0.0)

accuracy = (np.asarray(y_pred) == np.squeeze(test_y)).sum()/len(test_x)
print(f"Accuracy= {accuracy}")

Cost= 0.242165
Weights: [ 7.25244119e-08  5.23898414e-04 -5.55171267e-04]
Accuracy= 0.995


In [18]:
def predict(tweet):
    """Classify a raw tweet with the trained weights.

    Builds the tweet's feature row with ``feature_values``, scores it with
    the module-level weights ``W`` and thresholds the sigmoid output at 0.5.

    Args:
        tweet: The raw tweet text.

    Returns:
        1 when the sigmoid score is greater than 0.5, otherwise 0.
    """
    score = sigmoid(np.dot(feature_values(tweet), W))
    return 1 if score > 0.5 else 0

In [44]:
# Classify a hand-written example. The tweet assigned here is the one passed to
# predict(); the original passed an undefined name and only worked through
# leftover kernel state.
new_tweet = 'I am feeling happy and excited'
if predict(new_tweet) == 1:
    print('Positive')
else:
    print('Negative')

Positive
