In [1]:
import nltk
from os import getcwd

In [2]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to C:\Users\SREE
[nltk_data]     BHATTACHARYYA\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\SREE
[nltk_data]     BHATTACHARYYA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples

In [4]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [38]:
def process_tweet(tweet):
    """Processes a tweet.
        Input: a string containing a tweet
        Output: a list of words which is the processed tweet"""
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    #removing hyperlinks
    tweet = re.sub(r'hhtps?:\/\/.*[\r\n]*','',tweet)
    #remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*','',tweet)
    #remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+','',tweet)
    #remove hashtags
    #removing hash sign from a word
    tweet = re.sub(r'#','', tweet)
    #removing numbers
    tweet = re.sub(r'[0-9]','',tweet)
    #tokenizing tweets
    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweets_clean = []
    for word in tweet_tokens:
        if((word == "not" or word not in stopwords_english) and
              word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)
            
    return tweets_clean

In [6]:
def build_freqs(tweets,ys):
    """Builds frequencies for every word in a tweet
        Input: 
            tweets: a list of tweets
            ys: m x 1 array having sentiment label of each tweet
        Output: 
            freqs: dictionary mapping each pair (word,sentiment) to its frequency
    """
    #converting np array to list as zipping will need iterable
    #squeeze is also necessary or list will end up with one element
    #if ys is already a list, squeezing will be NOP
    
    yslist = np.squeeze(ys).tolist()
    
    #Starting with empty dictionary, then populating it
    freqs = {}
    for y,tweet in zip(yslist,tweets):
        for word in process_tweet(tweet):
            pair = (word,y)
            if pair in  freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

In [7]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [8]:
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos+test_neg

In [9]:
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis = 0)
test_y = np.append(np.ones((len(test_pos),1)), np.zeros((len(test_neg),1)), axis = 0)


In [10]:
print(f"train_y.shape = {train_y.shape}")
print(f"test_y.shape = {test_y.shape}")

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


In [11]:
#creating frequency dictionary 
freqs = build_freqs(train_x,train_y)

#checking the output
print(f"type of freqs is: {type(freqs)}")
print(f"Length of freqs is: {len(freqs)}")

type of freqs is: <class 'dict'>
Length of freqs is: 12779


In [12]:
#giving example of tweet processing
print("Following is an example of a positive tweet: \n",train_x[0])
print("Same tweet after processing:\n", process_tweet(train_x[0]))

Following is an example of a positive tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
Same tweet after processing:
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [13]:
#logistic regression: sigmoid function
def sigmoid(z):
    """Input: z (scalar/vector)
       Output: h :sigmoid of z """
    h = None
    h = 1/(1+np.exp(-z))
    
    return h

In [14]:
#calculating gradient descent
def gradientDescent(x,y,theta,alpha,num_iters):
    """
    Input: 
        x: matrix of features which is m x n+1
        y: corresponding labels of the input matrix x, it is mx1
        theta: weight vector, of dimension n+1 x 1
        alpha: learning rate
        num_iters: number of times whole training set is scanned
    Output:
        J: final cost
        theta: updated weights vector
    """
    m = x.shape[0]
    for i in range(num_iters):
        z = np.dot(x,theta)
        h = sigmoid(z)
        J = -(1./m)*(np.dot(np.transpose(y),np.log(h)) + np.dot(np.transpose(1-y),np.log(1-h)))
        theta = theta - (alpha/m)* (np.dot(np.transpose(x),(h-y)))
        print(f"Current cost: {J}")
    J = float(J)
    return J,theta
        

In [15]:
#feature extraction
def extract_features(tweet,freqs):
    """
    Input: 
        tweet: a single tweet (list of words)
        freqs: the dictionary corresponding to frequencies
    Output:
        x: a feature vector of dimension (1,3)
    """
    
    #processing tweet, tokenizing, stemming, removing stopwords
    word_l = process_tweet(tweet)
    
    x = np.zeros((1,3))
    
    x[0,0] = 1
    
    for word in word_l:
        if (word, 1) in freqs:
             x[0,1] += freqs[(word,1)]
        if (word,0) in freqs:
            x[0,2] += freqs[(word,0)] 
    assert(x.shape == (1,3))
    return x
        

In [16]:
#testing
tmp1 = extract_features(train_x[0], freqs)
print(tmp1)

[[1.000e+00 3.133e+03 6.100e+01]]


In [17]:
# test 2:
# check for when the words are not in the freqs dictionary
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

[[1. 0. 0.]]


In [18]:
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i,:] = extract_features(train_x[i],freqs)
    
Y = train_y

J, theta = gradientDescent(X,Y, np.zeros((3,1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t,8) for t in np.squeeze(theta)]}")

Current cost: [[0.69314718]]
Current cost: [[0.69201194]]
Current cost: [[0.69088022]]
Current cost: [[0.68975199]]
Current cost: [[0.68862724]]
Current cost: [[0.68750597]]
Current cost: [[0.68638816]]
Current cost: [[0.68527379]]
Current cost: [[0.68416287]]
Current cost: [[0.68305537]]
Current cost: [[0.68195128]]
Current cost: [[0.6808506]]
Current cost: [[0.67975331]]
Current cost: [[0.67865939]]
Current cost: [[0.67756885]]
Current cost: [[0.67648165]]
Current cost: [[0.67539781]]
Current cost: [[0.67431729]]
Current cost: [[0.67324009]]
Current cost: [[0.6721662]]
Current cost: [[0.67109561]]
Current cost: [[0.6700283]]
Current cost: [[0.66896426]]
Current cost: [[0.66790349]]
Current cost: [[0.66684597]]
Current cost: [[0.66579168]]
Current cost: [[0.66474062]]
Current cost: [[0.66369278]]
Current cost: [[0.66264813]]
Current cost: [[0.66160668]]
Current cost: [[0.66056841]]
Current cost: [[0.65953331]]
Current cost: [[0.65850137]]
Current cost: [[0.65747257]]
Current cost: [[0

Current cost: [[0.4080107]]
Current cost: [[0.40764039]]
Current cost: [[0.40727085]]
Current cost: [[0.40690209]]
Current cost: [[0.40653411]]
Current cost: [[0.40616689]]
Current cost: [[0.40580043]]
Current cost: [[0.40543474]]
Current cost: [[0.40506981]]
Current cost: [[0.40470564]]
Current cost: [[0.40434223]]
Current cost: [[0.40397957]]
Current cost: [[0.40361767]]
Current cost: [[0.40325652]]
Current cost: [[0.40289611]]
Current cost: [[0.40253645]]
Current cost: [[0.40217754]]
Current cost: [[0.40181937]]
Current cost: [[0.40146193]]
Current cost: [[0.40110524]]
Current cost: [[0.40074928]]
Current cost: [[0.40039405]]
Current cost: [[0.40003956]]
Current cost: [[0.39968579]]
Current cost: [[0.39933275]]
Current cost: [[0.39898043]]
Current cost: [[0.39862884]]
Current cost: [[0.39827797]]
Current cost: [[0.39792782]]
Current cost: [[0.39757838]]
Current cost: [[0.39722966]]
Current cost: [[0.39688165]]
Current cost: [[0.39653435]]
Current cost: [[0.39618775]]
Current cost: [

Current cost: [[0.29645009]]
Current cost: [[0.29628117]]
Current cost: [[0.29611249]]
Current cost: [[0.29594407]]
Current cost: [[0.2957759]]
Current cost: [[0.29560798]]
Current cost: [[0.29544031]]
Current cost: [[0.29527289]]
Current cost: [[0.29510572]]
Current cost: [[0.2949388]]
Current cost: [[0.29477213]]
Current cost: [[0.29460571]]
Current cost: [[0.29443953]]
Current cost: [[0.2942736]]
Current cost: [[0.29410792]]
Current cost: [[0.29394248]]
Current cost: [[0.29377729]]
Current cost: [[0.29361234]]
Current cost: [[0.29344764]]
Current cost: [[0.29328318]]
Current cost: [[0.29311897]]
Current cost: [[0.292955]]
Current cost: [[0.29279127]]
Current cost: [[0.29262778]]
Current cost: [[0.29246453]]
Current cost: [[0.29230153]]
Current cost: [[0.29213876]]
Current cost: [[0.29197623]]
Current cost: [[0.29181395]]
Current cost: [[0.2916519]]
Current cost: [[0.29149009]]
Current cost: [[0.29132851]]
Current cost: [[0.29116718]]
Current cost: [[0.29100608]]
Current cost: [[0.29

Current cost: [[0.24839922]]
Current cost: [[0.24829525]]
Current cost: [[0.24819141]]
Current cost: [[0.24808769]]
Current cost: [[0.24798409]]
Current cost: [[0.24788062]]
Current cost: [[0.24777727]]
Current cost: [[0.24767405]]
Current cost: [[0.24757095]]
Current cost: [[0.24746797]]
Current cost: [[0.24736512]]
Current cost: [[0.24726238]]
Current cost: [[0.24715978]]
Current cost: [[0.24705729]]
Current cost: [[0.24695493]]
Current cost: [[0.24685268]]
Current cost: [[0.24675057]]
Current cost: [[0.24664857]]
Current cost: [[0.24654669]]
Current cost: [[0.24644494]]
Current cost: [[0.2463433]]
Current cost: [[0.24624179]]
Current cost: [[0.24614039]]
Current cost: [[0.24603912]]
Current cost: [[0.24593797]]
Current cost: [[0.24583694]]
Current cost: [[0.24573602]]
Current cost: [[0.24563523]]
Current cost: [[0.24553456]]
Current cost: [[0.245434]]
Current cost: [[0.24533357]]
Current cost: [[0.24523325]]
Current cost: [[0.24513305]]
Current cost: [[0.24503297]]
Current cost: [[0

In [19]:
#testnig model
def predict_tweet(tweet, freqs, theta):
    """
    Input: 
        tweet: a string
        freqs: dictionary representing the frequencies
        theta: (3,1) vector of weights
    Output:
        y_pred: probability of tweet being positive/negative
    """
    x = extract_features(tweet, freqs)
    y_pred = sigmoid(np.dot(x,theta))
    return y_pred

In [20]:
# Run this cell to test your function
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))

I am happy -> 0.519277
I am bad -> 0.494346
this movie should have been great. -> 0.515981
great -> 0.516067
great great -> 0.532100
great great great -> 0.548067
great great great great -> 0.563936


In [21]:
#Checking by entering any tweet, can be personalized
my_tweet = 'I am very sad learning :('
predict_tweet(my_tweet, freqs, theta)

array([[0.10873861]])

In [22]:
#testing the logistic regression model
def test_logistic_regression(test_x, test_y, freqs, theta):
    """
    Input: 
        test_x: a list of tweets
        test_y: a mx1 vector with corresponding labels
        freqs: a dictionary with frequency of each pair
        theta: 3x1 dimension weight vector
    Output:
        accuracy: no. of tweets classified correctly/total no of tweets
    """
    pred = []
    for tweet in test_x:
        y_pred = predict_tweet(tweet, freqs, theta)
        
        if y_pred > 0.5:
            pred.append(1.0)
        else:
            pred.append(0.0)
    
    pred_arr = np.asarray(pred)
    y_arr = np.squeeze(test_y)
    accuracy = (np.sum(pred_arr == y_arr))/len(test_y)
    return accuracy

In [23]:
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


In [24]:
#Analyzing the errors
print('Label Predicted Tweet')
for x,y in zip(test_x,test_y):
    y_hat = predict_tweet(x,freqs,theta)
    
    if np.abs(y - (y_hat>0.5)) > 0:
        print(f"The tweet is: {x}")
        print(f"The processed tweet is: {process_tweet(x)}")
        print('%d\t%0.8f\t%s' % (y, y_hat, ' '.join(process_tweet(x)).encode('ascii', 'ignore')))

Label Predicted Tweet
The tweet is: @MarkBreech Not sure it would be good thing 4 my bottom daring 2 say 2 Miss B but Im gonna be so stubborn on mouth soaping ! #NotHavingit :p
The processed tweet is: ['sure', 'would', 'good', 'thing', '4', 'bottom', 'dare', '2', 'say', '2', 'miss', 'b', 'im', 'gonna', 'stubborn', 'mouth', 'soap', 'nothavingit', ':p']
1	0.48901030	b'sure would good thing 4 bottom dare 2 say 2 miss b im gonna stubborn mouth soap nothavingit :p'
The tweet is: I'm playing Brain Dots : ) #BrainDots
http://t.co/UGQzOx0huu
The processed tweet is: ["i'm", 'play', 'brain', 'dot', 'braindot', 'http://t.co/ugqzox0huu']
1	0.48418612	b"i'm play brain dot braindot http://t.co/ugqzox0huu"
The tweet is: I'm playing Brain Dots : ) #BrainDots http://t.co/aOKldo3GMj http://t.co/xWCM9qyRG5
The processed tweet is: ["i'm", 'play', 'brain', 'dot', 'braindot', 'http://t.co/aokldo3gmj', 'http://t.co/xwcm9qyrg5']
1	0.48418612	b"i'm play brain dot braindot http://t.co/aokldo3gmj http://t.co/xwc

In [41]:
#try with your own personal tweets
my_tweet = '2020 not sad @mynewname!'
print(my_tweet)
print(f"Processed tweet :{process_tweet(my_tweet)}")
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)

if y_hat>0.5:
    print('Positive statement')
else:
    print('Negative statement')

2020 not sad @mynewname!
Processed tweet :['not', 'sad']
[[0.48671625]]
Negative statement
