In [55]:

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import twitter_samples

In [42]:
# Download the NLTK twitter_samples corpus (nltk reports it as up-to-date when already present)
nltk.download('twitter_samples')
# Show the ids of the files that make up the corpus
print("Fields ",twitter_samples.fileids())

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
Fields  ['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']


In [43]:
# Read the raw tweet texts (one string per tweet) from the positive and
# negative json files of the corpus
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [44]:
# Take a look at the first five positive tweets to see what the raw text looks like
positive_tweets[:5]

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days']

In [45]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from nltk.tokenize import TweetTokenizer
#Function for preprocessing a tweet into cleaned, stemmed tokens
def process_tweet(tweet):
    """
    Clean a raw tweet and turn it into a list of stemmed tokens
    input:
        tweet: A string containing a tweet
    output:
        clean_tokens: Tokens of the processed tweet, i.e. the stemmed tokens
                      that are neither English stop words nor punctuation
    """
    # The corpus reader returns a list; a set makes the per-token
    # membership test below constant time instead of a linear scan
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    #Remove hyperlinks, Retweet "RT" text, #, $stock tickers
    tweet = re.sub(r'\$\w*','',tweet) # Removes words like $StockTicker #\w --> word character
    # Only a stand-alone "RT" marker is removed: the \b word boundaries keep
    # words that merely contain the letters (e.g. "START", "SMART") intact
    tweet = re.sub(r'\bRT\b\s*','',tweet) # \s --> space character
    tweet = re.sub(r'https?:\/\/\S*','',tweet) # Removes links # ? --> 0 or 1 occurrence of previous character #\S any character except space characters (opposite of \s)
    tweet = re.sub(r'#','',tweet) # Drop only the hash sign, the hashtag word itself is kept
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
    tokens = tokenizer.tokenize(tweet)
    # NOTE: string.punctuation is a str, so the second test is a substring check
    clean_tokens = [
        stemmer.stem(word)
        for word in tokens
        if (word not in stop_words) and (word not in string.punctuation)
    ]
    return clean_tokens


In [46]:
#Function for counting how often each token occurs with each sentiment label
def build_frequencies(tweets, sentiments):
    """
    Count how often every processed token occurs together with each sentiment
    Input:
        tweets: A list of tweets
        sentiments: a list of corresponding sentiments (paired with tweets by position)
    Output:
        freqs: A dictionary containing frequencies --> (word,sentiment):frequency
    """
    counts = {}
    for text, label in zip(tweets, sentiments):
        for stem in process_tweet(text):
            key = (stem, label)
            if key in counts:
                counts[key] += 1
            else:
                # First time this (token, sentiment) pair is seen
                counts[key] = 1
    return counts

In [47]:
#Combine positive and negative tweets and prepare train and test sets:
from sklearn.model_selection import train_test_split

all_tweets = positive_tweets + negative_tweets
# Labels are sized from the data instead of a hard-coded 5000 per class, so
# tweets and labels stay aligned if the corpus size ever differs.
# 1.0 --> positive tweet, 0.0 --> negative tweet
all_sentiments = [1.0] * len(positive_tweets) + [0.0] * len(negative_tweets)
# Fixed random_state keeps the 80/20 split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(all_tweets,all_sentiments,test_size=0.2,random_state=101)
print("length of training tweets : ",len(X_train))
print("length of testing tweets : ",len(X_test))
# Show a few training tweets together with their labels
for tweet,label in zip(X_train[:5],y_train[:5]):
    print(" {} {} ".format(tweet,label))

length of training tweets :  8000
length of testing tweets :  2000
 @Uber @Walls Why is there no icecream in Leeds? :( 0.0 
 Pengen boxing :( (at @golds_indonesia) — https://t.co/qXG4UNA4Fn 0.0 
 @vapemestoopid Ok,the first time we chat,and then i made such a joke lol .I believe you wont forget me,will u ? :) my name is @DannaQiu 1.0 
 @iFazy nhe Yar :( 0.0 
 ♛♛♛
》》》》 
I LOVE YOU SO MUCH.
I BELİEVE THAT HE WİLL FOLLOW.
PLEASE FOLLOW ME PLEASE JUSTİN @justinbieber :( x15.340
》》》》ＳＥＥ ＭＥ
♛♛♛ 0.0 


In [48]:
# Count (token, sentiment) occurrences on the training split only, so the
# held-out test tweets do not contribute to the features
freqs = build_frequencies(X_train, y_train)

**Logistic Regression:**
$$z = \theta_0 x_0 + \theta_1 x_1 + \theta_2 x_2 + \dots + \theta_N x_N$$

Sigmoid
$$ h(z) = \frac{1}{1+e^{-z}}$$ 

Cost Function
$$J(\theta) = -\frac{1}{m} \sum_{i=1}^m \left[ y^{(i)}\log (h(z(\theta)^{(i)})) + (1-y^{(i)})\log (1-h(z(\theta)^{(i)})) \right] $$

Gradient of the Cost function:
$$\nabla_{\theta_j}J(\theta) = \frac{1}{m} \sum_{i=1}^m(h^{(i)}-y^{(i)})x_j^{(i)} $$

Update the weights by subtracting a fraction (the learning rate $\alpha$) of the gradient of the cost function from the weights:
$$\theta_j = \theta_j - \alpha \times \nabla_{\theta_j}J(\theta) $$

In [49]:
def gradient_descent(x,y,theta,alpha, num_iters, verbose=True):
    """
    Perform batch gradient descent of logistic regression
    Input:
        x: feature matrix(m,n+1), n+1 --> number of features including the bias
        y: True label(m,1); a flat (m,) vector is reshaped to match the predictions
        theta: Initial weights(n+1,1)
        alpha: Learning rate
        num_iters: Number of iterations you want to train your model for
        verbose: Print the loss of every iteration when True (default, same as before)
    Output:
        J: Loss (python float) computed with the weights used in the last
           iteration, i.e. before the final update; for num_iters == 0 it is
           the loss of the initial weights
        theta: Adjusted weights after num_iters
    """
    x = np.asarray(x, dtype=float)
    theta = np.asarray(theta, dtype=float)
    m = x.shape[0]
    # Give y the same shape as the predictions. Without this a flat (m,) label
    # vector combined with (n+1,1) weights makes `h - y` broadcast to (m,m).
    y = np.asarray(y, dtype=float).reshape(np.dot(x, theta).shape)
    # Smallest offset that keeps the predictions strictly inside (0,1)
    eps = np.finfo(float).eps

    def loss_and_predictions(weights):
        # Cross-entropy loss and sigmoid predictions for the given weights
        z = np.dot(x, weights)
        h = 1/(1+np.exp(-z))
        # Clip only for the logarithms: a saturated sigmoid (exactly 0 or 1)
        # would otherwise produce log(0) and an inf/nan loss
        h_safe = np.clip(h, eps, 1-eps)
        loss = -1./m * (np.dot(y.transpose(),np.log(h_safe)) + np.dot((1-y).transpose(),np.log(1-h_safe)))
        return loss, h

    J = None
    for i in range(num_iters):
        J, h = loss_and_predictions(theta)
        theta = theta - (alpha/m) * np.dot(x.transpose(), (h - y))
        if verbose:
            print(" Loss {} in iteration {}".format(J, i))
    if J is None:
        # No iteration ran: report the loss of the untouched weights instead
        # of failing on an unbound variable
        J, _ = loss_and_predictions(theta)
    # Squeeze first: float() on an array with ndim > 0 is deprecated in NumPy
    J = float(np.squeeze(J))
    return J, theta
    

In [50]:
def extract_features(tweet,freqs):
    """
    Turn a tweet into the 3-dimensional feature vector used by the classifier
    Input:
    tweet: A string containing a tweet
    freqs: Frequency dictionary containing frequencies for each word -->(word,label):freq
    Output:
    features: feature vector(1,3) --> [bias, sum of label-1 counts, sum of label-0 counts]
    """
    features = np.zeros((1,3))
    features[0,0] = 1 #bias
    tokens = process_tweet(tweet)
    # Column 1 accumulates the counts stored under label 1, column 2 those under label 0
    for column, label in ((1, 1), (2, 0)):
        for token in tokens:
            features[0,column] += freqs.get((token,label),0)
    return features

In [51]:
# Build the (m,3) feature matrix, one row per training tweet
m = len(X_train)
X = np.zeros((m,3))
for row, train_tweet in enumerate(X_train):
    X[row,:] = extract_features(train_tweet,freqs)
# Labels as a column vector so they line up with the predictions
Y = np.array(y_train).reshape(-1,1)
print("Y shape: ",Y.shape)
# Gradient descent starting from all-zero weights
initial_theta = np.zeros((3,1))
J, theta = gradient_descent(X,Y,initial_theta,1e-9,1000)
print("Final training loss ",J)

Y shape:  (8000, 1)
 Loss [[0.69314718]] in iteration 0
 Loss [[0.6920177]] in iteration 1
 Loss [[0.6908917]] in iteration 2
 Loss [[0.68976917]] in iteration 3
 Loss [[0.68865009]] in iteration 4
 Loss [[0.68753445]] in iteration 5
 Loss [[0.68642224]] in iteration 6
 Loss [[0.68531345]] in iteration 7
 Loss [[0.68420806]] in iteration 8
 Loss [[0.68310608]] in iteration 9
 Loss [[0.68200747]] in iteration 10
 Loss [[0.68091224]] in iteration 11
 Loss [[0.67982037]] in iteration 12
 Loss [[0.67873184]] in iteration 13
 Loss [[0.67764666]] in iteration 14
 Loss [[0.6765648]] in iteration 15
 Loss [[0.67548625]] in iteration 16
 Loss [[0.67441101]] in iteration 17
 Loss [[0.67333906]] in iteration 18
 Loss [[0.67227039]] in iteration 19
 Loss [[0.67120498]] in iteration 20
 Loss [[0.67014284]] in iteration 21
 Loss [[0.66908393]] in iteration 22
 Loss [[0.66802826]] in iteration 23
 Loss [[0.66697582]] in iteration 24
 Loss [[0.66592658]] in iteration 25
 Loss [[0.66488054]] in iterati

 Loss [[0.45915729]] in iteration 314
 Loss [[0.45867605]] in iteration 315
 Loss [[0.45819592]] in iteration 316
 Loss [[0.45771691]] in iteration 317
 Loss [[0.45723901]] in iteration 318
 Loss [[0.45676222]] in iteration 319
 Loss [[0.45628653]] in iteration 320
 Loss [[0.45581194]] in iteration 321
 Loss [[0.45533845]] in iteration 322
 Loss [[0.45486605]] in iteration 323
 Loss [[0.45439473]] in iteration 324
 Loss [[0.45392451]] in iteration 325
 Loss [[0.45345536]] in iteration 326
 Loss [[0.4529873]] in iteration 327
 Loss [[0.45252031]] in iteration 328
 Loss [[0.45205439]] in iteration 329
 Loss [[0.45158953]] in iteration 330
 Loss [[0.45112575]] in iteration 331
 Loss [[0.45066302]] in iteration 332
 Loss [[0.45020135]] in iteration 333
 Loss [[0.44974074]] in iteration 334
 Loss [[0.44928118]] in iteration 335
 Loss [[0.44882266]] in iteration 336
 Loss [[0.44836519]] in iteration 337
 Loss [[0.44790877]] in iteration 338
 Loss [[0.44745337]] in iteration 339
 Loss [[0.446

 Loss [[0.34222967]] in iteration 655
 Loss [[0.34198747]] in iteration 656
 Loss [[0.34174569]] in iteration 657
 Loss [[0.34150434]] in iteration 658
 Loss [[0.3412634]] in iteration 659
 Loss [[0.34102289]] in iteration 660
 Loss [[0.34078279]] in iteration 661
 Loss [[0.34054311]] in iteration 662
 Loss [[0.34030385]] in iteration 663
 Loss [[0.34006501]] in iteration 664
 Loss [[0.33982658]] in iteration 665
 Loss [[0.33958856]] in iteration 666
 Loss [[0.33935096]] in iteration 667
 Loss [[0.33911377]] in iteration 668
 Loss [[0.33887698]] in iteration 669
 Loss [[0.33864061]] in iteration 670
 Loss [[0.33840465]] in iteration 671
 Loss [[0.3381691]] in iteration 672
 Loss [[0.33793395]] in iteration 673
 Loss [[0.33769921]] in iteration 674
 Loss [[0.33746487]] in iteration 675
 Loss [[0.33723094]] in iteration 676
 Loss [[0.3369974]] in iteration 677
 Loss [[0.33676428]] in iteration 678
 Loss [[0.33653155]] in iteration 679
 Loss [[0.33629922]] in iteration 680
 Loss [[0.33606

 Loss [[0.28163011]] in iteration 976
 Loss [[0.28148358]] in iteration 977
 Loss [[0.28133725]] in iteration 978
 Loss [[0.28119113]] in iteration 979
 Loss [[0.28104522]] in iteration 980
 Loss [[0.28089951]] in iteration 981
 Loss [[0.280754]] in iteration 982
 Loss [[0.2806087]] in iteration 983
 Loss [[0.28046359]] in iteration 984
 Loss [[0.2803187]] in iteration 985
 Loss [[0.280174]] in iteration 986
 Loss [[0.2800295]] in iteration 987
 Loss [[0.27988521]] in iteration 988
 Loss [[0.27974112]] in iteration 989
 Loss [[0.27959723]] in iteration 990
 Loss [[0.27945354]] in iteration 991
 Loss [[0.27931005]] in iteration 992
 Loss [[0.27916675]] in iteration 993
 Loss [[0.27902366]] in iteration 994
 Loss [[0.27888077]] in iteration 995
 Loss [[0.27873807]] in iteration 996
 Loss [[0.27859557]] in iteration 997
 Loss [[0.27845327]] in iteration 998
 Loss [[0.27831116]] in iteration 999
Final training loss  0.27831116480021495


In [52]:
def predict_tweet(tweet, freqs, theta):
    """
    Score a single tweet with the trained logistic regression weights
    Input:
    tweet: a string containing a tweet
    freqs: frequency dictionary containing +ve,-ve frequencies of all words in the corpus
    theta: trained weights
    Output:
    pred: sigmoid of the weighted features, i.e. the predicted probability
          of the label-1 (+ve) class
    """
    features = extract_features(tweet, freqs)
    z = np.dot(features,theta)
    return 1/(1+np.exp(-z))

In [53]:
# Hand-written examples to sanity check the trained model
sample_tweets = [
    'I am happy',
    'I am bad',
    'this movie should have been great.',
    'great',
    'great great',
    'great great great',
    'great great great great',
]
for tweet in sample_tweets:
    probability = predict_tweet(tweet, freqs,theta)
    print(" {}  {} ".format(tweet, probability))

 I am happy  [[0.51613003]] 
 I am bad  [[0.49453856]] 
 this movie should have been great.  [[0.51251768]] 
 great  [[0.51281724]] 
 great great  [[0.52561764]] 
 great great great  [[0.53838445]] 
 great great great great  [[0.55110112]] 


In [54]:
#Check performance on test set
# A tweet is classified as positive when its predicted probability exceeds 0.5
y_hat = [predict_tweet(tweet, freqs,theta) >0.5 for tweet in X_test]
n_correct = (np.squeeze(y_hat) == np.squeeze(y_test)).sum()
accuracy = n_correct/len(X_test)
print("Test accuracy of twitter analysis is {}%: ".format(accuracy*100))

Test accuracy of twitter analysis is 99.5%: 
