<a href="https://colab.research.google.com/github/rohit11544/NQSI/blob/review-analysis/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# run this cell to import nltk
import nltk
from os import getcwd

In [None]:
# Register the local `tmp2` folder (one level above the working directory) as an
# extra NLTK data location, so corpora already stored there can be loaded
# without downloading them again after the workspace is refreshed.
filePath = f"{getcwd()}/../tmp2/"

# Guard the append so that re-running this cell does not add the same entry
# to nltk.data.path a second time.
if filePath not in nltk.data.path:
    nltk.data.path.append(filePath)

In [None]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 

import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list with the stemmed tokens of the tweet that remain
        after stock tickers, a leading "RT" marker, hyperlinks, '#' signs,
        @handles, English stopwords and punctuation tokens have been removed

    """
    stemmer = PorterStemmer()
    # a set gives constant-time membership tests in the filter below
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks; \S* stops at the first whitespace character, so only
    # the URL itself is dropped and any words that follow it are kept
    tweet = re.sub(r'https?://\S*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets (handles are stripped by the tokenizer itself)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # keep a token only if it is neither a stopword nor a substring of
    # string.punctuation, then reduce it to its stem
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean


def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array (or a plain list of length m) with the sentiment
            label of each tweet (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Flatten the labels into a plain 1-D list so they can be zipped with the
    # tweets. ravel() is used instead of squeeze() because squeeze() turns a
    # one-element input into a 0-d array, whose tolist() is a bare scalar that
    # zip() cannot iterate over. A plain list passes through unchanged.
    yslist = np.ravel(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs


In [None]:
# Download the `twitter_samples` corpus, which provides the
# positive_tweets.json and negative_tweets.json files loaded in a later cell.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [None]:
# Download the `stopwords` corpus; process_tweet reads the English stop word
# list from it via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the raw tweet texts of both sentiment classes from the twitter_samples corpus
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [None]:
# Split each class into a training part (the first TRAIN_SIZE tweets) and a
# testing / validation part (everything after them).
TRAIN_SIZE = 4000  # number of tweets per class used for training

train_pos, test_pos = all_positive_tweets[:TRAIN_SIZE], all_positive_tweets[TRAIN_SIZE:]
train_neg, test_neg = all_negative_tweets[:TRAIN_SIZE], all_negative_tweets[TRAIN_SIZE:]

# Positives come first, then negatives; any label array built for these lists
# has to follow the same order.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [None]:
# Build the label column vectors: a 1 for every positive tweet followed by a 0
# for every negative tweet, in the same order as train_x / test_x
train_y = np.concatenate(
    (np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))), axis=0
)
test_y = np.concatenate(
    (np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))), axis=0
)

In [None]:
# Report the dimensions of the training and test label arrays
print(f"train_y.shape = {train_y.shape}")
print(f"test_y.shape = {test_y.shape}")

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


In [None]:
# Count how often every (word, label) pair occurs in the training tweets
freqs = build_freqs(train_x, train_y)

# Sanity check: container type and number of distinct (word, label) pairs
print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs) = {len(freqs)}")

type(freqs) = <class 'dict'>
len(freqs) = 11346


In [None]:
# Show one training tweet next to its processed form to check process_tweet
example_tweet = train_x[0]
print('This is an example of a positive tweet: \n', example_tweet)
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(example_tweet))

This is an example of a positive tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

This is an example of the processed version of the tweet: 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [None]:

def sigmoid(z):
    '''
    Apply the logistic function 1 / (1 + e^(-z)).

    Input:
        z: a scalar or a NumPy array
    Output:
        h: the sigmoid of z (computed element-wise when z is an array)
    '''
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [None]:

def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Train logistic regression weights with batch gradient descent.

    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the cost evaluated in the last iteration, i.e. for the weights in
           effect just before that iteration's update; when num_iters is 0
           (or negative) it is the cost of the initial theta
        theta: your final weight vector
    Hint: you might want to print the cost to make sure that it is going down.
    '''

    # get 'm', the number of rows in matrix x
    m = x.shape[0]

    # Smallest positive normal float, used as a floor inside the logarithms so
    # that a saturated prediction (h exactly 0 or 1) gives a large finite cost
    # instead of inf / nan. Any other value of h is left untouched.
    tiny = np.finfo(float).tiny

    def cost(h):
        # cross-entropy cost of the predictions h against the labels y
        log_h = np.log(np.maximum(h, tiny))
        log_one_minus_h = np.log(np.maximum(1 - h, tiny))
        return (-1/m)*np.sum(np.dot(y.T, log_h) + np.dot((1-y).T, log_one_minus_h))

    J = None
    for _ in range(num_iters):

        # get the sigmoid of z, the dot product of x and theta
        h = sigmoid(np.dot(x, theta))

        # calculate the cost function (before the weights are updated)
        J = cost(h)

        # update the weights theta
        theta = theta-(alpha/m)*(np.dot(x.T, (h-y)))

    # no iteration ran: report the cost of the untouched initial weights
    # instead of failing on an unbound J
    if J is None:
        J = cost(sigmoid(np.dot(x, theta)))

    J = float(J)
    return J, theta

In [None]:

def extract_features(tweet, freqs):
    '''
    Input:
        tweet: a string containing one tweet (it is handed to process_tweet)
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3) holding
           [bias, sum of positive-label counts, sum of negative-label counts]
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)

    # 1 x 3 feature vector; the first entry is the bias term
    x = np.zeros((1, 3))
    x[0, 0] = 1

    for word in word_l:
        # a pair that is missing from freqs contributes 0
        x[0, 1] += freqs.get((word, 1.0), 0)  # counts under the positive label 1
        x[0, 2] += freqs.get((word, 0.0), 0)  # counts under the negative label 0

    assert x.shape == (1, 3)
    return x

In [None]:
# Training hyperparameters
LEARNING_RATE = 1e-9  # step size alpha passed to gradientDescent
NUM_ITERS = 1500      # number of gradient descent iterations

# collect the features 'x' and stack them into a matrix 'X' (one row per training tweet)
X = np.zeros((len(train_x), 3))
for i, tweet in enumerate(train_x):
    X[i, :] = extract_features(tweet, freqs)

# training labels corresponding to X
Y = train_y

# Apply gradient descent, starting from an all-zero weight vector
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), LEARNING_RATE, NUM_ITERS)
print(f"The cost after training is {J:.8f}.")
# float() turns each NumPy scalar into a plain Python float so the list prints
# as bare numbers on every NumPy version (NumPy 2 shows NumPy scalars inside a
# list as `np.float64(...)`).
print(f"The resulting vector of weights is {[round(float(t), 8) for t in np.squeeze(theta)]}")

The cost after training is 0.24216529.
The resulting vector of weights is [7e-08, 0.0005239, -0.00055517]


In [None]:

def predict_tweet(tweet, freqs, theta):
    '''
    Input:
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output:
        y_pred: the sigmoid of the tweet's features times theta, i.e. the
                model's probability that the tweet is positive
    '''
    # turn the tweet into its feature vector
    features = extract_features(tweet, freqs)

    # linear score first, then squash it through the sigmoid
    score = np.dot(features, theta)
    return sigmoid(score)

In [None]:
# Try the trained model on a few hand-written examples
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    # .item() pulls the single probability out of the array returned by
    # predict_tweet as a Python float; formatting the array itself with %f
    # relies on implicit array-to-scalar conversion, which NumPy has deprecated.
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta).item()))



I am happy -> 0.518580
I am bad -> 0.494339
this movie should have been great. -> 0.515331
great -> 0.515464
great great -> 0.530898
great great great -> 0.546273
great great great great -> 0.561561


In [None]:
# Score a tweet of your own; the bare last expression shows the predicted value
my_tweet = 'I am learning :)'
predict_tweet(tweet=my_tweet, freqs=freqs, theta=theta)

array([[0.81636424]])

In [None]:

def test_logistic_regression(test_x, test_y, freqs, theta):
    """
    Input:
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output:
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """
    # predicted label per tweet: 1.0 when the prediction exceeds 0.5, else 0.0
    y_hat = [
        1.0 if predict_tweet(tweet, freqs, theta) > 0.5 else 0.0
        for tweet in test_x
    ]

    # y_hat is a plain list while test_y may be an (m,1) array, so the two are
    # compared entry by entry
    labels = np.array(test_y)
    correct = sum(1 for i in range(len(labels)) if labels[i] == y_hat[i])

    return correct / len(test_x)

In [None]:
# Evaluate the trained weights on the held-out test tweets
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print("Logistic regression model's accuracy = {:.4f}".format(tmp_accuracy))

Logistic regression model's accuracy = 0.9950


# positive review 
No, It is not at all ‘Bad’. the infrastructure has developed rapidly in the past year, making it a fully functional campus. The students here have achieved top ranks in the contests held around the country. Putting in the extra efforts(magnify it 2..3 times your JEE, I mean it), our seniors have secured admissions in best universities of the world (eg. MIT (USA), Carnegie Mellon University, etc.). You will love the IIIT system. You do not have the records, journals, etc. things to submit (where you just copy things from books). The assignments here are an actual learning experience and no waste of time and resources. The schedule at IIIT Sri City helps you focus on the important things too. You will have time for your academics and the ‘extra’ self-learning things as well and Yes, you can take out time for some recreation at weekends. Conclusion : Making correct choices at the college (peers, college groups), you will make the Best out of here.


In [None]:
# Run the tweet-trained model on a longer, review-style text
my_tweet = 'No, It is not at all ‘Bad’. the infrastructure has developed rapidly in the past year, making it a fully functional campus. The students here have achieved top ranks in the contests held around the country. Putting in the extra efforts(magnify it 2..3 times your JEE, I mean it), our seniors have secured admissions in best universities of the world (eg. MIT (USA), Carnegie Mellon University, etc.). You will love the IIIT system. You do not have the records, journals, etc. things to submit (where you just copy things from books). The assignments here are an actual learning experience and no waste of time and resources. The schedule at IIIT Sri City helps you focus on the important things too. You will have time for your academics and the ‘extra’ self-learning things as well and Yes, you can take out time for some recreation at weekends. Conclusion : Making correct choices at the college (peers, college groups), you will make the Best out of here.'

# show the tokens the model actually sees, then the predicted value
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)

# a prediction above 0.5 is reported as positive, anything else as negative
print('Positive sentiment' if y_hat > 0.5 else 'Negative sentiment')

['‘', 'bad', '’', 'infrastructur', 'develop', 'rapidli', 'past', 'year', 'make', 'fulli', 'function', 'campu', 'student', 'achiev', 'top', 'rank', 'contest', 'held', 'around', 'countri', 'put', 'extra', 'effort', 'magnifi', '2', '..', '3', 'time', 'jee', 'mean', 'senior', 'secur', 'admiss', 'best', 'univers', 'world', 'eg', 'mit', 'usa', 'carnegi', 'mellon', 'univers', 'etc', 'love', 'iiit', 'system', 'record', 'journal', 'etc', 'thing', 'submit', 'copi', 'thing', 'book', 'assign', 'actual', 'learn', 'experi', 'wast', 'time', 'resourc', 'schedul', 'iiit', 'sri', 'citi', 'help', 'focu', 'import', 'thing', 'time', 'academ', '‘', 'extra', '’', 'self-learn', 'thing', 'well', 'ye', 'take', 'time', 'recreat', 'weekend', 'conclus', 'make', 'correct', 'choic', 'colleg', 'peer', 'colleg', 'group', 'make', 'best']
[[0.51306727]]
Positive sentiment
