In [1]:
import re
import string
import numpy as np

In [10]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import pandas as pd
from nltk.corpus import twitter_samples

In [12]:
import nltk
nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

# Function for preprocessing the tweets

In [6]:
def process_tweet(tweet):
  ps = PorterStemmer()
  stopwords_english = stopwords.words('english')
  tweet = re.sub(r'\$\w*','',tweet)
  tweet = re.sub(r'^RT[\s]+','',tweet)
  tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet)
  tweet = re.sub(r'#','',tweet)

  tokenizer = TweetTokenizer(preserve_case=False,strip_handles=True,reduce_len=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  tweets_clean = []
  for word in tweet_tokens:
    if(word not in stopwords_english and word not in string.punctuation):
      stem_word = ps.stem(word)
      tweets_clean.append(stem_word)
  return tweets_clean

# Function for building frequence dict for tweets

In [7]:
def build_freqs(tweets,ys):
  yslist = np.squeeze(ys).tolist()

  freqs = {}
  for y,tweet in zip(yslist,tweets):
    for word in process_tweet(tweet):
      pair = (word,y)
      if pair in freqs:
        freqs[pair]+=1
      else:
        freqs[pair]=1
  return freqs

#Preparing the data

In [14]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [15]:
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

In [16]:
train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [19]:
train_y = np.append(np.ones((len(train_pos),1)),np.zeros((len(train_neg),1)),axis = 0)
test_y = np.append(np.ones((len(test_pos),1)),np.zeros((len(test_neg),1)),axis = 0)

In [20]:
print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


In [21]:
freqs = build_freqs(train_x,train_y)

print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 11346


In [22]:
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))

This is an example of a positive tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

This is an example of the processed version of the tweet: 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


# Creating functions for doing Logistic Regression

In [23]:
def sigmoid(z):
  h = 1/(1+np.exp(-z))

  return h

In [24]:
def gradientDescent(x,y,theta,alpha,num_iters):
  m =  x.shape[0]

  for i in range(0,num_iters):
    z = np.dot(x,theta)
    h = sigmoid(z)

    j = (-1./m)*(np.dot(y.transpose(),np.log(h))+np.dot((1-y).transpose(),np.log(1-h)))

    theta = theta-(alpha/m)*np.dot(x.transpose(),(h-y))

  j = float(j)

  return j,theta

# Extracting the features for doing LR

In [26]:
def extract_features(tweet,freqs):
  word_l = process_tweet(tweet)

  x = np.zeros((1,3))

  x[0,0] =1 

  for word in word_l:
    x[0,1] +=  freqs.get((word,1.0),0)

    x[0,2] += freqs.get((word,0.0),0)

  return x

In [29]:
X = np.zeros((len(train_x),3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)
Y = train_y

# Training the model

In [31]:
j,theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {j:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.24216529.
The resulting vector of weights is [7e-08, 0.0005239, -0.00055517]


# Function for predicting the sentiment score

In [32]:
def predict_tweet(tweet, freqs,theta):
  x = extract_features(tweet,freqs)
  y_pred = sigmoid(np.dot(x,theta))

  return y_pred

In [33]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))

I am happy -> 0.518580
I am bad -> 0.494339
this movie should have been great. -> 0.515331
great -> 0.515464
great great -> 0.530898
great great great -> 0.546273
great great great great -> 0.561561


# Final function for providing the sentiment prediction

In [34]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    y_hat = []
    
    for tweet in test_x:
        y_pred = predict_tweet(tweet, freqs, theta)
        
        if y_pred > 0.5:
            y_hat.append(1)
        else: 
            y_hat.append(0)   

    accuracy = (y_hat==np.squeeze(test_y)).sum()/len(test_x)

    return accuracy


In [35]:
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950
