<a href="https://colab.research.google.com/github/rjtsharma3538/twitter_sentiment_analysis_using_logistic_regression/blob/main/twitter_sentimental_analysis_using_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
# NOTE(review): getcwd does not appear to be used in the visible cells — confirm before removing.
from os import getcwd
# Fetch the NLTK data used later in the notebook: the sample tweets and the stopword lists.
nltk.download('twitter_samples')
nltk.download('stopwords')


[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import numpy as np
import pandas as pd


In [None]:
from nltk.corpus import twitter_samples

In [None]:
# select set of positive and negative tweets
# Read both corpora in one pass; the order of the file names fixes which name gets which list.
positive_tweet, negative_tweet = (
    twitter_samples.strings(file_name)
    for file_name in ('positive_tweets.json', 'negative_tweets.json')
)

# The positive and negative sets contain 5,000 tweets each.


In [None]:
# Split the data into training and testing sets: 80% for training, 20% for testing
# The first 4000 tweets of each class are used for training; the remainder is held out.
train_positive, test_positive = positive_tweet[:4000], positive_tweet[4000:]
train_negative, test_negative = negative_tweet[:4000], negative_tweet[4000:]

# Positive tweets come first, then negative tweets, in both combined lists.
train_x = [*train_positive, *train_negative]
test_x = [*test_positive, *test_negative]

In [None]:
# Labels for every tweet in the training and testing sets: 1 for positive, 0 for negative

# Column vectors of labels: a block of ones (positive) stacked on a block of zeros (negative).
train_y = np.concatenate(
    (np.ones((len(train_positive), 1)), np.zeros((len(train_negative), 1))),
    axis=0,
)
test_y = np.concatenate(
    (np.ones((len(test_positive), 1)), np.zeros((len(test_negative), 1))),
    axis=0,
)

In [None]:
print(f"train_shape = {train_y.shape}")

train_shape = (8000, 1)


In [None]:
print(f"test_shape = {test_y.shape}")

test_shape = (2000, 1)


In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [None]:
import string
import re

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [None]:
# pre processing of tweet

def process(tweet):
  """Clean, tokenize and stem a single raw tweet.

  The text is stripped of stock tickers (``$GE``), a leading old-style ``RT``
  marker, hyperlinks and ``#`` characters. It is then tokenized with
  ``TweetTokenizer`` (lower-cased, @handles removed, repeated characters
  shortened), filtered, and stemmed with the Porter stemmer.

  Args:
    tweet: The tweet text as a string.

  Returns:
    A list of stemmed tokens, in their original order.
  """
  stemmer = PorterStemmer()
  # The corpus reader returns a list; a set makes the per-token lookup constant-time.
  stopwords_english = set(stopwords.words('english'))

  tweet = re.sub(r'\$\w*', '', tweet)                     # remove stock market tickers like $GE
  tweet = re.sub(r'^RT[\s]+', '', tweet)                  # remove old style retweet text
  tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)       # remove hyperlinks
  tweet = re.sub(r'#', '', tweet)                         # remove # sign

  tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  # `word not in string.punctuation` is a substring test on the punctuation string:
  # single punctuation marks are dropped, while emoticons such as ':)' are kept.
  return [
      stemmer.stem(word)
      for word in tweet_tokens
      if word not in stopwords_english and word not in string.punctuation
  ]



In [None]:
# Show one raw training tweet followed by its processed token list.
print(train_x[0])
print(process(tweet=train_x[0]))

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [None]:
# Frequency function: counts how often each word occurs in the positive and in the negative class

def build_freq(tweets, ys):
  """Count how often each processed word occurs under each label.

  Args:
    tweets: An iterable of raw tweet strings.
    ys: The labels, one per tweet, as an array-like (for example an (m, 1)
      array of 0s and 1s).

  Returns:
    A dict mapping ``(word, label)`` pairs to the number of times that word
    appears in tweets carrying that label.
  """
  # ravel (unlike squeeze) keeps a single label as a one-element list, so zip can
  # still iterate when only one tweet is supplied.
  yslist = np.ravel(ys).tolist()

  freq = {}
  for y, tweet in zip(yslist, tweets):
    for word in process(tweet):
      pair = (word, y)
      freq[pair] = freq.get(pair, 0) + 1

  return freq

In [None]:
# Build the frequency dictionary from the training data

freqs = build_freq(tweets=train_x, ys=train_y)

In [None]:
print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs) = {len(freqs)}")

type(freqs) = <class 'dict'>
len(freqs) = 11427


In [None]:
# sigmoid function

def sigmoid(x):
  """Apply the logistic function 1 / (1 + e^(-x)).

  Args:
    x: A number or NumPy array; arrays are handled element-wise.

  Returns:
    The logistic of ``x``, with the same shape as the input.
  """
  denominator = 1 + np.exp(-x)
  return 1 / denominator

In [None]:
print(sigmoid(x=2))

0.8807970779778823


In [None]:
# Extract the input features of a tweet in the form [1, (sum of positive frequencies of all words in the tweet), (sum of negative frequencies of all words in the tweet)]

def extract_features(tweets, freq):
  """Turn one tweet into the three-element feature list ``[1, pos, neg]``.

  Args:
    tweets: The raw text of a single tweet (it is passed straight to ``process``).
    freq: A dict mapping ``(word, label)`` pairs to counts.

  Returns:
    A list ``[1, pos, neg]`` where ``pos`` and ``neg`` are the summed counts of
    the tweet's processed words under label 1 and label 0 respectively. Words
    missing from ``freq`` contribute nothing.
  """
  tokens = process(tweets)
  positive_total = sum(freq.get((token, 1), 0) for token in tokens)
  negative_total = sum(freq.get((token, 0), 0) for token in tokens)
  return [1, positive_total, negative_total]

In [None]:
# gradient descent function , updating theta and minimising cost

def gradient_descent(x, y, theta, alpha, iterations):
  """Fit logistic-regression weights with batch gradient descent.

  Args:
    x: Feature matrix with one row per example.
    y: Column vector of labels (0 or 1), one row per example.
    theta: Initial weights as a column vector with one row per feature.
    alpha: Learning rate.
    iterations: Number of gradient steps to take; 0 leaves ``theta`` unchanged.

  Returns:
    A tuple ``(J, theta)``: the cross-entropy cost, as a Python float, evaluated
    at the returned weights, and the weights after all updates.
  """
  m = len(x)

  # Run every update before returning; a return inside this loop would stop
  # training after the first step.
  for _ in range(iterations):
    h = sigmoid(np.dot(x, theta))
    theta = theta - (alpha / m) * np.dot(x.T, h - y)

  # Evaluate the cost once, at the final weights, so it is defined even when
  # no iterations were requested.
  h = sigmoid(np.dot(x, theta))
  J = (-1 / m) * (np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1 - h)))

  # J is a 1x1 array here; squeeze it first because calling float() directly on
  # an array with dimensions is deprecated in recent NumPy releases.
  return float(np.squeeze(J)), theta

In [None]:
# Build the feature matrix row by row: one [1, pos, neg] row per training tweet.
X = np.zeros((len(train_x), 3))
for i, tweet_text in enumerate(train_x):
  X[i, :] = extract_features(tweet_text, freqs)
Y = train_y

# Train from all-zero weights.
J, theta = gradient_descent(X, Y, np.zeros((3, 1)), 1e-9, 1500)

print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")


The cost after training is 0.69314718.
The resulting vector of weights is [0.0, 6.2e-07, -8.7e-07]


In [None]:
# Features of the first training tweet, as a quick check of extract_features.
tmp1 = extract_features(tweets=train_x[0], freq=freqs)
print(tmp1)

[1, 3133, 61]


In [None]:
def predict_tweet(tweet, freqs, theta):
    """Score a single tweet with the trained logistic-regression weights.

    Args:
        tweet: The raw tweet text.
        freqs: A dict mapping ``(word, label)`` pairs to counts.
        theta: The weight vector; the training cell uses shape (3, 1).

    Returns:
        The sigmoid of the dot product between the tweet's feature list and
        ``theta``. With a (3, 1) ``theta`` this is a one-element array.
    """
    features = extract_features(tweet, freqs)
    return sigmoid(np.dot(features, theta))

def test_logistic_regression(test_x, test_y, freqs, theta):

    # the list for storing predictions
    y_hat = []

    for tweet in test_x:
        # get the label prediction for the tweet
        y_pred = predict_tweet(tweet, freqs, theta)

        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1)
        else:
            # append 0 to the list
            y_hat.append(0)
# With the above implementation, y_hat is a list, but test_y is (m,1) array
    # convert both to one-dimensional arrays in order to compare them using the '==' operator
    y_hat = np.array(y_hat)
    test_y = test_y.reshape(-1)
    accuracy = np.sum((test_y == y_hat).astype(int))/len(test_x)

    return accuracy

In [None]:
# Accuracy of the trained weights on the held-out tweets.
tmp_accuracy = test_logistic_regression(
    test_x=test_x, test_y=test_y, freqs=freqs, theta=theta
)
print(tmp_accuracy)

0.992


In [None]:
# Read a tweet from the user and print 1 when its score is above 0.5, otherwise 0.
tweet = input()
result = predict_tweet(tweet, freqs, theta)
print(1 if result > 0.5 else 0)

i am a good boy
1
