# Review classification using Logistic Regression and Natural Language Processing

Import the necessary packages

In [None]:
import nltk
from os import getcwd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import string

Load the dataset from its CSV file. The path below is relative to the notebook's working directory; adjust it to match where the file is stored.

In [None]:
# Read the reviews CSV into a dataframe, then preview its first rows
df = pd.read_csv(
    filepath_or_buffer='../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv',
)
df.head()

The next step sorts the reviews by sentiment, divides the dataframe into a negative part and a positive part, and extracts the review text of each part into a list of strings.

In [None]:
# Order the rows by their sentiment label so that each class forms one
# contiguous run of rows
df.sort_values(by=['sentiment'], inplace=True)

# The first 25000 sorted rows are treated as the negative class and the
# remaining rows as the positive class.
# NOTE(review): this assumes exactly 25000 rows sort into the first class -
# confirm against the dataset before reusing with other data.
df_negative, df_positive = df.iloc[:25000], df.iloc[25000:]

# Pull the review text of each class out of the dataframe as plain lists
all_negative_reviews = df_negative["review"].tolist()
all_positive_reviews = df_positive["review"].tolist()

Splitting and preparing the data for training

In [None]:
# Hold out the first 5000 reviews of each class for testing and train on the rest
test_pos, train_pos = all_positive_reviews[:5000], all_positive_reviews[5000:]
test_neg, train_neg = all_negative_reviews[:5000], all_negative_reviews[5000:]

# In both sets the positive reviews come first, followed by the negative ones
x_train = train_pos + train_neg
x_test = test_pos + test_neg

# Label column vectors in the same order as x_train / x_test:
# y = 1 for every positive review, y = 0 for every negative review
train_pos_labels = np.ones((len(train_pos), 1))
train_neg_labels = np.zeros((len(train_neg), 1))
y_train = np.append(train_pos_labels, train_neg_labels, axis=0)

test_pos_labels = np.ones((len(test_pos), 1))
test_neg_labels = np.zeros((len(test_neg), 1))
y_test = np.append(test_pos_labels, test_neg_labels, axis=0)

The `process_tweet` function takes a string containing a review and returns a list of words from the processed review.

In [None]:
def process_tweet(tweet):
    '''
    Clean, tokenize and stem one piece of text.

    Input:
        tweet: a string containing a tweet or review
    Output:
        tweets_clean: a list of stemmed tokens, with English stopwords and
            punctuation tokens left out
    '''
    stemmer = PorterStemmer()
    # a set gives constant-time membership tests inside the token loop below
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks: match only the URL itself (up to the next whitespace)
    # so that the words following a link on the same line are kept
    tweet = re.sub(r'https?://\S*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize; the tokenizer is configured to not preserve case, to strip
    # handles and to reduce lengthened character runs
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # keep every token that is neither a stopword nor punctuation, stemmed
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

Frequency function mapping each (word, label) pair to the number of times it occurs

In [None]:
def build_freqs(tweets, ys):
    '''
    Count how often each processed word occurs together with each label.

    Input:
        tweets: a list of strings
        ys: array-like holding one label per entry of tweets (for example
            an m x 1 array)
    Output:
        freqs: a dictionary mapping each (word, label) tuple to the number
            of times that word appears in texts carrying that label
    '''
    # Flatten the labels into a 1-D list. np.squeeze would turn a single
    # label into a 0-d array whose tolist() is a bare scalar, and zip()
    # cannot iterate over that.
    yslist = np.ravel(ys).tolist()

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [None]:
# Count (word, label) occurrences over the whole training set
freqs = build_freqs(tweets=x_train, ys=y_train)

Sigmoid Function

In [None]:

def sigmoid(z):
    '''
    Apply the logistic function 1 / (1 + e^(-z)).

    Input:
        z: a scalar or a numpy array
    Output:
        the logistic function of z, evaluated element-wise for arrays
    '''
    return 1 / (np.exp(-z) + 1)

In [None]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Fit logistic-regression weights with batch gradient descent.

    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost as a Python float. It is evaluated on the
            predictions made in the last iteration, i.e. before that
            iteration's weight update. When num_iters is 0 it is the cost
            of the initial theta.
        theta: your final weight vector
    '''
    # get 'm', the number of rows in matrix x
    m = x.shape[0]

    # predictions of the most recent iteration; stays None if no iteration runs
    h = None
    for _ in range(num_iters):
        # sigmoid of the dot product of x and theta
        h = sigmoid(np.dot(x, theta))

        # update the weights theta
        theta = theta - (alpha / m) * np.dot(x.transpose(), (h - y))

    if h is None:
        # no iteration ran: report the cost of the unchanged initial weights
        h = sigmoid(np.dot(x, theta))

    # Only the final cost is returned, so it is computed once here instead
    # of on every iteration.
    J = -1. / m * (np.dot(y.transpose(), np.log(h))
                   + np.dot((1 - y).transpose(), np.log(1 - h)))

    # squeeze to 0-d before converting: calling float() on an array with
    # ndim > 0 is deprecated in recent NumPy releases
    J = float(np.squeeze(J))
    return J, theta

In [None]:
#Feature extraction
def extract_features(tweet, freqs):
    '''
    Build the three-element feature vector for one piece of text.

    Input:
        tweet: a string; it is passed through process_tweet before counting
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        features: a (1,3) vector holding the bias term 1, the summed
            frequencies of the text's words under label 1, and the summed
            frequencies of the text's words under label 0
    '''
    positive_total = 0.0
    negative_total = 0.0

    # every processed word contributes its frequency under each label;
    # words missing from freqs contribute 0
    for token in process_tweet(tweet):
        positive_total += freqs.get((token, 1.0), 0)
        negative_total += freqs.get((token, 0.0), 0)

    # bias term first, then the positive and negative totals
    features = np.array([[1.0, positive_total, negative_total]])

    assert(features.shape == (1, 3))
    return features

In [None]:
# Build the feature matrix 'X': one row of three features per training review
X = np.zeros((len(x_train), 3))
for row, review in enumerate(x_train):
    X[row, :] = extract_features(review, freqs)

# training labels, aligned row-for-row with X
Y = y_train

# Train from all-zero weights with learning rate 1e-9 for 1500 iterations
initial_theta = np.zeros((3, 1))
J, theta = gradientDescent(X, Y, initial_theta, 1e-9, 1500)

In [None]:
def predict_tweet(tweet, freqs, theta):
    '''
    Score one piece of text with the trained weights.

    Input:
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output:
        the sigmoid of the dot product between the text's feature vector
        and theta; the caller treats values above 0.5 as positive sentiment
    '''
    # turn the raw text into its feature vector
    features = extract_features(tweet, freqs)

    # squash the weighted sum of the features into the range (0, 1)
    return sigmoid(np.dot(features, theta))

In [None]:
# Put your review here
my_tweet = 'This movie is one of the worst movies ever'
y_hat = predict_tweet(my_tweet, freqs, theta)

# Scores above 0.5 are reported as positive, everything else as negative
print('Positive sentiment' if y_hat > 0.5 else 'Negative sentiment')