## Sentiment Analysis on Movie Reviews using
- Logistic Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
import re

### Movie Reviews dataset from IMDB

In [None]:
# Load the movie-review dataset from a CSV file in the working directory.
# Later cells read its 'review' (text) and 'sentiment' (label) columns.
reviews = pd.read_csv("IMDB Dataset.csv")
# Bare expression so the notebook renders the DataFrame as the cell output.
reviews

### Positive:Negative plot

In [None]:
# Count the reviews in each class. Series.sum() is vectorised, whereas the
# builtin sum() walks the Series one element at a time in Python.
lenpos = int((reviews['sentiment'] == 'positive').sum())
lenneg = int((reviews['sentiment'] == 'negative').sum())

fig = plt.figure(figsize=(6, 6))
labels = 'Positive', 'Negative'
sizes = [lenpos, lenneg]
explode = (0.05, 0.05)  # pull both slices slightly away from the centre

# autopct is applied as `fmt % pct`, so '%%' is needed to print a literal
# percent sign after the value.
plt.pie(sizes, explode=explode, shadow=True, labels=labels,
        startangle=90, autopct='%.1f%%')

plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

## Data Preprocessing
0. (split train/test)
1. Remove HTML tags
2. Remove stopwords
3. Remove Punctuation
4. Stem words (planned: `PorterStemmer` is imported, but the `process` function below does not apply stemming yet)

In [None]:
# Strip HTML tags from every review. The previous lambda ignored its argument
# and cleaned reviews['review'][1] each time, overwriting all rows with the
# same text. Tags are replaced by a space (not '') so that words separated
# only by markup such as "<br /><br />" are not glued into one token.
reviews['review'] = reviews['review'].str.replace(r'<.*?>', ' ', regex=True)

In [None]:
# Draw a reproducible 70% sample of the rows for training; every row that was
# not sampled (identified by its index label) becomes the test set.
df_train = reviews.sample(frac=0.7, random_state=42)
df_test = reviews.drop(index=df_train.index)

# Features: every column except the label. Targets: a one-column DataFrame
# holding only 'sentiment'.
x_train, x_test = (part.drop('sentiment', axis=1) for part in (df_train, df_test))
y_train, y_test = (part[['sentiment']] for part in (df_train, df_test))
x_train

### Create frequency dictionary and feature extraction function

In [None]:
# Characters that make up punctuation-only tokens such as "...", "--" or "``".
_PUNCTUATION_CHARS = frozenset(string.punctuation)

# Tokenizer artefacts the original filter was meant to drop. They are kept as
# whole tokens here; the original tested `word in "<one long string>"`, which
# is a substring test and matched unintended fragments.
_NOISE_TOKENS = frozenset({"''", "``", "'s", "A"})

# Lazily filled cache so the stopword corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def process(text):
    """Tokenize a review and drop stopwords, punctuation and noise tokens.

    Args:
        text: the review text to tokenize (passed to ``word_tokenize``).

    Returns:
        A list of the remaining tokens, in their original order and casing.
    """
    stop_words = _english_stopwords()
    final_list = []
    for word in word_tokenize(text):
        # The stopword list is lower-case, so compare case-insensitively;
        # otherwise sentence-initial words like "The" slip through.
        if word.lower() in stop_words:
            continue
        if word in _NOISE_TOKENS:
            continue
        # Drops empty tokens and tokens made only of punctuation characters.
        if all(ch in _PUNCTUATION_CHARS for ch in word):
            continue
        final_list.append(word)
    return final_list

In [None]:
def frequency_count(traindf, labeldf):
    """Count how often each token occurs in positive and negative reviews.

    Reviews and labels are paired by position (first review with first label,
    and so on), not by index label.

    Args:
        traindf: DataFrame with a 'review' column of review texts.
        labeldf: DataFrame with a 'sentiment' column; the value 'positive'
            is counted as class 1, anything else as class 0.

    Returns:
        dict mapping ``(token, class)`` to the number of occurrences of that
        token in reviews of that class, where class is 1 or 0.

    Raises:
        ValueError: if the two frames do not have the same number of rows.
    """
    if len(traindf) != len(labeldf):
        raise ValueError(
            "traindf and labeldf must have the same number of rows "
            f"({len(traindf)} != {len(labeldf)})"
        )

    freqdic = {}
    # Iterate both columns in lockstep. The previous version rebuilt
    # labeldf.reset_index() for every single token just to look up the label.
    for rev, label in zip(traindf['review'], labeldf['sentiment']):
        sentiment = 1 if label == 'positive' else 0
        for word in process(rev):
            key = (word, sentiment)
            freqdic[key] = freqdic.get(key, 0) + 1
    return freqdic

In [None]:
def extractFeature(text, freqdic):
    """Turn one review into a (1, 3) feature row for logistic regression.

    Args:
        text: the review text; it is tokenized with ``process``.
        freqdic: dict mapping ``(token, 1)`` / ``(token, 0)`` to counts, as
            built by ``frequency_count``.

    Returns:
        numpy array of shape (1, 3): ``[bias, positive_sum, negative_sum]``,
        where bias is 1 and the sums add up, over the tokens of ``text``, the
        counts stored under ``(token, 1)`` and ``(token, 0)``. Tokens missing
        from ``freqdic`` contribute 0.
    """
    x = np.zeros((1, 3))
    x[0, 0] = 1  # bias term
    # The previous version looped over undefined names (`word_l`, `freqs`);
    # the bare `except` hid the NameError, so the sums were never filled in.
    for word in process(text):
        x[0, 1] += freqdic.get((word, 1), 0)
        x[0, 2] += freqdic.get((word, 0), 0)
    return x


# Other cells in this notebook call the snake_case spelling.
extract_features = extractFeature

## Modeling

In [None]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) of ``z``.

    ``z`` may be a scalar or a numpy array; arrays are handled element-wise.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [None]:
def gradientDescent(x, y, theta, alpha, iteration):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: feature matrix with one row per training example.
        y: labels (0 or 1), one row per example, matching the shape of
            ``sigmoid(np.dot(x, theta))``.
        theta: initial weights.
        alpha: learning rate.
        iteration: number of update steps; 0 (or less) performs no update.

    Returns:
        Tuple ``(J, theta)``: ``J`` is the cross-entropy cost as a Python
        float, evaluated on the weights used at the start of the last update
        step (or on the initial weights if no step ran); ``theta`` holds the
        weights after the last update.

    Raises:
        ValueError: if ``x`` contains no training examples.
    """
    m = len(x)
    if m == 0:
        raise ValueError("x must contain at least one training example")

    # Predictions for the initial weights, so the cost is still defined when
    # the loop body never runs (the previous version left J unbound then).
    f = sigmoid(np.dot(x, theta))
    for _ in range(iteration):
        f = sigmoid(np.dot(x, theta))
        # Step against the gradient of the cost with respect to theta.
        theta = theta - (alpha * np.dot(np.transpose(x), f - y)) / m

    # Only the final cost is returned, so compute it once instead of on every
    # iteration. Clipping keeps log() finite when predictions saturate at
    # exactly 0 or 1; values strictly inside that range are left untouched.
    eps = np.finfo(float).eps
    f_safe = np.clip(f, eps, 1 - eps)
    J = (-1 / m) * (np.dot(np.transpose(y), np.log(f_safe))
                    + np.dot(np.transpose(1 - y), np.log(1 - f_safe)))
    # squeeze() first: calling float() on an array with ndim > 0 is deprecated.
    J = float(np.squeeze(J))
    return J, theta

In [None]:
def train_model(train_x, train_y, freqdic, theta, alpha, interation):
    """Build the feature matrix for the training reviews and fit the model.

    Args:
        train_x: DataFrame with a 'review' column of review texts.
        train_y: DataFrame with a 'sentiment' column; 'positive' is encoded
            as 1.0 and any other value as 0.0, matching the class coding
            used by ``frequency_count``.
        freqdic: token-frequency dict, as built by ``frequency_count``.
        theta: initial weights, passed through to ``gradientDescent``.
        alpha: learning rate.
        interation: number of gradient-descent steps. The misspelled name is
            kept so that existing keyword callers keep working.

    Returns:
        Tuple ``(J, theta)`` exactly as returned by ``gradientDescent``.
    """
    # Iterate over the values rather than train_x['review'][i]: that lookup is
    # label-based, and the sampled training frame no longer has labels 0..n-1.
    texts = train_x['review'].to_numpy()
    X = np.zeros((len(texts), 3))
    for row, text in enumerate(texts):
        X[row, :] = extractFeature(text, freqdic)

    # Encode labels with numpy; the previous version used OrdinalEncoder,
    # which this notebook never imports.
    Y = (train_y['sentiment'].to_numpy() == 'positive').astype(float).reshape(-1, 1)

    J, theta = gradientDescent(X, Y, theta, alpha, interation)

    return J, theta

## Prediction

In [None]:
# Prediction for a single review.
def predict(text, freqdic, theta):
    """Score one review with the trained logistic-regression weights.

    Args:
        text: the review text.
        freqdic: token-frequency dict used to build the feature row.
        theta: trained weights.

    Returns:
        ``sigmoid(np.dot(x, theta))`` where ``x`` is the (1, 3) feature row
        for ``text``. ``model_acc_test`` treats values above 0.5 as class 1.
    """
    # extractFeature is the name actually defined in this notebook; the
    # previous call to `extract_features` raised NameError.
    x = extractFeature(text, freqdic)
    y_pred = sigmoid(np.dot(x, theta))

    return y_pred

In [None]:
def _labels_to_binary(labels):
    """Convert labels to a flat float array of 0/1 values."""
    if isinstance(labels, pd.DataFrame):
        labels = labels['sentiment']
    values = np.asarray(labels).ravel()
    # Text labels: 'positive' is class 1, everything else class 0.
    if values.dtype.kind in 'OUS':
        return (values == 'positive').astype(float)
    return values.astype(float)


def model_acc_test(test_x, test_y, freqdic, theta):
    """Compute the accuracy of the model on a labelled test set.

    Args:
        test_x: DataFrame with a 'review' column, or any iterable of review
            texts.
        test_y: DataFrame with a 'sentiment' column, or an array-like of
            labels; text labels are encoded as 'positive' -> 1, else 0, and
            numeric labels are used as given.
        freqdic: token-frequency dict used to build the features.
        theta: trained weights.

    Returns:
        Fraction of reviews whose thresholded prediction (> 0.5 means class 1)
        equals the true label, as a float.

    Raises:
        ValueError: if the test set is empty or the number of labels does not
            match the number of reviews.
    """
    # Iterating a DataFrame yields its column names, not its rows, so pick
    # the text column explicitly.
    texts = test_x['review'] if isinstance(test_x, pd.DataFrame) else test_x

    y_hat = []
    for text in texts:
        # predict() returns a one-element array; reduce it to a scalar.
        score = float(np.squeeze(predict(text, freqdic, theta)))
        y_hat.append(1.0 if score > 0.5 else 0.0)

    y_true = _labels_to_binary(test_y)
    if not y_hat:
        raise ValueError("test_x contains no reviews")
    if len(y_hat) != len(y_true):
        raise ValueError(
            f"got {len(y_hat)} reviews but {len(y_true)} labels"
        )

    acc = float(np.mean(np.array(y_hat) == y_true))

    return acc

# Main!

In [None]:
# Build the token-frequency table from the training split. It is bound to
# both spellings because this cell originally assigned `freq_dic` while the
# calls below read `freqdic`.
freqdic = frequency_count(x_train, y_train)
freq_dic = freqdic

# train_model returns (final cost, trained weights); unpack the tuple so that
# `theta` is available to the evaluation cell.
model = train_model(x_train, y_train,
                    freqdic, np.zeros((3, 1)),
                    1e-7, 800)
J, theta = model

In [None]:
# Evaluate on the held-out split. The training cell binds `freq_dic` and
# `model` (a (cost, weights) tuple), so take the inputs from those names
# instead of relying on `freqdic` / `theta` having been defined.
_, theta = model
model_acc_test(x_test,
               y_test,
               freq_dic,
               theta)