# Sentiment Analyzer

* Data: Amazon reviews
* The problem has been simplified: instead of a 5-star rating, the output is classified only as positive or negative
* The only input is the "review_text", all other data is ignored
* BeautifulSoup is an XML parser

In [None]:
import nltk
import numpy as np
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()

def my_tokenizer(sentence):
    """Turn a raw review string into a list of cleaned tokens.

    The sentence is lower-cased, tokenized with NLTK, stripped of tokens
    that are one or two characters long, lemmatized with WordNet, and
    finally stripped of English stop words.

    Args:
        sentence: The raw text of one review.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    # Build the stop-word set once per call: calling stopwords.words() inside
    # the comprehension re-reads the corpus list for every single token.
    stop_words = set(stopwords.words('english'))

    sentence = sentence.lower() # All sentences are now in lower case
    tokens = nltk.tokenize.word_tokenize(sentence) # Creates tokens
    tokens = [t for t in tokens if len(t) > 2] # Ignore all tokens with one or two characters
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # Applies the lemmatizer
    tokens = [t for t in tokens if t not in stop_words and t != ''] # Removes stop words and empty tokens
    return tokens

In [None]:
# Parse both review files and keep only the <review_text> elements.
# The files are opened with a context manager so the handles are closed
# as soon as their content has been read.
# NOTE(review): no parser is passed to BeautifulSoup, so it picks the best one
# installed on this machine; consider naming one explicitly for reproducibility.
with open('sorted_data_acl//electronics//positive.review') as review_file:
    positive_reviews = BeautifulSoup(review_file.read())
positive_reviews = positive_reviews.find_all('review_text') # Uses only the review_text info

with open('sorted_data_acl//electronics//negative.review') as review_file:
    negative_reviews = BeautifulSoup(review_file.read())
negative_reviews = negative_reviews.find_all('review_text') # Uses only the review_text info

In [None]:
# Since we have more positive reviews than negatives, we trim the amount of positive reviews
# so that both classes end up with the same number of samples.
# The positive reviews are shuffled in place first, so the subset that is kept
# is a random sample rather than simply the first reviews in the file.
# NOTE(review): the shuffle is not seeded, so the kept subset differs between runs.
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

In [None]:
# Dictionary containing every word in the vocabulary
# It maps each token to its column index in the feature vector
word_index_map = {}
# Next free index, assigned to the next previously unseen token
current_index = 0

# Token lists of every review, one entry per review, kept per class
positive_tokenized = []
negative_tokenized = []

In [None]:
# Builds the vocabulary: every previously unseen token receives the next free index,
# so the final size of word_index_map is the vocabulary size.
# The token list of each review is stored alongside, in the list of its own class.
# The vocabulary could still be reduced

for reviews, tokenized in ((positive_reviews, positive_tokenized),
                           (negative_reviews, negative_tokenized)):
    for review in reviews:
        tokens = my_tokenizer(review.text)
        tokenized.append(tokens)
        for token in tokens:
            if token in word_index_map:
                continue
            word_index_map[token] = current_index
            current_index += 1

print(word_index_map)

In [None]:
def tokens_to_vector(tokens, label):
    """Convert one tokenized review into a normalized bag-of-words row.

    Args:
        tokens: Tokens of a single review; each one must be a key of
            word_index_map.
        label: Class label stored in the last position of the vector.

    Returns:
        A 1-D numpy array of length len(word_index_map) + 1. The first
        len(word_index_map) entries hold the relative frequency of each
        vocabulary word in the review and the last entry holds the label.

    Raises:
        KeyError: If a token is not present in word_index_map.
    """
    # Feature vector has the same size of the vocabulary (+1 for the label) and has all values set to 0
    x = np.zeros(len(word_index_map) + 1)
    # Find the word in the dictionary, retrieve its index
    # Change the value of the word for this particular X vector
    for t in tokens:
        x[word_index_map[t]] += 1
    # Normalizes X in relation to the total amount of words in the document.
    # A review with no tokens left after filtering has a total of 0; dividing
    # would fill the row with NaN, so such a row is kept as all zeros instead.
    total = x.sum()
    if total > 0:
        x = x / total
    x[-1] = label
    return x

In [None]:
# Assemble the full data matrix: one row per review, one column per vocabulary
# word, plus a final column holding the label (1 = positive, 0 = negative).
N = len(positive_tokenized) + len(negative_tokenized)
data = np.zeros((N, len(word_index_map) + 1))

# Positive reviews fill the first rows, negative reviews the remaining ones
labelled_documents = [(tokens, 1) for tokens in positive_tokenized]
labelled_documents += [(tokens, 0) for tokens in negative_tokenized]

for row, (tokens, label) in enumerate(labelled_documents):
    data[row, :] = tokens_to_vector(tokens, label)

print("Rows (input size): ", len(data))
print("Columns (feature vector size): ", len(data[0]))

In [None]:
# Shuffle the rows in place so positive and negative samples are interleaved
# (the matrix was filled with all positives first, then all negatives).
# NOTE(review): train_test_split below is also called with shuffle = True.
np.random.shuffle(data)

In [None]:
# Split data
# You can manually split or use sklearn function
# The last column of data is the label, every other column is a feature.
# The label is selected with data[:, -1] (1-D) rather than data[:, -1:] (column
# vector), because scikit-learn estimators expect y with shape (n_samples,).

#X = data[:, :-1]
#Y = data[:, -1]

#X_train = X[:-100,]
#Y_train = Y[:-100,]

#X_test = X[-100:,]
#Y_test = Y[-100:,]


X_train, X_test, Y_train, Y_test = train_test_split(data[:, :-1], 
                                                    data[:, -1], 
                                                    test_size = 0.25, 
                                                    random_state = 42, 
                                                    shuffle = True)

In [None]:
# Logistic Regression
# Fit on the training split and report the score on the held-out test split
model = LogisticRegression()
model.fit(X_train, Y_train)
print("Classification rate: ", model.score(X_test, Y_test))

In [None]:
# MLP with two hidden layers (30 and 15 units)
# alpha is the L2 regularization strength; random_state fixes the weight
# initialization so that runs are repeatable
mlp_model = MLPClassifier(hidden_layer_sizes = (30, 15, ), 
                          activation = "relu", 
                          solver = "adam", 
                          alpha = 0.65, 
                          max_iter = 1000, 
                          random_state = 1234)

# Fit on the training split and report the score on the held-out test split
mlp_model.fit(X_train, Y_train)
print("Classification rate: ", mlp_model.score(X_test, Y_test))