# Sentiment analysis with TFLearn
Sentiment analysis using TFLearn / NLTK

## Dependencies

In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical

In [8]:
reviews = pd.read_csv('reviews.txt', header=None)
labels = pd.read_csv('labels.txt', header=None)

### Counting word frequency

In [9]:
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import string

stemmer = nltk.PorterStemmer() #convert words to their word stem
punctuations = list(string.punctuation) #remove punctuations
stopwords_col = stopwords.words('english') #remove meaningless wrods

total_counts = Counter()#words counter
for text in reviews[0]:
        for word in [word for word in word_tokenize(text) if word not in stopwords_col and word not in punctuations]:
            try:
                raw = stemmer.stem(word)
            except:
                raw = word
        total_counts[raw] += 1
print("Total words in data set: ", len(total_counts))

Total words in data set:  4858


## Bag of words

In [10]:
vocab = sorted(total_counts, key=total_counts.get, reverse=True)
print(vocab[:1000])

['br', 'movi', 'film', 'time', 'one', 'watch', 'see', 'recommend', 'enjoy', 'rate', 'star', 'like', 'disappoint', 'end', 'good', 'better', 'give', 'life', 'miss', 'fan', 'bad', 'stori', 'entertain', 'well', 'seen', 'much', 'way', 'work', 'ever', 'instead', 'year', 'avoid', 'show', 'love', 'back', 'cost', 'day', 'made', 'classic', 'money', 'fun', 'dvd', 'view', 'best', 'look', 'think', 'warn', 'deserv', 'check', 'get', 'know', 'though', 'cinema', 'today', 'els', 'away', 'laugh', 'crap', 'thing', 'funni', 'grade', 'audienc', 'come', 'go', 'right', 'say', 'make', 'world', 'minut', 'comedi', 'act', 'great', 'perform', 'score', 'peopl', 'bore', 'scene', 'pictur', 'thank', 'book', 'hour', 'genr', 'either', 'anyon', 'point', 'happen', 'flick', 'night', 'interest', 'screen', 'seri', 'ten', 'experi', 'sequel', 'com', 'us', 'rent', 'kid', 'review', 'version', 'chanc', 'opinion', 'stuff', 'effort', 'b', 'futur', 'age', 'worth', 'aw', 'masterpiec', 'releas', 'wrong', 'gener', 'director', 'live', '

What's the last word in our vocabulary? We can use this to judge if 10000 is too few. If the last word is pretty common, we probably need to keep more words.

## Word to vectors

In [11]:
word2idx = {}
for index, word in enumerate(vocab):
    word2idx[word] = index

In [12]:
print(word2idx)

{'suckotroc': 2108, 'often': 536, 'bronz': 1369, 'africa': 843, 'facet': 2109, 'detroit': 2110, 'person': 269, 'imposs': 844, 'conceit': 2111, 'schlesing': 2112, 'enough': 190, 'eyr': 1039, 'option': 845, 'motorbik': 2113, 'mcdonald': 2114, 'formula': 846, 'smoke': 1040, 'imagin': 235, 'ton': 2117, 'aboutagirli': 4132, 'crap': 57, 'store': 481, 'va': 2118, 'mechan': 1370, 'liner': 1371, 'wait': 537, 'humankind': 2119, 'help': 165, 'implod': 1372, 'klingon': 2121, 'southron': 2122, 'funni': 59, 'venu': 2123, 'hyster': 2124, 'nazism': 2125, 'flashdanc': 4041, 'ice': 1373, 'horobin': 3856, 'franchis': 1374, 'phenomenon': 2126, 'found': 1052, 'remov': 2997, 'vote': 159, 'era': 625, 'consumpt': 2128, 'broadway': 3767, 'household': 3298, 'redifin': 3948, 'though': 51, 'resurrect': 4564, 'tutti': 2131, 'fill': 1377, 'precinct': 2132, 'affect': 861, 'cheadl': 2134, 'massacr': 862, 'fedor': 2136, 'streep': 3464, 'car': 348, 'freedom': 1041, 'peac': 409, 'substanc': 721, 'unrecommend': 3849, 'to

## Text to vector function

In [13]:
def text_to_vector(text):
    to_vector = np.zeros((1, len(word2idx)), dtype=np.int)
    for word in [word for word in word_tokenize(text) if word not in stopwords_col and word not in punctuations]:
        try:
            raw = stemmer.stem(word)
        except:
            raw = word
        if raw in word2idx:
            to_vector[0][word2idx[raw]] += 1
    return to_vector

The result should be returned by this function :
```
text_to_vector('The tea is for a party to celebrate '
               'the movie so she has no time for a cake')[:65]
                   
array([0, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0])
```       

## Turn all of the sentances to Vectors

In [14]:
word_vectors = np.zeros((len(reviews), len(vocab)), dtype=np.int_)
for ii, (_, text) in enumerate(reviews.iterrows()):
    word_vectors[ii] = text_to_vector(text[0])

In [15]:
# Printing out the first 5 word vectors
word_vectors[:5, :23]

array([[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0],
       [8, 0, 2, 0, 1, 0, 2, 0, 0, 0, 1, 5, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1],
       [8, 0, 6, 4, 4, 0, 2, 0, 0, 0, 1, 4, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 2],
       [0, 1, 0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]])

## Train, Validation, Test sets

Split the data into train, validation, and test sets. Using the function `to_categorical` from TFLearn to reshape the target data so that we'll have two output units and can classify with a softmax activation function. 

In [16]:
Y = (labels=='positive').astype(np.int_)
records = len(labels)

shuffle = np.arange(records)
np.random.shuffle(shuffle)
test_fraction = 0.9

train_split, test_split = shuffle[:int(records*test_fraction)], shuffle[int(records*test_fraction):]
trainX, trainY = word_vectors[train_split,:], to_categorical(Y.values[train_split], 2)
testX, testY = word_vectors[test_split,:], to_categorical(Y.values[test_split], 2)

## The network

In [18]:
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()
    
    #### Your code ####
    net = tflearn.input_data([None, 4858])                          # Input
    net = tflearn.fully_connected(net, 200, activation='ReLU')      # Hidden
    net = tflearn.fully_connected(net, 25, activation='ReLU') 
    net = tflearn.fully_connected(net, 2, activation='softmax')   # Output
    net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')
    model = tflearn.DNN(net)
    return model

## Intializing the model

In [19]:
model = build_model()

## Training the network

In [20]:
# Training
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=20)

Training Step: 3179  | total loss: [1m[32m0.21597[0m[0m | time: 1.832s
| SGD | epoch: 020 | loss: 0.21597 - acc: 0.9508 -- iter: 20224/20250
Training Step: 3180  | total loss: [1m[32m0.19808[0m[0m | time: 2.883s
| SGD | epoch: 020 | loss: 0.19808 - acc: 0.9558 | val_loss: 0.39120 - val_acc: 0.8680 -- iter: 20250/20250
--


## Testing
This shows the accuracy of our model.

In [21]:
predictions = (np.array(model.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.8664


## Test it out!

In [22]:
# Helper function that uses your model to predict sentiment
def test_sentence(sentence):
    positive_prob = model.predict(text_to_vector(sentence))[0][1]
    print('Sentence: {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob), 
          'Positive' if positive_prob > 0.5 else 'Negative')

In [23]:
sentence = "Moonlight is by far the best movie of 2016."
test_sentence(sentence)
text_to_vector(sentence)
sentence = "It's amazing anyone could be talented enough to make something this spectacularly awful"
test_sentence(sentence)

Sentence: Moonlight is by far the best movie of 2016.
P(positive) = 0.918 : Positive
Sentence: It's amazing anyone could be talented enough to make something this spectacularly awful
P(positive) = 0.021 : Negative


In [24]:
test_sentence("This is just a masterpiece")

Sentence: This is just a masterpiece
P(positive) = 0.965 : Positive
