# 1. Sentiment analysis

Using the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), we want to do a regression model that predict the ratings are on a 1-10 scale. You have an example train and test set in the `dataset` folder.

### 1.1 Regression Model

Use a feedforward neural network and NLP techniques we've seen up to now to train the best model you can on this dataset

### 1.2 RNN model

Train a RNN to do the sentiment analysis regression. The RNN should consist simply of an embedding layer (to make word IDs into word vectors) a recurrent blocks (GRU or LSTM) feeding into an output layer.

In [37]:
import pandas as pd
import spacy
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.metrics import r2_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, GRU
from tensorflow.keras import optimizers
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
import random

In [2]:
# 1.1 Regression Model

train = pd.read_csv('dataset/example_train_imdb_reviews.csv')
test = pd.read_csv('dataset/example_test_imdb_reviews.csv')

In [3]:
# https://spacy.io/models/en
#!python -m spacy download en_core_web_sm

In [4]:
nlp = spacy.load('en_core_web_sm')

In [14]:
#Tokenize train and test
def tokenize(text, encoder):
    '''Split texts into lists of words (tokens)'''
    tokens = [[word.lower_ for word in encoder(words)] for words in text]
    return tokens

train['Tokenized Review'] = tokenize(train['Review'], nlp)
test['Tokenized Review'] = tokenize(test['Review'], nlp)

In [6]:
#Taken from lecture

def make_lexicon(token_seqs, min_freq=1, use_padding=False):
    # First, count how often each word appears in the text.
    token_counts = {}
    for seq in token_seqs:
        for token in seq:
            if token in token_counts:
                token_counts[token] += 1
            else:
                token_counts[token] = 1
    # Then, assign each word to a numerical index. 
    # Filter words that occur less than min_freq times.
    lexicon = [token for token, count in token_counts.items() if count >= min_freq]
    # Indices start at 2. 0 is reserved for padding, and 1 for unknown words.
    lexicon = {token:idx + 2 for idx,token in enumerate(lexicon)}
    lexicon[u'<UNK>'] = 1 # UNK are those that occur < min_freq
    lexicon_size = len(lexicon)

    print("LEXICON SAMPLE ({} total items):".format(len(lexicon)))
    print(dict(list(lexicon.items())[:20]))
    
    return lexicon

In [7]:
lexicon = make_lexicon(token_seqs=train['Tokenized Review'], min_freq=1)

LEXICON SAMPLE (2629 total items):
{'this': 2, 'movie': 3, 'only': 4, 'gets': 5, 'a': 6, 'second': 7, 'star': 8, 'because': 9, 'i': 10, 'work': 11, 'downtown': 12, 'and': 13, 'liked': 14, 'seeing': 15, 'it': 16, 'destroyed': 17, '.': 18, 'the': 19, 'effects': 20, 'were': 21}


In [17]:
#From class
def tokens_to_idxs(token_seqs, lexicon):
    idx_seqs = [[lexicon[token] if token in lexicon else lexicon['<UNK>'] for token in token_seq] for token_seq in token_seqs]
    return idx_seqs

train['idxs'] = tokens_to_idxs(train['Tokenized Review'], lexicon)
test['idxs'] = tokens_to_idxs(test['Tokenized Review'], lexicon)

train

Unnamed: 0,Rating,Review,Tokenized Review,idxs
0,2,this movie only gets a second star because i w...,"[this, movie, only, gets, a, second, star, bec...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
1,8,"As I watched this movie, and I began to see it...","[as, i, watched, this, movie, ,, and, i, began...","[112, 10, 113, 2, 3, 51, 13, 10, 114, 74, 115,..."
2,4,this seemed an odd combination of Withnail and...,"[this, seemed, an, odd, combination, of, withn...","[2, 169, 124, 170, 171, 39, 172, 13, 10, 173, ..."
3,9,When I saw the Exterminators of year 3000 at f...,"[when, i, saw, the, exterminators, of, year, 3...","[127, 10, 200, 19, 201, 39, 202, 203, 166, 204..."
4,9,"This is a very entertaining flick, considering...","[this, is, a, very, entertaining, flick, ,, co...","[2, 77, 6, 137, 263, 266, 51, 267, 19, 268, 13..."
...,...,...,...,...
95,2,Oh my. I decided to go out to the cinemas with...,"[oh, my, ., i, decided, to, go, out, to, the, ...","[821, 295, 18, 10, 914, 74, 1194, 335, 74, 19,..."
96,7,It appears even the director doesn't like this...,"[it, appears, even, the, director, does, n't, ...","[16, 2566, 618, 19, 706, 498, 44, 385, 2, 29, ..."
97,9,The thing I remember most about this film is t...,"[the, thing, i, remember, most, about, this, f...","[19, 596, 10, 650, 26, 48, 2, 29, 77, 55, 16, ..."
98,7,I recently saw I.Q. and even though I'm not a ...,"[i, recently, saw, i.q., and, even, though, i,...","[10, 655, 200, 2599, 13, 618, 186, 10, 744, 69..."


In [9]:
def idx_seqs_to_bows(idx_seqs, matrix_length):
    bow_seqs = np.array([np.bincount(np.array(idx_seq), minlength=matrix_length) 
                            for idx_seq in idx_seqs])
    return bow_seqs

In [19]:
bow_train = idx_seqs_to_bows(train['idxs'], matrix_length=len(lexicon) + 1) # +1 for padding length
bow_test = idx_seqs_to_bows(test['idxs'], matrix_length=len(lexicon) + 1)
bow_train

array([[0, 0, 4, ..., 0, 0, 0],
       [0, 0, 4, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 4, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 4, ..., 1, 1, 1]], dtype=int64)

In [11]:
def FFNN_model(n_input_nodes, n_hidden_nodes):
    input_layer = Input(shape=(n_input_nodes,))
    hidden_layer = Dense(units=n_hidden_nodes, activation='sigmoid')(input_layer)
    output_layer = Dense(units=1)(hidden_layer)
    
    #Specify which layers are input and output, compile model with loss and optimization functions
    model = Model(inputs=[input_layer], outputs=output_layer)
    model.compile(loss="mean_squared_error", optimizer='adam')
    
    return model

reg = FFNN_model(n_input_nodes=len(lexicon) + 1, n_hidden_nodes=500)

In [13]:
reg.fit(x=bow_train, y=train['Rating'], batch_size=20, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x22d15e42130>

In [22]:
test['pred'] = np.round(reg.predict(bow_test)[:,0]).astype(int) #Round predictions to nearest integer
r2 = r2_score(y_true=test['Rating'], y_pred=test['pred'])
r2 #so bad......

-0.08919278712332068

In [24]:
#1.2 RNN model
#From class
def pad_idx_seqs(idx_seqs):
    max_seq_len = max([len(idx_seq) for idx_seq in idx_seqs]) 
    padded_idxs = pad_sequences(sequences=idx_seqs, maxlen=max_seq_len) 
    return padded_idxs

train_padded_idxs = pad_idx_seqs(train['idxs'])

In [28]:
def model_RNN(n_input_nodes, n_embedding_nodes, n_hidden_nodes):
    input_layer = Input(shape=(None,))
    embedding_layer = Embedding(input_dim=n_input_nodes,
                                output_dim=n_embedding_nodes,
                                mask_zero=True)(input_layer) 
    
    gru_layer = GRU(units=n_hidden_nodes)(embedding_layer)
    output_layer = Dense(units=1)(gru_layer)

    model = Model(inputs=[input_layer], outputs=output_layer)
    model.compile(loss="mean_squared_error", optimizer='adam')
    
    return model

In [31]:
rnn_model = model_RNN(n_input_nodes=len(lexicon) + 1, n_embedding_nodes=300, n_hidden_nodes=500)

In [34]:
# Train the model
rnn_model.fit(x=train_padded_idxs, y=train['Rating'], batch_size=20, epochs=5)

# Put test reviews in padded matrix
test['Review_Idxs'] = tokens_to_idxs(token_seqs=test['Tokenized Review'],
                                             lexicon=lexicon)
test_padded_idxs = pad_idx_seqs(test['idxs'])

test['RNN_Pred'] = np.round(rnn_model.predict(test_padded_idxs)[:,0]).astype(int)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
# Evaluate the model with R^2
r2 = r2_score(y_true=test['Rating'], y_pred=test['RNN_Pred'])
print(r2)

-0.36940392295590163


# 2. (evil) XOR Problem

Train an LSTM to solve the XOR problem: that is, given a sequence of bits, determine its parity. The LSTM should consume the sequence, one bit at a time, and then output the correct answer at the sequence’s end. Test the two approaches below:

### 2.1 

Generate a dataset of random <=100,000 binary strings of equal length <= 50. Train the LSTM; what is the maximum length you can train up to with precisison?
    

### 2.2

Generate a dataset of random <=200,000 binary strings, where the length of each string is independently and randomly chosen between 1 and 50. Train the LSTM. Does it succeed? What explains the difference?


In [None]:
# https://github.com/mitchellvitez/lstm-xor/blob/master/lstm_xor.py

In [38]:
# 2.1
SEQ_LEN = 50
COUNT = 100000
bin_pair = lambda x: [x, not(x)]
training = np.array([[bin_pair(random.choice([0, 1])) for _ in range(SEQ_LEN)] for _ in range(COUNT)])
target = np.array([[bin_pair(x) for x in np.cumsum(example[:,0]) % 2] for example in training])
print('shape check:', training.shape, '=', target.shape)

shape check: (100000, 50, 2) = (100000, 50, 2)


In [39]:
model = Sequential()
model.add(Input(shape=(SEQ_LEN, 2), dtype='float32'))
model.add(LSTM(1, return_sequences=True))
model.add(Dense(2, activation='softmax'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(training, target, epochs=10, batch_size=128)
model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50, 1)             16        
_________________________________________________________________
dense_3 (Dense)              (None, 50, 2)             4         
Total params: 20
Trainable params: 20
Non-trainable params: 0
_________________________________________________________________


In [40]:
predictions = model.predict(training)
i = random.randint(0, COUNT)
chance = predictions[i,-1,0]

In [41]:
print('randomly selected sequence:', training[i,:,0])
print('prediction:', int(chance > 0.5))
print('confidence: {:0.2f}%'.format((chance if chance > 0.5 else 1 - chance) * 100))
print('actual:', np.sum(training[i,:,0]) % 2)

randomly selected sequence: [0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0 1 1 0
 0 0 1 0 0 0 1 1 0 0 1 0 0]
prediction: 0
confidence: 50.20%
actual: 1


In [42]:
#2.2
SEQ_LEN = 50
COUNT = 100000
bin_pair = lambda x: [x, not(x)]
training = np.array([[bin_pair(random.choice([0, 1])) for _ in range(SEQ_LEN)] for _ in range(COUNT)])
target = np.array([[bin_pair(x) for x in np.cumsum(example[:,0]) % 2] for example in training])
print('shape check:', training.shape, '=', target.shape)

shape check: (100000, 50, 2) = (100000, 50, 2)


In [43]:
model = Sequential()
model.add(Input(shape=(SEQ_LEN, 2), dtype='float32'))
model.add(LSTM(1, return_sequences=True))
model.add(Dense(2, activation='softmax'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(training, target, epochs=10, batch_size=128)
model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 50, 1)             16        
_________________________________________________________________
dense_4 (Dense)              (None, 50, 2)             4         
Total params: 20
Trainable params: 20
Non-trainable params: 0
_________________________________________________________________


In [44]:
predictions = model.predict(training)
i = random.randint(0, COUNT)
chance = predictions[i,-1,0]

In [45]:
print('randomly selected sequence:', training[i,:,0])
print('prediction:', int(chance > 0.5))
print('confidence: {:0.2f}%'.format((chance if chance > 0.5 else 1 - chance) * 100))
print('actual:', np.sum(training[i,:,0]) % 2)

randomly selected sequence: [1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 1 0 0 1 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
prediction: 1
confidence: 99.48%
actual: 1
