# 1. [Part I: Build word representation using Skip-Gram model](#1)
# 2. [Part II: Use off-the-shelf Word2Vec package to train](#2)
# 3. [Part III: CNN/RNN - Sentiment Analysis](#3)
# 4. [Part](#4)

In [1]:
import collections
import math
import matplotlib as plt
import os
import random
from tempfile import gettempdir
import zipfile

import numpy as np
import urllib
import tensorflow as tf

<br/>
<br/>
<a id = '1'></a>
# Part I: Build word representation using Skip-Gram model

## Import data as list of string

In [2]:
url = 'http://mattmahoney.net/dc/'
def download(filename):
    local_filename = os.path.join(gettempdir(), filename)
    if not os.path.exists(local_filename):
        local_filename, _ = urllib.request.urlretrieve(url + filename,local_filename)
    return local_filename

In [3]:
filename = download('text8.zip')

## Read the data into a list of strings

In [4]:
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data
vocabulary = read_data(filename)
print('Data size', len(vocabulary))

Data size 17005207


In [5]:
vocabulary[:5]

['anarchism', 'originated', 'as', 'a', 'term']

## Build Count, Dictionary, etc.

In [6]:
vocabulary_size = 50000
def build_dataset(words, n_words):
    
    # count --> [['UNK',?], (word1, count1), ....]
    count = [['UNK', -1]] #replace rare words with UNK
    count.extend(collections.Counter(words).most_common(n_words - 1)) 
    
    # dictionary --> {'word1': index1}
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
        
    # data --> [index1, index2, ...]    
    data = []
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # 'UNK'
            unk_count += 1
        data.append(index)
    
    # count --> [['UNK',count_0], (word1, count1), ....]
    count[0][1] = unk_count
    
    # reversed_dictionary --> [{index1: word1}]
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [7]:
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)
del vocabulary

In [8]:
data[:3]

[5234, 3081, 12]

In [9]:
count[:3]

[['UNK', 418391], ('the', 1061396), ('of', 593677)]

In [10]:
dictionary['the']

1

In [11]:
reverse_dictionary[1]

'the'

## Prepare training batch

In [12]:
data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels


In [13]:
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)

In [14]:
print('Batch: ')
print(batch)
print('Labels: ')
print(labels)

Batch: 
[3081 3081   12   12    6    6  195  195]
Labels: 
[[  12]
 [5234]
 [3081]
 [   6]
 [  12]
 [ 195]
 [   6]
 [   2]]


In [15]:
print('Format: index-middle, word-middle, index-left, word-left\n')
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', 
          labels[i, 0], reverse_dictionary[labels[i, 0]])

Format: index-middle, word-middle, index-left, word-left

3081 originated -> 12 as
3081 originated -> 5234 anarchism
12 as -> 3081 originated
12 as -> 6 a
6 a -> 12 as
6 a -> 195 term
195 term -> 6 a
195 term -> 2 of


## Build the skip-gram model.

In [16]:
batch_size = 100
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
num_sampled = 64      # Number of negative examples to sample.

In [17]:
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [18]:
valid_examples

array([69, 29,  5, 64, 90, 42, 20,  1, 79, 49, 97, 35, 92, 50, 11, 27])

### Define Graph - 1, Embedding Mapping

In [19]:
# Input data.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [20]:
# initialize weight matrix between -1 and 1, size is v_size * embedding sizwe
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

In [21]:
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
embed # Note, shape = (batch_size, embedding_size), i.e., word vec for all inputs

<tf.Tensor 'embedding_lookup:0' shape=(100, 128) dtype=float32>

### Define Graph - 2, From hidden layer to output

In [22]:
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
loss = tf.reduce_mean( #mean over batch
      tf.nn.nce_loss(weights = nce_weights,
                     biases = nce_biases,
                     labels = train_labels,
                     inputs = embed,
                     num_sampled = num_sampled,
                     num_classes = vocabulary_size))

In [23]:
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

## After optimization, calculate valid set

In [24]:
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset) # find embedding vec for all valid in
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True) # calculate distance w/ all words

## Start Session

In [25]:
init = tf.global_variables_initializer()
num_steps = 10001

In [26]:
with tf.Session() as session:

    init.run()
    average_loss = 0
    
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0 and step > 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
            final_embeddings = normalized_embeddings.eval()

Average loss at step  0 :  290.665863037
Average loss at step  2000 :  113.576555559
Average loss at step  4000 :  52.9484763718
Average loss at step  6000 :  33.1720424192
Average loss at step  8000 :  23.7593583269
Average loss at step  10000 :  17.7926064858
Nearest to may: besides, gb, microsoft, carbonaceous, relays, orbital, ep, intelligent,
Nearest to or: and, ada, morocco, pursuit, of, a, victoriae, reginae,
Nearest to in: and, of, for, by, from, as, on, to,
Nearest to american: vs, pseudocode, cheese, twelve, crater, livejournal, mike, rand,
Nearest to use: ada, sherlock, reginae, amo, newsgroup, victoriae, fictional, recollection,
Nearest to but: and, otherwise, encampment, vs, alien, economic, predicted, yum,
Nearest to that: ufo, and, phi, this, defined, mosque, glamorous, gland,
Nearest to the: a, gland, his, vs, victoriae, one, coke, analogue,
Nearest to about: file, densities, phi, bckgr, automobile, analogue, psi, modal,
Nearest to had: and, senado, vs, lateral, is, int

<br/>
<br/>
<a id = '2'></a>
# Part II: Use off-the-shelf Word2Vec package to train

https://github.com/rouseguy/DeepLearning-NLP/blob/master/notebooks/2.%20word2vec.ipynb

In [27]:
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import re
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

## Import data

In [28]:
caesar_file = './data/juliuscaesar.txt'
stopword_file  = './data/long_stopwords.txt'

## Clean sentence

In [29]:
stop_wordsstop_wo  = []
with open(stopword_file,'r') as inpFile:
    lines = inpFile.readlines()
    stop_words_temp = map(lambda x : re.sub('\n','',x),lines)
    stop_words = list(map(lambda x:  re.sub('[^A-Za-z0-9]+', '',x), stop_words_temp))
stop_words[:5]

['a', 'able', 'about', 'above', 'abst']

In [30]:
def clean(word):
    word = word.strip()
    word = word.lower()
    word = re.sub('[^A-Za-z0-9]+', '', word)
    if word not in stop_words:
        return word
    else:
        return ''

In [31]:
print('Raw: ' + r"they'll" + ' --> Cleaned: ' + clean("they'll"))

Raw: they'll --> Cleaned: 


In [32]:
print('Raw: ' + r"King's" + ' --> Cleaned: ' + clean("King's"))

Raw: King's --> Cleaned: kings


## Extract sentence from text file

In [33]:
line_count = 0
sentences = []

            
with open(caesar_file,'r') as inpFile:
    x = inpFile.readlines()
    for line in x:
        if line is not None or line != '\n':
            words = line.split()
            words = map(lambda x: clean(x), words)
            words = list(filter(lambda x:True if len(x) > 0 else False, words))
            sentences.append(words)

In [34]:
sentences[107:110]

[['second', 'commoner', 'sir', 'wear', 'shoes'],
 [],
 ['work', 'sir', 'holiday']]

## Train a Word2Vec model

In [35]:
# reference: https://radimrehurek.com/gensim/models/word2vec.html
model = Word2Vec (sentences, 
                 window=5, 
                 size=500, 
                 workers=4, 
                 min_count=5)

In [36]:
# Example output:
print('The vector length is: ' + str(len(model.wv['second'])))
print('The vector for word "second" is : ')
print(model.wv['second'][:5])

The vector length is: 500
The vector for word "second" is : 
[ 0.00054155 -0.00027198 -0.00019997  0.00070842 -0.00021237]


In [37]:
model.wv.most_similar(positive=['rome'])

[('brutus', 0.43952152132987976),
 ('caesar', 0.3901950418949127),
 ('citizen', 0.38464778661727905),
 ('time', 0.36275291442871094),
 ('antony', 0.3611658215522766),
 ('thee', 0.3459341526031494),
 ('cassius', 0.3410149812698364),
 ('electronic', 0.3395448327064514),
 ('messala', 0.33680325746536255),
 ('good', 0.3312881886959076)]

<br/>
<br/>
<br/>
<a id = '3'></a>
# Part III: CNN/RNN - Sentiment Analysis

[https://github.com/rouseguy/DeepLearning-NLP/blob/master/notebooks/3.%20CNN%20-%20Text.ipynb](https://github.com/rouseguy/DeepLearning-NLP/blob/master/notebooks/3.%20CNN%20-%20Text.ipynb)

In [60]:
import numpy as np
from script import data_helpers
from script import w2v 
from script.w2v import train_word2vec
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, SpatialDropout1D, Convolution1D, MaxPooling1D, LSTM
from sklearn.cross_validation import train_test_split
np.random.seed(2)

## Train a Word2Vec model to get the embedding vectors

In [3]:
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data() # Note, x is padded with zero in the end
print("Loading finished...")
print('There is a total of ' + str(len(vocabulary)) + ' words in vocabulary')
print('The shape of X is: ' + str(x.shape)) # 10662 sequences, every sequence has 56 words
print('The shape of Y is: ' + str(y.shape)) # 10662 results with either [1,0] - positive or [0,1] - negative

Loading data...
Loading finished...
There is a total of 18779 words in vocabulary
The shape of X is: (10662, 56)
The shape of Y is: (10662, 2)


In [4]:
# Model Hyperparameters
sequence_length = 56
embedding_dim = 20          
num_filters = 150
filter_size = 3
dropout_prob = 0.25

# Training parameters
batch_size = 32
num_epochs = 2

# Word2Vec parameters, see train_word2vec
min_word_count = 1  # Minimum word count                        
context = 10        # Context window size

In [6]:
# train_word2vec
embedding_weights = train_word2vec(x, vocabulary_inv, embedding_dim, min_word_count, context)

Loading existing Word2Vec model '20features_1minwords_10context'


In [7]:
embedding_weights[0].shape

(18779, 20)

In [8]:
embedding_weights

[array([[-0.11376537, -0.13623959, -0.17433217, ...,  0.34611851,
         -0.19512145, -0.14178257],
        [ 0.0526082 , -0.07634247, -0.20783381, ...,  0.39987352,
          0.00308891, -0.22333454],
        [-0.01896649, -0.23291215, -0.18632506, ...,  0.21834175,
          0.04105491, -0.16411212],
        ..., 
        [ 0.02709346, -0.28432941, -0.29434878, ...,  0.40582779,
          0.07189913, -0.19080783],
        [ 0.12260104, -0.37218949, -0.11956801, ...,  0.16573152,
          0.01496829, -0.33405513],
        [ 0.02421027, -0.18543144, -0.29883066, ...,  0.07891279,
          0.08901211, -0.1006508 ]], dtype=float32)]

## Generate train/test set

In [10]:
data = np.append(x,y,axis = 1)
train, test = train_test_split(data, test_size = 0.15,random_state = 0)
X_test = test[:,:-2]
Y_test = test[:,-2:]
X_train = train[:,:-2]
Y_train = train[:,-2:]

In [11]:
X_train.shape

(9062, 56)

## Model training - 1, using pretrained embedding, and 1-d CNN

In [79]:
model = Sequential()
model.add(Embedding(input_dim = len(vocabulary), 
                    output_dim = embedding_dim, 
                    input_length = sequence_length,
                    weights = embedding_weights))

model.add(Convolution1D(filters = num_filters,
                         kernel_size = filter_size,
                         strides = 1,
                         padding = 'valid',
                         activation = 'relu'))

model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(dropout_prob))
model.add(Flatten())
model.add(Dense(2, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 56, 20)            375580    
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 54, 150)           9150      
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 27, 150)           0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 27, 150)           0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 4050)              0         
_________________________________________________________________
dense_15 (Dense)             (None, 2)                 8102      
Total params: 392,832
Trainable params: 392,832
Non-trainable params: 0
_________________________________________________________________


![image](https://cdn-images-1.medium.com/max/1200/1*h_L7fSoQhipTHFULgXmHyQ.png)

In [80]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.fit(X_train, Y_train, batch_size=batch_size, validation_data=(X_test, Y_test), epochs=5)

score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 9062 samples, validate on 1600 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.538783634305
Test accuracy: 0.744375


## Model prediction

In [32]:
preds = model.predict_classes(X_test)

In [45]:
for _ in range(5):
    sentence = '\n'
    for num in X_test[_]:
        word = vocabulary_inv[num]
        if word != '<PAD/>':
            sentence = sentence + word + ' '
    pred = preds[_]
    label = 0 if Y_test[_][0] == 1 else 1
    print(sentence + '--> ' + str(pred) + ' --> ' + str(label))


serry wants to blend politics and drama , an admirable ambition it 's too bad that the helping hand he uses to stir his ingredients is also a heavy one --> 0 --> 0

an amateurish , quasi improvised acting exercise shot on ugly digital video --> 0 --> 0

playing a role of almost bergmanesque intensity bisset is both convincing and radiant --> 1 --> 1

no big whoop , nothing new to see , zero thrills , too many flashbacks and a choppy ending make for a bad film --> 0 --> 0

please , someone , stop eric schaeffer before he makes another film --> 0 --> 0


## Model training - 2, LSTM

In [106]:
model = Sequential()
model.add(Embedding(input_dim = len(vocabulary), 
                    output_dim = embedding_dim, 
                    input_length = sequence_length,               
                    mask_zero = True, #https://keras.io/layers/embeddings/
                    weights = embedding_weights))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(units = 120, 
               dropout = 0.2,  
               recurrent_dropout = 0.2,
               return_sequences = False)) # (Batch size, time steps, units) - with return_sequences=True

model.add(Dense(2, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_32 (Embedding)     (None, 56, 20)            375580    
_________________________________________________________________
spatial_dropout1d_15 (Spatia (None, 56, 20)            0         
_________________________________________________________________
lstm_18 (LSTM)               (None, 120)               67680     
_________________________________________________________________
dense_27 (Dense)             (None, 2)                 242       
Total params: 443,502
Trainable params: 443,502
Non-trainable params: 0
_________________________________________________________________


![image](https://d3ansictanv2wj.cloudfront.net/SentimentAnalysis16-38b6f3cbb7bae622fe0ba114db188666.png)

In [107]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train, batch_size=batch_size, validation_data=(X_test, Y_test), epochs=5)

score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 9062 samples, validate on 1600 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.506822817326
Test accuracy: 0.754375
