In [None]:
import nltk # natural language processing toolkit; install with `pip install -U nltk`
# nltk.download() # uncomment and run once to fetch the NLTK data packages (nltk.pos_tag needs its tagger model)
import numpy as np
import re
import tensorflow as tf

In [None]:
# Load the whole corpus into memory as one string.
# speech.txt contains Mr. Trump's speeches; the relative path means it is
# looked up in the notebook's current working directory.
# NOTE(review): no encoding is passed to open(), so decoding depends on the
# interpreter/platform defaults -- confirm the file's encoding.
with open('speech.txt', 'r') as content_file:
    corpus_raw = content_file.read()

In [None]:
# Normalise case first so the vocabulary is case-insensitive.
corpus_raw = corpus_raw.lower()

# Punctuation rewrites, applied strictly in the order listed:
#   * sentence enders ('.', '?', ';') become a stand-alone ' . ' token,
#   * commas become a plain space,
#   * the possessive "'s" is dropped before any remaining apostrophe is
#     turned into a space (the order of these two matters).
for pattern, replacement in (
    ('.', ' . '),
    ('?', ' . '),
    (',', ' '),
    ('\'s', ''),
    ('\'', ' '),
    (';', ' . '),
):
    corpus_raw = corpus_raw.replace(pattern, replacement)

# Tokenise on whitespace; the stand-alone '.' markers are not words, so drop them.
words = [token for token in corpus_raw.split() if token != '.']

In [None]:
# Part-of-speech tag the token stream; the result is iterated below as
# (word, tag) pairs.
word_tag = nltk.pos_tag(words)
words_status = np.zeros(len(words))  # not read by any cell visible here; kept so the name stays defined

# Decide, per distinct word, whether it is kept (1) or dropped (0).
# A word is kept when its tag is a noun (NN / NNS) or adjective (JJ), it is
# longer than one character and it contains no digit.
# Because word2stat is keyed by the word itself, a word that occurs several
# times ends up with the decision made for its LAST occurrence.
has_digit = re.compile(r'\d')  # raw string: '\d' in a plain literal is an invalid escape sequence
kept_tags = ('NN', 'NNS', 'JJ')
word2stat = {}
for (word, stat) in word_tag:
    keep = stat in kept_tags and len(word) > 1 and not has_digit.search(word)
    word2stat[word] = 1 if keep else 0

# Filtered corpus, still in original reading order.
word_sequence = [word for word in words if word2stat[word] == 1]
words = set(word_sequence)

# indexing the words
# The vocabulary is enumerated in sorted order so that the word <-> id mapping
# is identical on every kernel run (iteration order of a set of strings is not
# stable across interpreter sessions).
vocab_size = len(words) # gives the total number of unique words
word2int = {word: i for i, word in enumerate(sorted(words))}
int2word = {i: word for word, i in word2int.items()}

# The filtered corpus expressed as vocabulary ids.
index_sequence = [word2int[word] for word in word_sequence]

In [None]:
# Get a list of the distinct items in a random-radius window around an index.
def get_target(words, idx, window_size=5):
    """Return the distinct neighbours of ``words[idx]``.

    A radius is drawn uniformly from 1..window_size (inclusive) on every
    call; the items up to that many positions to the left and to the right
    of ``idx`` are collected, with the item at ``idx`` itself skipped and
    the left edge clamped at the start of the sequence.

    Args:
        words: list of items (the two slices are joined with ``+``).
        idx: position of the centre item.
        window_size: largest radius that may be drawn.

    Returns:
        A list of the unique neighbouring items, in no particular order.
    """
    radius = np.random.randint(1, window_size + 1)
    left = max(idx - radius, 0)      # never reach before the first element
    right = idx + radius + 1         # exclusive end of the right-hand slice
    before = words[left:idx]
    after = words[idx + 1:right]
    return list(set(before + after))

# Create a generator of word batches as a tuple (inputs, targets)
def get_batches(words, batch_size, window_size=5):
    """Yield ``(inputs, targets)`` training pairs, one tuple per full batch.

    The sequence is cut into consecutive chunks of ``batch_size`` items; a
    trailing partial chunk is discarded. Inside a chunk, every position is
    paired with each neighbour returned by ``get_target`` for that chunk, so
    context windows never cross a chunk boundary.

    Args:
        words: sequence of items (word ids in this notebook).
        batch_size: number of consecutive items per chunk.
        window_size: forwarded to ``get_target`` as the maximum radius.

    Yields:
        Two equally long lists: the centre item repeated once per neighbour,
        and the matching neighbours.
    """
    usable = (len(words) // batch_size) * batch_size

    # only full batches
    trimmed = words[:usable]

    for offset in range(0, len(trimmed), batch_size):
        chunk = trimmed[offset:offset + batch_size]
        inputs, targets = [], []
        for pos, centre in enumerate(chunk):
            context = get_target(chunk, pos, window_size)
            targets.extend(context)
            inputs.extend([centre] * len(context))
        yield inputs, targets
    

In [None]:
# embedding probabilistic model
# Builds the TensorFlow graph: an embedding table looked up by word id,
# trained with a sampled-softmax loss and the Adam optimizer.
D = 100 # embedding dimension
# X: 1-D int32 placeholder of variable length (fed with the input word ids).
X = tf.placeholder(tf.int32,[None])
# Y: 2-D int32 placeholder; the training cell feeds it a column of target ids
# (np.array(y)[:, None]).
Y = tf.placeholder(tf.int32,[None,None])
# One D-dimensional row per vocabulary entry, initialised uniformly between -1 and 1.
embedding = tf.Variable(tf.random_uniform((vocab_size,D),-1,1)) # word embeddings
embed = tf.nn.embedding_lookup(embedding, X) # use tf.nn.embedding_lookup to get the hidden layer output
# solution a.
# Output-layer parameters consumed by the sampled softmax: one weight row and
# one bias per vocabulary entry.
softmax_w = tf.Variable(tf.truncated_normal((vocab_size, D)))
softmax_b = tf.Variable(tf.zeros(vocab_size))
batch_size = 300
# NOTE(review): num_sampled reuses batch_size (300), so the number of sampled
# classes is tied to the batch length -- confirm this coupling is intended and
# that vocab_size is large enough for it.
loss = tf.nn.sampled_softmax_loss(
        weights=softmax_w,
        biases=softmax_b,
        labels=Y,
        inputs=embed,
        num_sampled=batch_size,
        num_classes=vocab_size)
# Scalar objective: mean of the loss tensor returned above.
rloss = tf.reduce_mean(loss)
# Adam with its default hyper-parameters.
optimizer = tf.train.AdamOptimizer().minimize(rloss)
# A session is opened here and reused by the later cells; all variables are initialised once.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [None]:
# training
# Runs `epochs` passes over index_sequence and prints the mean batch loss of
# each pass.
T = 10 # not read by the code in this cell
epochs = 20
window_size = 7

# get_batches only yields full batches, so a corpus shorter than one batch
# would leave the batch counter at 0 and make the averaging below divide by
# zero. Fail early with an explicit message instead.
if len(index_sequence) < batch_size:
    raise ValueError(
        'index_sequence has {} entries, fewer than batch_size={}; '
        'no full batch can be formed'.format(len(index_sequence), batch_size))

for i in range(epochs):
    closs = 0  # running sum of the batch losses for this epoch
    num = 0    # number of batches processed in this epoch
    # A fresh generator per epoch; context windows are re-sampled each time.
    batches = get_batches(index_sequence, batch_size, window_size)
    for x, y in batches:
        # Y is a 2-D placeholder, so the flat target list becomes a column.
        feed = {X: x,
                Y: np.array(y)[:, None]}
        # solution b.
        train_loss, _ = sess.run([rloss, optimizer], feed_dict=feed)
        num += 1
        closs += train_loss
    closs = closs/float(num)
    print('epoch={},loss={}'.format(i,closs))

In [None]:
# scatter plot the words
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE # for dimensinality reduction
norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
normalized_embedding = embedding / norm
    
embed_mat = sess.run(normalized_embedding)
tsne = TSNE()
embed_tsne = tsne.fit_transform(embed_mat[500:1000, :]) # embedded words in 2D space

fig, ax = plt.subplots(figsize=(20, 20))
for idx in range(embed_tsne.shape[0]):
    plt.scatter(embed_tsne[idx, 0],embed_tsne[idx, 1], color='steelblue')
    plt.annotate(int2word[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)