In [1]:
import sys, random, math
from collections import Counter
import numpy as np
from nltk.corpus.reader import reviews

np.random.seed(2)
random.seed(2)
with open('Source/reviews.txt') as f:
    raw_reviews = f.readlines()
    
tokens = list(map(lambda x: x.split(), raw_reviews))
wordcnt = Counter()
for token in tokens:
    for word in token:
        wordcnt[word] -= 1
        
vocab = list(set(map(lambda x: x[0], wordcnt.most_common())))

word2index = {word: i for i, word in enumerate(vocab)}
concatenated = np.array([word2index[word] for token in tokens for word in token if word in word2index])

input_dataset = []
for token in tokens:
    sent = []
    for word in token:
        sent.append(word2index[word]) if word in word2index else input_dataset
        
    input_dataset.append(sent)


In [30]:
def similar(target='beautiful'):
    target_index = word2index[target]

    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

sigmoid = lambda x: 1 / (1 + np.exp(-x))

In [3]:
random.shuffle(input_dataset)
alpha, iterations = 0.05, 2
hidden_size, window, negative = 50, 2, 5

weights_0_1 = np.random.randn(len(vocab), hidden_size) * 0.2 - 0.1
weights_1_2 = np.zeros((len(vocab), hidden_size))

layer_2_target = np.zeros(negative + 1)
layer_2_target[0] = 1

for rev_i, review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        target_samples = [review[target_i]] + list(concatenated\
            [(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])

        left_context = review[max(0, target_i-window):target_i]
        right_context = review[target_i+1:min(len(review), target_i+window)]

        layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)
        layer_2 = sigmoid(layer_1 @ weights_1_2[target_samples].T)
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta @ weights_1_2[target_samples]
        
        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta, layer_1) * alpha

    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
                                                       *iterations)) + "   " + str(similar('the')))
        
    sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
                                                   *iterations)))

Progress:0.99998 [('the', -0.0), ('tier', -5.299581307368722), ('angelopoulos', -5.363384988570741), ('esquire', -5.37014584580258), ('our', -5.53100806836489), ('balding', -5.6786569415731885), ('malfunctions', -5.691637090616645), ('splashy', -5.703941309521132), ('yay', -5.748181578270544), ('sized', -5.7571611261994)]6782)]73)]859145)]7)]

In [None]:
norms = np.sum(weights_0_1 * weights_0_1, axis=1)
norms.resize(norms.shape[0], 1)

normed_weights = weights_0_1 * norms

def analogy(positive=['terrible', 'good'], negative=['bad']):
    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
        
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        scores[word] = -math.sqrt(sum(raw_difference * raw_difference))
    return scores.most_common(10)

def make_sent_vect(words):
    indices = list(map(lambda x: word2index[x], filter(lambda x: x in vocab, words)))
    return np.mean(normed_weights[indices], axis=0)

# reviews2vectors = np.array([make_sent_vect(x) for x in tokens])

def most_similar_reviews(review):
    vec = make_sent_vect(review)
    scores = Counter()
    for i, val in enumerate(reviews2vectors @ vec):
        scores[i] = val
    return [raw_reviews[i][:40] + ' ' + score for i, score in scores.most_common(3)]

In [None]:
reviews2vectors

20630