In [1]:
import sys,random,math
from collections import Counter
import numpy as np

np.random.seed(1)
random.seed(1)
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x:(x.split(" ")),raw_reviews))
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1
vocab = list(set(map(lambda x:x[0],wordcnt.most_common())))

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)

In [2]:
alpha, iterations = (0.05, 2)
hidden_size,window,negative = (50,2,5)

weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size)*0

layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1

def similar(target='beautiful'):
    target_index = word2index[target]

    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

def sigmoid(x):
    return 1/(1 + np.exp(-x))
for rev_i,review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        
        # since it's really expensive to predict every vocabulary
        # we're only going to predict a random subset
        target_samples = [review[target_i]]+list(concatenated\
        [(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])

        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review),target_i+window)]

        layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])

        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta,layer_1)*alpha

    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
            *iterations)) + "   " + str(similar('terrible')))
    sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        *iterations)))
print(similar('terrible'))

Progress:0.99998 [('terrible', -0.0), ('horrible', -2.802722263619964), ('brilliant', -3.097945707650016), ('superb', -3.6827264451457014), ('phenomenal', -3.706623538152234), ('pathetic', -3.7172837123743574), ('mediocre', -3.7860535013316245), ('marvelous', -3.8628950695273767), ('masterful', -3.8884806707913624), ('terrific', -4.110073045362215)])]]])]]]][('terrible', -0.0), ('horrible', -2.787746439265329), ('brilliant', -3.34051980597866), ('pathetic', -3.6690408792709985), ('phenomenal', -3.811194547696435), ('mediocre', -3.8956882035152054), ('superb', -3.971174420579357), ('marvelous', -3.9795421352101563), ('masterful', -4.008705837987641), ('bad', -4.073897490860252)]


In [3]:
def analogy(positive=['terrible','good'],negative=['bad']):
    
    norms = np.sum(weights_0_1 * weights_0_1,axis=1)
    norms.resize(norms.shape[0],1)
    
    normed_weights = weights_0_1 * norms
    
    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
    
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)[1:]

In [4]:
analogy(['terrible', 'good'], ['bad'])

[('superb', -221.84212893874007),
 ('terrific', -222.21534759766487),
 ('fine', -222.35045262212336),
 ('decent', -222.39872641755608),
 ('worth', -222.5467360283943),
 ('brilliant', -222.65941405212465),
 ('terrible', -222.69257145226442),
 ('perfect', -222.83482287227974),
 ('nice', -222.91242394247368)]

In [5]:
analogy(['elizabeth','he'],['she'])

[('christopher', -185.1655633742027),
 ('it', -185.62984636734961),
 ('william', -185.6417836325676),
 ('this', -185.83561729020607),
 ('him', -185.86239527975837),
 ('tom', -185.97290706859852),
 ('simon', -185.97848886210375),
 ('david', -185.99281869221406),
 ('she', -186.0404058567483)]