In [1]:
import sys

with open('Source/reviews.txt') as r, open('Source/labels.txt') as l:
    raw_reviews = r.readlines()
    raw_labels = l.readlines()
    
 
tokens = list(map(lambda x: set(x.split(" ")), raw_reviews))

vocab_raw = list(set([word for token in tokens for word in token if len(word) > 0 and word[0].isalpha()]))
vocab = list(set([''.join([w for w in word if w.isalpha()]) for word in vocab_raw]))

word2index = {w:i for i, w in enumerate(vocab)}
input_dataset = [list(set([word2index[word] for word in token if word in word2index])) for token in tokens]
    
target_dataset = [int(label == 'positive\n') for label in raw_labels]

In [2]:
import numpy as np
np.random.seed(1)

sigmoid = lambda x: 1 / (1 + np.exp(-x))
sigder = lambda x: x * (1 - x)
alpha, iterations = 0.01, 2
hidden_size = 100

weights_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1

correct, total = 0, 0
for iter in range(iterations):
    for i in range(len(input_dataset) - 1000):
        x, y = input_dataset[i], target_dataset[i]
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(layer_1 @ weights_1_2)

        layer_2_delta = (np.abs(layer_2)- y)
        layer_1_delta = (layer_2_delta @ weights_1_2.T)

        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha

        if np.abs(layer_2 - y) < 0.5:
            correct += 1
        total += 1

        if i % 10 == 9:
            progres = str(i / (len(input_dataset)) * 100)
            sys.stdout.write('\rIter: ' + str(iter) + \
                             ' progress: ' + str(progres)[:5] + \
                             ' accurancy: ' + str(correct / total))

Iter: 1 progress: 95.99 accurancy: 0.86637583100645977

In [13]:
from collections import Counter
import math


def test(inp):
    inp = ''.join([i for i in inp if i.isalpha() or i == ' '])
    x = list(set([word2index[word] for word in inp.split() if word in vocab]))
    layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
    layer_2 = sigmoid(layer_1 @ weights_1_2)
    return layer_2


def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))

    return scores.most_common(10)

def analogy(positive=['terrible', 'good'], negative=['bad']):
    norms = np.sum(weights_0_1 * weights_0_1, axis=1)
    norms.resize(norms.shape[0], 1)

    normed_weights = weights_0_1 * norms
    # normed_weights = weights_0_1**2

    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]

    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        scores[word] = -math.sqrt(sum(raw_difference * raw_difference))
    return scores.most_common(10)
    

In [19]:
analogy(['bad'], ['amazing'])

[('worst', -4.4524872396962945),
 ('waste', -4.847852489286),
 ('awful', -4.960861199856841),
 ('poorly', -5.291051918021452),
 ('disappointment', -5.616241126567832),
 ('disappointing', -5.617300091056642),
 ('dull', -5.696591326973348),
 ('boring', -5.706431864718387),
 ('annoying', -5.710001674331351),
 ('terrible', -5.754966477012552)]

In [16]:
test('fallacies')

array([0.55161954])

In [None]:
import sympy as sp
from sympy.abc import x


f = 1 / (1 - x**3)
f.diff()