In [1]:
import numpy as np
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
import re
from nltk.corpus import stopwords

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
stop_words = set(stopwords.words('english'))

In [4]:
def tokenize(corpus):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    corpus_tokenized = tokenizer.texts_to_sequences(corpus)
    # print(tokenizer.sequences_to_text(corpus_tokenized))
    # print(tokenizer.word_index)
    V = len(tokenizer.word_index)
    return tokenizer.word_index, corpus_tokenized, V


def initialize(V, N):
    np.random.seed(100)
    W1 = np.random.rand(V, N)
    W2 = np.random.rand(N, V)

    return W1, W2


def to_categorical(y, num_classes=None):
    y = np.array(y, dtype='int')
    input_shape = y.shape
    if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
        input_shape = tuple(input_shape[:-1])
    y = y.ravel()
    if not num_classes:
        num_classes = np.max(y) + 1
    n = y.shape[0]
    categorical = np.zeros((n, num_classes))
    categorical[np.arange(n), y] = 1
    output_shape = input_shape + (num_classes,)
    categorical = np.reshape(categorical, output_shape)
    # print(categorical)
    return categorical


def corpus2ContextnCenter(corpus_tokenized, V, ws):
    for words in corpus_tokenized:
        L = len(words)
        # print(L)
        for index, word in enumerate(words):
            contexts = []
            center = []
            for i in range(index - ws, index + ws + 1):
              if 0 <= i < L and i != index:
                contexts = contexts + [words[i]-1]
            center.append(word-1)
            # x has shape c x V where c is size of contexts
            x = to_categorical(contexts, V)
            # y has shape k x V where k is number of center words
            y = to_categorical(center, V)
            yield (x, y)


def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [16]:
settings = {
    'ws': 2,
    'n': 10,
    'epochs': 2,
    'lr': 0.01
}

In [6]:
class Word2Vec:
  
    def __init__(self, corpus=''):
        self.window = settings['ws']
        self.N = settings['n']
        self.corpus = corpus
        self.eta = settings['lr']

    
    def skipgram(self, context, x, W1, W2, loss):
        h = np.matmul(W1.T, x.T)
        u = np.dot(W2.T, h)
        y_pred = softmax(u)

        e = np.outer(y_pred,np.array([1]*context.shape[0]))-context.T

        dW2 = np.outer(h, np.sum(e, axis=1))
        dW1 = np.outer(x, np.dot(W2, np.sum(e, axis=1)))

        new_W1 = W1 - settings['lr'] * dW1
        new_W2 = W2 - settings['lr'] * dW2

        loss += - np.sum([u[label.T == 1] for label in context]) + len(context) * np.log(np.sum(np.exp(u)))

        return new_W1, new_W2, loss


    def run(self, corpus_tokenized, V):
        # corpus_tokenized, V = tokenize(self.corpus)
        # print(corpus_tokenized)
        # print(V)
        W1, W2 = initialize(V, self.N)

        loss_vs_epoch = []
        for e in range(settings['epochs']):
            loss = 0.
            for context, center in corpus2ContextnCenter(corpus_tokenized, V, self.window):
                W1, W2, loss = self.skipgram(context, center, W1, W2, loss)
            loss_vs_epoch.append(loss)

        return W1, W2

In [7]:
def predict(x, W1, W2):
        h = np.mean([np.matmul(W1.T, xx) for xx in x], axis=0)
        u = np.dot(W2.T, h)
        return softmax(u)

def cosine_distance(word, V, W1, myVocab):
  # print(word)
  # word = float(word)
  cosine_d = {}
  for i in range(V):
    vocab_sc = W1[i]
    # vocab_sc = vocab_sc.astype(float)
    # word = word.astype(float)
    a = np.dot(word, vocab_sc)
    b = np.linalg.norm(word) * np.linalg.norm(vocab_sc)
    # word = map(lambda x: float(x), word)
    # vocab_sc = map(lambda x: float(x), vocab_sc)
    theta = a/b
    # print(V)
    # print(theta)
    key_word = ''
    for key, value in myVocab.items():
      if i == value:
        key_word = key
        break
    
    cosine_d[key_word] = theta

    # print(cosine_d)
  sorted_list = sorted(cosine_d.items(), key=lambda kv: kv[1], reverse=True)
  for key, sim in sorted_list[1:11]:
    print("{} : {}".format(key, sim))


In [20]:
def main():
    # corpus = "We got this GPS for my husband who is an (OTR) over the road trucker.  Very Impressed with the shipping time, it arrived a few days earlier than expected...  within a week of use however it started freezing up... could of just been a glitch in that unit.  Worked great when it worked!  Will work great for the normal person as well but does have the "trucker" option. (the big truck routes - tells you when a scale is coming up ect...)  Love the bigger screen, the ease of use, the ease of putting addresses into memory.  Nothing really bad to say about the unit with the exception of it freezing which is probably one in a million and that's just my luck.  I contacted the seller and within minutes of my email I received a email back with instructions for an exchange! VERY impressed all the way around!I'm a professional OTR truck driver, and I bought a TND 700 at a truck stop hoping to make my life easier.  Rand McNally, are you listening?First thing I did after charging it was connect it to my laptop and install the software and then attempt to update it.  The software detected a problem with my update and wanted my home address so I could be sent a patch on an SD card.  Hello?  I don't think I'm all that unusual; my home address is a PO box that a friend checks weekly and that I might get to check every six months or so.  I live in my truck and at truck stops.  If you need to make a patch available on an SD card then you should send the SD cards to the truck stops where the devices are sold.  I ran the update program multiple times until the program said that the TND 700 was completely updated.I programmed in the height (13'6"), the length (53') and the weight (80,000#) of my rig and told it that I preferred highways.  I was parked at a truck stop in the Cincinnati OH area.  My next pickup was about 15 miles down the same freeway but on the other side of it a couple of blocks.  My cell phone GPS (Sprint) said to get on the freeway to get to my pickup.  The TND 700 routed me thru 23 miles of residential streets before finally getting me to my pickup.  Very exciting, especially since every time I refused to turn down a street posted "No Trucks" the TND 700 took almost 5 minutes to figure a re-route, and it happened multiple times on that short trip.I decided to give it another chance.  After my pickup on the north side of Cincinnati just off of I-75 I needed to head to Phoenix AZ via I-71.  Easy route is to just hop on I-75 and drive west and south to the intersection of I-71.  Indeed, that is what my cell phone advised.  The TND 700, however, wanted to route me over surface streets across the city and pick up I-75 on the other side of the city.  I turned it off and the next time I passed a truck stop of the same chain I purchased it at I returned it and got my money back.I then spent $30 on a cheap printer.  Now I take a minute to set up my route on Google and print it out.  Hasn't gotten me lost yet over several cross country trips."
    import json
    # from google.colab import drive
    # drive.mount('/content/drive')

    fileIn = open('/content/drive/My Drive/Dataset/reviews_Electronics_5.json')
    fileOut1 = open('/content/drive/My Drive/Dataset/weights1skipgram.txt', 'w+')
    fileOut2 = open('/content/drive/My Drive/Dataset/weights2skipgram.txt', 'w+')
    lines = fileIn.readlines()

    corpus = ''
    numWords = 0
    for line in lines:
      if numWords < 50000:
        data = json.loads(line)
        for val in data['reviewText'].split('.'):
          sent = re.findall("[A-Za-z]+", val)
          line = ''
          for words in sent:
            if len(words) > 1 and words not in stop_words:
              line += ' ' + words
              numWords += 1
          corpus += line
      # # corpus = corpus+data['reviewText']

    # # print(corpus)
    # corpus = "I like playing football with my friends"
    
    w2v = Word2Vec(corpus=corpus)

    # print(w2v.corpus)

    myVocab, corpus_tokenized, V = tokenize([corpus])
    # print(corpus_tokenized)
    # print(V)
    # W1, W2 = initialize(V, settings['n'])
    vocab_words = list(myVocab.keys())

    W1, W2 = w2v.run(corpus_tokenized, V)

    fileOut1.write(str(W1))
    fileOut2.write(str(W2))

    # print(W1)
    # print(W2)
    # print(myVocab)
    word = "camera"
    # print(myVocab[word])
    index = myVocab[word]
    x = np.zeros(V, dtype='int')
    x[index-1] = 1
    word_vector = W1[index-1]
    # print(word_vector)
    y_pred = predict([x], W1, W2)
    print(y_pred)

    cosine_distance(word_vector, V, W1, myVocab)

    word = "great"
    # print(myVocab[word])
    index = myVocab[word]
    x = np.zeros(V, dtype='int')
    x[index-1] = 1
    word_vector = W1[index-1]
    # print(word_vector)
    y_pred = predict([x], W1, W2)
    print(y_pred)

    word = "trucker"
    # print(myVocab[word])
    index = myVocab[word]
    x = np.zeros(V, dtype='int')
    x[index-1] = 1
    word_vector = W1[index-1]
    # print(word_vector)
    y_pred = predict([x], W1, W2)
    print(y_pred)

    word = "working"
    # print(myVocab[word])
    index = myVocab[word]
    x = np.zeros(V, dtype='int')
    x[index-1] = 1
    word_vector = W1[index-1]
    # print(word_vector)
    y_pred = predict([x], W1, W2)
    print(y_pred)

    word = "tripod"
    # print(myVocab[word])
    index = myVocab[word]
    x = np.zeros(V, dtype='int')
    x[index-1] = 1
    word_vector = W1[index-1]
    # print(word_vector)
    y_pred = predict([x], W1, W2)
    print(y_pred)
    

    # print(y_pred)

    # print(loss_vs_epoch)

In [18]:
main()

[1.58324590e-02 5.48189239e-03 1.55487489e-02 ... 4.72645341e-05
 4.48262438e-05 6.45275033e-05]
even : 0.9948375000603757
ed : 0.9855832933897262
player : 0.9847006231262169
volume : 0.9822266652058244
core : 0.9817757948541198
quirks : 0.9817535035112914
end : 0.9812382908058027
flipping : 0.9793805854919142
life : 0.9762058863638781
brave : 0.9755448026073842
[4.84651551e-02 1.28742394e-02 2.52897338e-02 ... 4.75477267e-05
 2.72553561e-05 3.35519070e-05]
[2.13710274e-02 5.88833156e-03 1.90667475e-02 ... 4.75755026e-05
 5.82023025e-05 2.84846419e-05]
[1.93926965e-02 7.77952580e-03 9.61556897e-03 ... 5.15534149e-05
 5.44087681e-05 7.67880575e-05]


KeyError: ignored