# Basic word2vec

In [2]:
import collections
import math
import os
import random
import zipfile

In [11]:
import numpy as np
from six.moves import urllib
import tensorflow as tf
import requests as req

In [4]:
# Base address; file names are appended to it to form the download URL.
url = "http://mattmahoney.net/dc/"

In [5]:
def download_data(filename, expected_bytes):
    """Make sure ``filename`` exists locally and has the expected size.

    When the file is not on disk it is fetched from the module-level ``url``
    with ``filename`` appended.

    Args:
        filename: Local path of the file; also the suffix appended to ``url``
            when a download is needed.
        expected_bytes: Size in bytes the local file must have.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    already_present = os.path.exists(filename)
    if not already_present:
        # urlretrieve returns (local_path, headers); only the path is needed.
        filename, _ = urllib.request.urlretrieve(url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Print the size actually found to help diagnose a bad download.
        print(actual_bytes)
        raise Exception('Failed to verify ' + filename + '. You can get with browser')

    print("Found and verified")
    return filename

In [6]:
# Name of the archive; used both as the URL suffix and as the local file name.
filename = 'text8.zip'
# Disabled urlretrieve-based download, kept for reference.
#filename, _ = urllib.request.urlretrieve(url+filename, filename)

In [12]:
# Download the archive into memory.
r = req.get(url + filename)
# Fail loudly on an HTTP error (404, 500, ...) instead of letting an error
# page be saved to disk as if it were the archive.
r.raise_for_status()

In [16]:
# Save the raw response body locally; binary mode so the bytes are written
# unchanged.
with open(filename, "wb") as f:
    f.write(r.content)

In [17]:
# Show the saved file's metadata; st_size is the number of bytes on disk.
print(os.stat(filename))

os.stat_result(st_mode=33204, st_ino=657974, st_dev=2054, st_nlink=1, st_uid=1000, st_gid=1000, st_size=31344016, st_atime=1508346887, st_mtime=1508347534, st_ctime=1508347534)


In [18]:
def _build_vocab(filename):
    """Build a word -> integer id mapping from the words in ``filename``.

    Words are ranked by descending frequency, with ties broken by ascending
    word order; a word's id is its rank, so the most frequent word gets id 0.

    Args:
        filename: Path handed to ``_read_words``, which supplies the words.

    Returns:
        A dict mapping each distinct word to its id. The dict is empty when
        ``_read_words`` yields no words.
    """
    data = _read_words(filename)

    counter = collections.Counter(data)
    # Negated count sorts the most frequent first; the word itself breaks ties.
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # Enumerate the ranked pairs directly instead of unpacking
    # zip(*count_pairs), which raises ValueError when there are no words.
    return {word: index for index, (word, _) in enumerate(count_pairs)}
    

In [66]:
# Toy walk-through of frequency-ranked vocabulary building.
counter = collections.Counter(["the", "that", "man", "woman", "man", "the"])
print(counter)
print(counter.items())
# Rank by descending count; equal counts fall back to alphabetical order.
count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))
# Transpose [(word, count), ...] into a tuple of words and a tuple of counts.
words, _ = tuple(zip(*count_pairs))
# A word's id is its position in the ranking.
word_to_id = {word: rank for rank, word in enumerate(words)}
print(words)
word_to_id



Counter({'the': 2, 'man': 2, 'woman': 1, 'that': 1})
dict_items([('the', 2), ('woman', 1), ('man', 2), ('that', 1)])
('man', 'the', 'that', 'woman')


{'man': 0, 'that': 2, 'the': 1, 'woman': 3}

In [67]:
def _file_to_word_ids(filename, word_to_id):
    """Translate the words of ``filename`` into their integer ids.

    Args:
        filename: Path handed to ``_read_words``, which supplies the words.
        word_to_id: Mapping from word to id.

    Returns:
        A list of ids in the order the words were read. Words that are not
        keys of ``word_to_id`` are skipped.
    """
    ids = []
    for token in _read_words(filename):
        # Out-of-vocabulary words are dropped rather than raising KeyError.
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids

In [68]:
def ptb_raw_data(data_path=None):
    """Load the PTB train/valid/test files from ``data_path`` as word-id lists.

    The vocabulary is built from the training file only and then used to
    convert all three files.

    Args:
        data_path: Directory containing ``ptb.train.txt``, ``ptb.valid.txt``
            and ``ptb.test.txt``. The default of ``None`` is not usable:
            ``os.path.join`` rejects it with a ``TypeError``.

    Returns:
        A tuple ``(train_data, valid_data, test_data, vocabulary)`` where the
        first three are lists of word ids and ``vocabulary`` is the number of
        distinct words in the training file.
    """
    split_files = ("ptb.train.txt", "ptb.valid.txt", "ptb.test.txt")
    split_paths = [os.path.join(data_path, name) for name in split_files]

    # Ids come from the training split alone.
    word_to_id = _build_vocab(split_paths[0])

    train_data, valid_data, test_data = [
        _file_to_word_ids(path, word_to_id) for path in split_paths
    ]

    return train_data, valid_data, test_data, len(word_to_id)
    
    

In [69]:
def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """Produce (input, target) batches that step through ``raw_data``.

    ``raw_data`` is truncated to a multiple of ``batch_size`` and reshaped to
    ``[batch_size, batch_len]``. Each dequeued index ``i`` selects a window of
    ``num_steps`` columns; the targets are the same window shifted right by
    one column.

    Args:
        raw_data: Sequence of word ids; converted to an int32 tensor.
        batch_size: Number of rows in each batch.
        num_steps: Number of columns (time steps) in each batch.
        name: Optional name for the op scope; defaults to "PTBProducer".

    Returns:
        A pair of tensors ``(x, y)``, each with static shape
        ``[batch_size, num_steps]``.

    Raises:
        tf.errors.InvalidArgumentError: At run time, when ``epoch_size`` is
            not positive (raised by the ``tf.assert_positive`` check).
    """
    # Pass the caller's name first and "PTBProducer" as the default, so the
    # `name` argument is actually honoured.
    with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
        raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)

        data_len = tf.size(raw_data)
        batch_len = data_len // batch_size

        # Drop the tail that does not fill a whole row.
        data = tf.reshape(raw_data[0:batch_size * batch_len], [batch_size, batch_len])

        # One column is reserved because targets are shifted by one position.
        # (Parentheses, not a list: a Python list cannot be floor-divided.)
        epoch_size = (batch_len - 1) // num_steps
        assertion = tf.assert_positive(epoch_size, message="epoch_size==0, decrease batch_size or num_steps")
        with tf.control_dependencies([assertion]):
            epoch_size = tf.identity(epoch_size, name="epoch_size")

        # i cycles through 0 .. epoch_size - 1 in order.
        i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
        x = tf.strided_slice(data, [0, i * num_steps], [batch_size, (i + 1) * num_steps])
        x.set_shape([batch_size, num_steps])
        # Targets start one column after the inputs: i * num_steps + 1.
        y = tf.strided_slice(data, [0, i * num_steps + 1], [batch_size, (i + 1) * num_steps + 1])
        y.set_shape([batch_size, num_steps])
        return x, y
        
    
    