In [1]:
import time

import numpy as np
import tensorflow as tf

import utils

In [2]:
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import zipfile

dataset_folder_path = 'data'
dataset_filename = 'text8.zip'
dataset_name = 'Text8 Dataset'

class DLProgress(tqdm):
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

if not isfile(dataset_filename):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc=dataset_name) as pbar:
        urlretrieve(
            'http://mattmahoney.net/dc/text8.zip',
            dataset_filename,
            pbar.hook)

if not isdir(dataset_folder_path):
    with zipfile.ZipFile(dataset_filename) as zip_ref:
        zip_ref.extractall(dataset_folder_path)
        
with open('data/text8') as f:
    text = f.read()

In [3]:
len(text)

100000000

In [4]:
from bs4 import BeautifulSoup
import re
text = BeautifulSoup(text, 'html.parser').get_text()
text = re.sub(r"[^a-zA-Z.!?]"," ", text)
text = text.lower()
text = re.sub(r"[.]", " <PERIOD> ", text)
text = re.sub(r"[?]", " <QUESTION_MARK> ", text)
text = re.sub(r"[!]", " <EXCLAMATION_MARK> ", text)
words = text.split()

In [5]:
len(words)

17005207

In [6]:
from collections import Counter
word_counts = Counter(words)
words = [word for word in words if word_counts[word] > 5]

In [7]:
len(words)

16680599

In [8]:
import random

threshold = 1e-5
word_counts = Counter(words)
total_count = len(words)
freqs = {word: count/total_count for word, count in word_counts.items()}
p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
words = [word for word in words if random.random() < (1 - p_drop[word])]

In [9]:
len(words)

4624591

In [10]:
vocab = list(set(words))

In [13]:
len(vocab)

63641

In [12]:
vocab[0:10]

['fond',
 'wing',
 'seasoning',
 'sunspot',
 'unconditional',
 'nearest',
 'glamis',
 'doubts',
 'encarta',
 'brzezinski']

In [14]:
vocab_to_int = {word: ii for ii, word in enumerate(vocab)}
int_to_vocab = {ii: word for ii, word in enumerate(vocab)}

In [15]:
len(vocab_to_int), len(int_to_vocab)

(63641, 63641)

In [16]:
words_in_int = [vocab_to_int[word] for word in words]

In [17]:
words[0:10]

['anarchism',
 'abuse',
 'radicals',
 'diggers',
 'sans',
 'culottes',
 'whilst',
 'pejorative',
 'describe',
 'act']

In [18]:
words_in_int[0:10]

[31758, 26755, 59100, 62169, 62135, 697, 58762, 50597, 26597, 56117]

In [25]:
import random
def extract_features_from_index(idx, window_size=5):
    window_size = random.randrange(1, window_size+1)
    idx_start = idx - window_size if (idx - window_size) >=0 else 0
    idx_end = idx+window_size
    return words_in_int[idx_start: idx] + words_in_int[idx+1: idx_end +1]

In [26]:
def extract_features():
    X = []
    y = []
    for idx, value in enumerate(words_in_int):
        y_idx = extract_features_from_index(idx)
        X.extend([value]*len(y_idx))
        y.extend(y_idx)
    return X, y

In [27]:
X, y = extract_features()

In [28]:
len(X), len(y)

(27748306, 27748306)

In [29]:
X[0:10]

[31758, 31758, 31758, 31758, 26755, 26755, 26755, 26755, 26755, 59100]

In [30]:
y[0:10]

[26755, 59100, 62169, 62135, 31758, 59100, 62169, 62135, 697, 31758]

In [35]:
n_vocab = len(vocab)
n_embedding = 100
n_sampled = 100

In [33]:
train_graph = tf.Graph()
with train_graph.as_default():
    inputs = tf.placeholder(tf.int32, [None], name='inputs')
    labels = tf.placeholder(tf.int32, [None, None], name='labels')

In [34]:
with train_graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_vocab, n_embedding), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs)

In [36]:
with train_graph.as_default():
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_vocab))
    loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, labels, embed, n_sampled, n_vocab)
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(cost)

In [37]:
with train_graph.as_default():
    print(softmax_w)
    print(softmax_b)
    print(embed)

Tensor("Variable_1/read:0", shape=(63641, 100), dtype=float32)
Tensor("Variable_2/read:0", shape=(63641,), dtype=float32)
Tensor("embedding_lookup:0", shape=(?, 100), dtype=float32)


In [47]:
from sklearn.utils import shuffle
def split_into_batches(X, y, batch_size=128):
    output_batches = []
    sample_size = len(X)
    X_s, y_s = shuffle(X, y)
    for start_i in range(0, sample_size, batch_size):
        end_i = start_i + batch_size
        yield X_s[start_i:end_i], y_s[start_i:end_i]

In [None]:
epochs = 10
batch_size = 1000
with train_graph.as_default():
    saver = tf.train.Saver()
    with tf.Session(graph=train_graph) as sess:
            sess.run(tf.global_variables_initializer())
            for e in range(epochs):
                loss = 0
                idx = 0
                batches = split_into_batches(X, y, batch_size=batch_size)
                for X_batch, y_batch in batches:
                    feed = {inputs: X_batch, labels: np.array(y)[:, None]}
                    train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
                    loss += train_loss
                    if((idx+1)%100 == 0):
                        print("Epoch: {} Iteration: {} Avg.Training Loss: {:.4f}".
                              formar(e, idx, loss/100)
                             )
                        loss = 0
                    idx +=1
                save_path = saver.save(sess, "checkpoints/text8.ckpt")