In [1]:
import numpy as np
import os
from random import shuffle
import re
import urllib.request
import zipfile
import lxml.etree

In [2]:
if not os.path.isfile('../1_preprocessing/ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="../1_preprocessing/ted_en-20160408.zip")

In [3]:
with zipfile.ZipFile('../1_preprocessing/ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
input_keywords = doc.xpath('//keywords/text()')
input_contents = doc.xpath('//content/text()')
del doc

def makeLabel(ks):
    return (("T" if "technology" in ks else "o") + 
    ("E" if "entertainment" in ks else "o") +
    ("D" if "design" in ks else "o"))

contents = []
keywords = []
for (d,k) in zip(input_contents, input_keywords):
    # Remove everything in parens
    c = re.sub(r'\([^)]*\)', '', d)
    # Remove line breaks and "foo: " prefixes
    c = re.sub(r'\n([^:]{,20}:)?', ' ', c)
    # Lowercase, remove special chars
    c = re.sub(r'[^a-z0-0\.]+', ' ', c.lower())
    sentences = [s.strip() for s in c.split('.') if len(s.strip()) > 0]
    if len(sentences) > 0:
        contents.append(sentences)
        keywords.append(makeLabel(k))
del input_contents
del input_keywords

In [4]:
embedding = "glove"
vocab = {}
vocab_dim = 0

if embedding == "word2vec":
    from gensim.models import Word2Vec
    vocab_dim = 100
    vocab = Word2Vec([sentence for content in contents for sentence in content], 
                     size=100, window=5, workers=4)
else:
    vocab_dim = 50
    if not os.path.isfile('glove.6B.zip'):
        urllib.request.urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", filename="glove.6B.zip")
    with zipfile.ZipFile('glove.6B.zip', 'r') as z:
        fin = z.open('glove.6B.50d.txt', 'r')
        for line in fin:
            items = line.decode("utf-8").strip().split(' ')
            assert len(items) == 51
            word = items[0]
            vect = np.array([float(i) for i in items[1:]])
            vocab[word] = vect

In [5]:
content_vectors = []
for content in contents:
    sum_, count = np.zeros(vocab_dim), 0
    for sentence in content:
        for word in sentence:
            try:
                count += 1
                sum_ += vocab[word]
            except KeyError:
                pass
    content_vectors.append(sum_ / count)

    
one_hot = ["ooo", "Too", "oEo", "ooD", "TEo", "ToD", "oED", "TED"]
key_vectors = []
for keyword in keywords:
    vec = np.zeros(8)
    vec[one_hot.index(keyword)] = 1
    key_vectors.append(vec)

In [6]:
import tensorflow as tf

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.5)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

In [7]:
hidden_layer_size = 20

x = tf.placeholder(tf.float32, shape=[None, vocab_dim])
y = tf.placeholder(tf.float32, shape=[None, 8])

W = weight_variable([vocab_dim, hidden_layer_size])
b = bias_variable([hidden_layer_size])
h = tf.tanh(tf.matmul(x, W) + b)

V = weight_variable([hidden_layer_size, 8])
c = bias_variable([8])
u = tf.matmul(h, V) + c

In [8]:
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(u, y))

correct_prediction = tf.equal(tf.argmax(u, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

train_step = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cross_entropy)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

for i in range(0, 1585, 50):
    sess.run(train_step, feed_dict={x: content_vectors[:1585][i:i+50], 
                                    y: key_vectors[:1585][i:i+50]})

In [9]:
print ('Training accuracy: %g' % sess.run(accuracy, feed_dict={
            x: content_vectors[:1585], y: key_vectors[:1585]}))
print ('Test accuracy: %g' % sess.run(accuracy, feed_dict={
            x: content_vectors[-250:], y: key_vectors[-250:]}))

Training accuracy: 0.0107256
Test accuracy: 0.044
