In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
import zipfile
from sklearn.preprocessing import LabelEncoder
# Dimension of the word embeddings learned by the model.
W_EMB_SZ = 50
# Key under which the encoded titles are passed in the estimator feature dict.
NEWS_FT = 'news_words'
# Location of the dataset archive and the directory it is unpacked into.
ZIP_FILE = 'data/NewsAggregatorDataset.zip'
DIR_TO_EXTRACT = 'data/'


# Extract the data. The context manager closes the archive even when
# extraction raises (e.g. corrupt archive, unwritable target directory).
with zipfile.ZipFile(ZIP_FILE, 'r') as zip_file_ref:
    zip_file_ref.extractall(DIR_TO_EXTRACT)

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [2]:
# Load the tab-separated corpus. The file is read with header=None, so the
# column names are supplied explicitly.
news_df = pd.read_csv(
    'data/newsCorpora.csv',
    sep='\t',
    header=None,
    names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'],
)

In [3]:
# Shuffle all rows. The seed is fixed so that the row order is identical on
# every run; without it the later seeded train/test split would still select
# different rows each time, because its input order would differ.
# NOTE(review): the vocabulary built from these titles presumably also depends
# on row order -- confirm if checkpoints in the model directory are reused.
news_df = news_df.sample(frac=1.0, random_state=42)
news_df.head(5)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
135806,136142,Interest rates will stay rock bottom as long a...,http://www.inman.com/2014/04/18/interest-rates...,Inman.com,b,d_VVtjgpgeVnjYMXi6xcS7sGLrZiM,www.inman.com,1397885061309
319431,319891,Del Rey single again,http://www.carlow-nationalist.ie/2014/06/24/de...,The Nationalist,e,dJD9KwQaNI-FLEM3Fv7dfAMpLRzFM,www.carlow-nationalist.ie,1403795313486
108538,108735,VIDEO: Late-night laughs: Ukraine edition,http://news.ftcpublications.com/2014/04/video-...,FTC Publications,e,dnYNH7DEqNtjdHMtz3zaqIgeiKc1M,news.ftcpublications.com,1397404827974
91471,91547,"As Lime Prices Rise, Metro Phoenix Restaurants...",http://blogs.phoenixnewtimes.com/bella/2014/04...,Phoenix New Times \(blog\),b,ddP_OW4ua21zq4MMIct4QiubD95dM,blogs.phoenixnewtimes.com,1396988470112
408398,408917,Guardians of cinema (sort of),http://www.dailyiowan.com/2014/07/31/Arts/3854...,UI The Daily Iowan,e,dRKkK4WvOrveL8MhZkIl6QVHX805M,www.dailyiowan.com,1406875704667


In [4]:
# Mean of len(title) over all rows, truncated to an int.
# NOTE(review): len() of a title string counts characters, while this value is
# later used as the document length for the vocabulary processor, which
# presumably counts tokens -- confirm this is intended.
average_title_sz = int(sum(map(len, news_df.TITLE)) / len(news_df))

In [5]:
# Turn every title into a row of vocabulary ids; the processor is constructed
# with average_title_sz as its document length.
voc_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(average_title_sz)
X_transform = np.array(list(voc_processor.fit_transform(news_df.TITLE)))
n_words = len(voc_processor.vocabulary_)

# Map the category labels to integer class ids.
lencoder = LabelEncoder()
y = lencoder.fit_transform(news_df.CATEGORY.values)
n_classes = len(lencoder.classes_)

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_transform, y, test_size=0.2, random_state=42)

In [6]:
# Dump the vocabulary, one token per line, ordered by token id, to
# /tmp/meta.tsv.
# NOTE: `_mapping` is a private attribute of the vocabulary object and may
# change between library versions.
voc_dict = voc_processor.vocabulary_._mapping
sorted_vocab = sorted(voc_dict.items(), key=lambda item: item[1])
# The context manager guarantees the file is flushed and closed even if a
# write fails part-way through.
with open('/tmp/meta.tsv', 'w') as meta_file:
    for token, _token_id in sorted_vocab:
        meta_file.write(str(token) + "\n")

In [7]:
def get_estimator_spec(input_logits, out_lb, train_predict_m):
    """Build the EstimatorSpec matching the requested estimator mode.

    Args:
        input_logits: Logits tensor; the predicted class is the argmax over
            axis 1 and the probabilities are the softmax of these logits.
        out_lb: Integer class labels used for the sparse softmax
            cross-entropy loss and the accuracy metric. Not read in PREDICT
            mode.
        train_predict_m: One of the tf.estimator.ModeKeys values.

    Returns:
        A tf.estimator.EstimatorSpec:
        * PREDICT: predictions dict with 'pred_class' and 'pred_prob'.
        * TRAIN: loss plus an Adam (learning rate 0.01) training op.
        * otherwise (evaluation): loss plus an 'accuracy' eval metric.
    """
    preds_cls = tf.argmax(input_logits, 1)
    if train_predict_m == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=train_predict_m,
            predictions={
                'pred_class': preds_cls,
                'pred_prob': tf.nn.softmax(input_logits)
            })
    tr_l = tf.losses.sparse_softmax_cross_entropy(labels=out_lb, logits=input_logits)
    if train_predict_m == tf.estimator.ModeKeys.TRAIN:
        adm_opt = tf.train.AdamOptimizer(learning_rate=0.01)
        tr_op = adm_opt.minimize(tr_l, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(train_predict_m, loss=tr_l, train_op=tr_op)
    # Evaluation: no training op exists on this path (it is only created in
    # the TRAIN branch above), so return the metrics instead of a train_op.
    eval_metric_ops = {'accuracy': tf.metrics.accuracy(labels=out_lb, predictions=preds_cls)}
    return tf.estimator.EstimatorSpec(
        train_predict_m, loss=tr_l, eval_metric_ops=eval_metric_ops)

In [8]:
# Number of consecutive word positions each convolution filter spans.
filt_sz = 3
# Number of convolution filters (output channels).
num_filters = 5


def cnn_model_fn(features, labels, mode):
    """Estimator model_fn: embedding -> one conv layer -> max pool -> dense.

    Args:
        features: Feature dict; the encoded titles are read from
            features[NEWS_FT].
        labels: Class labels, forwarded unchanged to get_estimator_spec.
        mode: Estimator mode key, forwarded unchanged to get_estimator_spec.

    Returns:
        The EstimatorSpec produced by get_estimator_spec for the logits.
    """
    news_word_vectors = tf.contrib.layers.embed_sequence(
        features[NEWS_FT], vocab_size=n_words, embed_dim=W_EMB_SZ)
    # Append a channel axis so the embedded titles can be fed to conv2d.
    news_word_vectors = tf.expand_dims(news_word_vectors, -1)

    # Each filter covers filt_sz positions across the full embedding width.
    filt_shp = [filt_sz, W_EMB_SZ, 1, num_filters]
    Wt_vect = tf.truncated_normal(filt_shp, stddev=0.1)
    W = tf.Variable(Wt_vect, name="W")
    b_vect = tf.constant(0.1, shape=[num_filters])
    b = tf.Variable(b_vect, name="b")
    strides_vect = [1, 1, 1, 1]
    conv1 = tf.nn.conv2d(
        news_word_vectors,
        W,
        strides=strides_vect,
        padding="VALID",
        name="conv1")
    relu1 = tf.nn.relu(tf.nn.bias_add(conv1, b), name="relu")

    # With VALID padding the convolution leaves (length - filt_sz + 1)
    # positions; pooling over all of them keeps one maximum per filter.
    # The window is derived from filt_sz (instead of a literal 3) so it stays
    # correct when the filter size is changed.
    ksize_vect = [1, average_title_sz - filt_sz + 1, 1, 1]
    pool1 = tf.nn.max_pool(
        relu1,
        ksize=ksize_vect,
        strides=strides_vect,
        padding='VALID',
        name="pool1")
    activations1 = tf.contrib.layers.flatten(pool1)
    # Linear output layer: one logit per class.
    logits = tf.contrib.layers.fully_connected(activations1, n_classes, activation_fn=None)
    return get_estimator_spec(input_logits=logits, out_lb=labels, train_predict_m=mode)

In [9]:
# Estimator configuration: checkpoints go to /tmp/models/, summaries and the
# steps/sec log line are emitted every 10 steps.
run_config = tf.contrib.learn.RunConfig().replace(
    model_dir='/tmp/models/',
    save_summary_steps=10,
    log_step_count_steps=10,
)
classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, config=run_config)

# Every batch is as large as the whole training set (batch_size=len(X_train));
# num_epochs=None lets the input repeat until the step limit is reached.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={NEWS_FT: X_train},
    y=y_train,
    batch_size=len(X_train),
    num_epochs=None,
    shuffle=True,
)
classifier.train(input_fn=train_input_fn, steps=100)

INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11bf15198>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 10, '_save_checkpoints_secs': 600, '_log_step_count_steps': 10, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/models/'}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/models/model.ckpt.
INFO:tensorflow:loss = 1.3967932, step = 1
INFO:tensorflow:global_step/sec: 0.0414788
INFO:tensorflow:global_step/sec: 0.0430198
INFO:tensorflow:Saving checkpoints for 27 into /tmp/models/model.ckpt.
INFO:tensorflow:global_step/sec: 0.0423592
INFO:tensorfl

<tensorflow.python.estimator.estimator.Estimator at 0x11bf15a58>

In [10]:
# Single, unshuffled pass over the held-out set so predictions line up with
# y_test.
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={NEWS_FT: X_test}, y=y_test, num_epochs=1, shuffle=False)
predictions = classifier.predict(input_fn=test_input_fn)

# Collect the predicted class of every example and give it y_test's shape.
y_predicted = np.array([pred['pred_class'] for pred in predictions])
y_predicted = y_predicted.reshape(np.array(y_test).shape)

# Report overall accuracy and the confusion matrix.
cls_mets = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy: {0:f}'.format(cls_mets))
print(metrics.confusion_matrix(y_test, y_predicted))

INFO:tensorflow:Restoring parameters from /tmp/models/model.ckpt-100
Accuracy: 0.919866
[[21014   542   331  1255]
 [  656 29274    37   598]
 [  647    99  8015   444]
 [ 1333   517   311 19411]]
