In [2]:
import numpy as np
import pandas
import tensorflow as tf
from importlib import reload
import graph; reload(graph)
from w266_common import patched_numpy_io

  from ._conv import register_converters as _register_converters


In [3]:
# LOAD THE DATA
fileName = "data/encoded_data/embeddings_w266.npy"
encodedData = np.load(fileName)
encodedData = pandas.DataFrame(data = encodedData, columns = ["x_"+str(i) for i in range(500)])
filename = "data/Labeled_Colorado_Flu_Study_Tweets_AvI_RvN_SvO.csv"
coloradoData = pandas.read_csv(filename, sep="\t")
print(encodedData.shape)
allData = pandas.concat([coloradoData, encodedData], axis = 1)
print(allData.shape)

# get all data with labels present for the column we care about
allDataVal = allData.dropna(subset=["Related_Label"])
print(allDataVal.shape)

(5270, 500)
(5270, 505)
(4413, 505)


In [4]:
allDataVal[:3]

Unnamed: 0,Tweet_ID,Tweet_Content,Awareness_Label,Related_Label,Self_Label,x_0,x_1,x_2,x_3,x_4,...,x_490,x_491,x_492,x_493,x_494,x_495,x_496,x_497,x_498,x_499
1,5222838706,muh. if i am getting sick and it's not swine f...,0.0,1.0,1.0,-0.174253,0.130687,-1.384781,0.459587,-0.777744,...,0.548753,1.287073,0.312468,-0.465325,2.188294,-0.027273,0.11358,-1.535737,-0.806254,-1.206642
3,5918860304,"getting better,no more piggy flu 4 me,it was n...",0.0,1.0,1.0,2.034387,0.432776,-0.038979,-0.650743,0.067163,...,0.699051,1.544988,0.092756,-1.447382,0.576435,1.077597,-0.316096,1.288783,-1.124577,-0.827067
4,4631607800,@robbsterr yay for man txting you.. in other n...,0.0,0.0,1.0,-0.469187,1.415714,-1.875996,0.901807,0.076518,...,0.74441,0.573868,0.070829,-0.891997,0.930053,-0.876152,0.057401,-0.118646,0.211558,-1.097714


In [5]:
# extract X and Y as np arrays, so that we can feed them to tensors. 
x = allDataVal[["x_"+str(i) for i in range(500)]]
y = allDataVal["Related_Label"]
print(x.values.shape)
print(y.values.shape)

(4413, 500)
(4413,)


In [7]:
# split into train, test, dev (80%,10%,10%)
np.random.seed(42)
train_x_df, test_x_df, dev_x_df = np.split(x.sample(frac=1), [int(.8*len(x)), int(.9*len(x))])
train_y_df = y[train_x_df.index]
test_y_df = y[test_x_df.index]
dev_y_df = y[dev_x_df.index]

# convert to numpy arrays
train_x, test_x, dev_x, train_y, test_y, dev_y = train_x_df.values, test_x_df.values, dev_x_df.values, train_y_df.values, test_y_df.values, dev_y_df.values 
train_y = train_y.astype(int)
dev_y = dev_y.astype(int)
test_y = test_y.astype(int)
train_x = train_x.astype("float32")
test_x = test_x.astype("float32")
dev_x = dev_x.astype("float32")
print(type(train_x))
print(train_x.shape)
print(dev_x.shape)
print(test_x.shape)
print(train_y)

<class 'numpy.ndarray'>
(3530, 500)
(442, 500)
(441, 500)
[0 0 0 ... 0 1 1]


In [8]:
# setting up model params, model function, and input function
reload(graph)

# Specify model hyperparameters as used by model_fn
model_params = dict(hidden_dims=[75, 75, 75, 75], num_classes=2,
                    lr=0.1, optimizer='adagrad', beta=0.1,dropout_rate = 0.5, maxGradNorm = 1.0)

model = tf.estimator.Estimator(model_fn=graph.classifierModelFn, 
                               params=model_params)

# Training params for input function
train_params = dict(batch_size=32, total_epochs=20, eval_every=2)

# set up input function
train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"x": train_x}, y=train_y,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'], shuffle=True, seed=42
                 )

# Input function for dev set batches. As above, but:
# - Don't randomize order
# - Iterate exactly once (one epoch)
dev_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"x": dev_x}, y=dev_y,
                    batch_size=128, num_epochs=1, shuffle=False
                )


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpbujxdkvb', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6b93a74d30>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [9]:
# Training the actual model
for _ in range(train_params['total_epochs'] // train_params['eval_every']):
    # Train
    model.train(input_fn=train_input_fn)
    
    # evaluate on dev set
    eval_metrics = model.evaluate(input_fn=dev_input_fn, name="dev")

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpbujxdkvb/model.ckpt.
INFO:tensorflow:loss = 996.2897, step = 1
INFO:tensorflow:global_step/sec: 274.676
INFO:tensorflow:loss = 930.8564, step = 101 (0.366 sec)
INFO:tensorflow:global_step/sec: 426.869
INFO:tensorflow:loss = 925.10785, step = 201 (0.234 sec)
INFO:tensorflow:Saving checkpoints for 221 into /tmp/tmpbujxdkvb/model.ckpt.
INFO:tensorflow:Loss for final step: 589.81476.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-07-22-13:38:01
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpbujxdkvb/model.ckpt-221
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 1547 into /tmp/tmpbujxdkvb/model.ckpt.
INFO:tensorflow:Loss for final step: 589.41644.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-07-22-13:38:18
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpbujxdkvb/model.ckpt-1547
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-07-22-13:38:18
INFO:tensorflow:Saving dict for global step 1547: accuracy = 0.7443439, cross_entropy_loss = 0.5545915, global_step = 1547, loss = 3160.1765
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpbujxdkvb/model.ckpt-1547
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints 

In [10]:
# evaluate on test set
test_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"x": test_x}, y=test_y,
                    batch_size=128, num_epochs=1, shuffle=False
                )

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")

#### END(YOUR CODE) ####
print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))
eval_metrics

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-07-22-13:38:33
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpbujxdkvb/model.ckpt-2210
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-07-22-13:38:34
INFO:tensorflow:Saving dict for global step 2210: accuracy = 0.70068026, cross_entropy_loss = 0.58242965, global_step = 2210, loss = 3175.2363
Accuracy on test set: 70.07%


{'accuracy': 0.70068026,
 'cross_entropy_loss': 0.58242965,
 'global_step': 2210,
 'loss': 3175.2363}

In [13]:
# do some predicting, so we can see which examples fail:
predictions = list(model.predict(test_input_fn))  # list of dicts
falsePositives = []
falseNegatives = []
for idx,i in enumerate(test_x_df.index.values):
    predictedLabel = predictions[idx]["max"]
    datarow = allData.iloc[i][["Tweet_Content", "Related_Label"]]
    trueLabel = datarow["Related_Label"]
    if int(trueLabel) == 1:
        if int(predictedLabel)==0:
            falseNegatives.append((datarow["Tweet_Content"], trueLabel, predictedLabel))
    elif int(trueLabel) == 0:
        if int(predictedLabel) == 1:
            falsePositives.append((datarow["Tweet_Content"], trueLabel, predictedLabel))
print()
print("All Test data #:"+str(len(test_x_df.index.values)))
print("False Positives:"+str(int(len(falsePositives)*100.0/len(test_x_df.index.values)))+"%")
print("False Negatives:"+str(int(len(falseNegatives)*100.0/len(test_x_df.index.values)))+"%")
print()
print("Some false positives (tweet, true label, predicted label):")
for row in falsePositives[:5]:
    print(row)
print()
print("Some false negatives (tweet, true label, predicted label):")
for row in falseNegatives[:20]:
    print(row)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpbujxdkvb/model.ckpt-2210
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

All Test data #:441
False Positives:17%
False Negatives:12%

Some false positives (tweet, true label, predicted label):
('getting sick :| i think ima get the turtle flu lmfaaooo', 0.0, 1)
("taking the compact carrs for their piggy flu shots. i'm so against the idea, this better not make them sick! #fb #worried #hateshots", 0.0, 1)
("i want to see obama, his lovely bride and kids get the flu shot first... i don't trust them... talk about fear tactics!  wake up america!", 0.0, 1)
('getting flu shots on air at 5:07 today.', 0.0, 1)
("my irrational fear of needles is not helping me get my act together and get a flu shot today... :'(", 0.0, 1)

Some false negatives (tweet, true label, predicted label):
('#caringcurrents #h1n1 