In [1]:
import os
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

import data_preparation as sf
import rnn_enc_dec as red


# Print numpy floats with two decimals so array dumps stay readable.
np.set_printoptions(formatter={'float_kind': (lambda x: "%.2f" % x)})

# Project root. Defaults to the original server location; set the
# TROUBLED_LIFE_ROOT environment variable to run the notebook on another
# machine instead of editing (or commenting in/out) a path in this cell.
# An unset or empty variable falls back to the default.
PROJECT_ROOT_DIR = os.environ.get("TROUBLED_LIFE_ROOT") or "/home/ubuntu/TroubledLife"

# Sub-directories and file names, all derived from the project root.
DATASETS_DIR = os.path.join(PROJECT_ROOT_DIR, "data")
TF_LOG_DIR = os.path.join(PROJECT_ROOT_DIR, "tf_logs")
MODEL_CHECKPOINTS_DIR = os.path.join(PROJECT_ROOT_DIR, "model_checkpoints")
TRAINING_SET_DATA_FILE = "troubled_life_policy_train_data.csv"
TEST_SET_DATA_FILE = "troubled_life_policy_test_data.csv"

# UTC timestamp (YYYYMMDD-HHMMSS) used to give each run its own log directory.
now = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
log_dir = "{}/run-{}/".format(TF_LOG_DIR, now)

In [2]:
# Load the train and test policy histories from their CSV files.
train_csv_path = os.path.join(DATASETS_DIR, TRAINING_SET_DATA_FILE)
test_csv_path = os.path.join(DATASETS_DIR, TEST_SET_DATA_FILE)

policy_histories_train = sf.load_life_policy_data(file_path=train_csv_path)
policy_histories_test = sf.load_life_policy_data(file_path=test_csv_path)

# For each split the helper returns the history lengths together with the
# maximum history length of that split.
(policy_histories_length_train,
 max_policy_history_length_train) = sf.get_policy_history_lengths(
    policy_histories=policy_histories_train)

(policy_histories_length_test,
 max_policy_history_length_test) = sf.get_policy_history_lengths(
    policy_histories=policy_histories_test)

# A single maximum over both splits, so train and test are prepared against
# the same sequence length.
max_policy_history_length = max(max_policy_history_length_train,
                                max_policy_history_length_test)

In [3]:
# Turn both splits into labels, features and sequence lengths (numpy arrays,
# per the original note in this cell). Both calls share the same maximum
# history length and the binary-classification flag.
shared_prep_kwargs = dict(max_policy_history_length=max_policy_history_length,
                          binary_classification=True)

train_labels, train_features, train_seq_lengths = sf.prepare_labels_features_lengths(
    policy_histories=policy_histories_train,
    policy_histories_lengths=policy_histories_length_train,
    **shared_prep_kwargs)

test_labels, test_features, test_seq_lengths = sf.prepare_labels_features_lengths(
    policy_histories=policy_histories_test,
    policy_histories_lengths=policy_histories_length_test,
    **shared_prep_kwargs)

In [4]:
import tensorflow as tf


# Start from an empty default graph with a fixed graph-level seed.
tf.reset_default_graph()
tf.set_random_seed(42)

# Hyperparameters of the encoder/decoder network.
n_steps = max_policy_history_length  # second (time) dimension of X
n_inputs = 2
n_layers = 3
n_neurons = 200
n_latent = 20

# Placeholders for the input sequences and their lengths.
seq_length = tf.placeholder(tf.int32, [None], name="seq_length")
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs], name="X")

# Settings passed to both the encoder and the decoder.
shared_net_kwargs = dict(n_layers=n_layers, n_neurons=n_neurons, n_latent=n_latent)

latent_vector = red.encoder(X=X, seq_length=seq_length, **shared_net_kwargs)

# NOTE(review): this notebook only evaluates a restored checkpoint, yet the
# decoder is built with training=True - confirm that this is intended.
decoder_sequence_test = red.decoder(n_outputs=n_inputs,
                                    latent_vector=latent_vector,
                                    X=X,
                                    seq_length=seq_length,
                                    training=True,
                                    reuse=False,
                                    **shared_net_kwargs)

# List the trainable variables; these are also the ones the Saver restores.
tvars = tf.trainable_variables()
for var in tvars:
    print(var)

# Squared reconstruction error, built once and reduced in two ways. No masking
# by seq_length is applied in this cell: the means run over all n_steps positions.
squared_error = tf.square(decoder_sequence_test - X)
decoder_loss_per_policy = tf.reduce_mean(tf.reduce_mean(squared_error, axis=2), axis=1)
decoder_loss_overall = tf.reduce_mean(squared_error)

# Initializers for global and local variables. The original note says both are
# needed for precision/recall metrics; no such metric op is created in this
# cell, so the local initializer is presumably a leftover - kept as is.
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()

saver = tf.train.Saver(var_list=tvars)

encoder states shape: (?, 600)
latent_vector shape: (?, 20)
state_input shape: (?, 200)
decoder initial state shape:  (?, 600)
decoder outputs rnn_output shape: (?, ?, 2)
<tf.Variable 'P_Encoder/rnn/multi_rnn_cell/cell_0/basic_rnn_cell/kernel:0' shape=(202, 200) dtype=float32_ref>
<tf.Variable 'P_Encoder/rnn/multi_rnn_cell/cell_0/basic_rnn_cell/bias:0' shape=(200,) dtype=float32_ref>
<tf.Variable 'P_Encoder/rnn/multi_rnn_cell/cell_1/basic_rnn_cell/kernel:0' shape=(400, 200) dtype=float32_ref>
<tf.Variable 'P_Encoder/rnn/multi_rnn_cell/cell_1/basic_rnn_cell/bias:0' shape=(200,) dtype=float32_ref>
<tf.Variable 'P_Encoder/rnn/multi_rnn_cell/cell_2/basic_rnn_cell/kernel:0' shape=(400, 200) dtype=float32_ref>
<tf.Variable 'P_Encoder/rnn/multi_rnn_cell/cell_2/basic_rnn_cell/bias:0' shape=(200,) dtype=float32_ref>
<tf.Variable 'P_Encoder_2_Latent/W:0' shape=(600, 20) dtype=float32_ref>
<tf.Variable 'P_Encoder_2_Latent/b:0' shape=(20,) dtype=float32_ref>
<tf.Variable 'Latent_2_P_Decoder/W:0' s

In [5]:
# Checkpoint to restore and the test data fed to the graph.
checkpoint_path = os.path.join(MODEL_CHECKPOINTS_DIR, "rnn_enc_dec.ckpt")
test_feed = {X: test_features, seq_length: test_seq_lengths}

with tf.Session() as sess:
    # Initialize first; the restore below then overwrites the variables the
    # Saver was created for.
    sess.run(init_g)
    sess.run(init_l)

    saver.restore(sess, checkpoint_path)

    # Per-policy and overall reconstruction loss on the test set.
    lPolicy, lOverall = sess.run([decoder_loss_per_policy, decoder_loss_overall],
                                 feed_dict=test_feed)

INFO:tensorflow:Restoring parameters from /home/ubuntu/TroubledLife/model_checkpoints/rnn_enc_dec.ckpt


In [34]:
# Reconstruction-loss cut-off: a policy whose loss exceeds it is predicted as
# "troubled". NOTE(review): the value looks hand-tuned - confirm how it was
# chosen and consider moving it to the configuration cell.
troubled_threshold = 3000

# Ground truth and prediction as boolean arrays.
label = (test_labels > 0)
label_pred = (lPolicy > troubled_threshold)

# Relative distance of the loss from the threshold. This is not bounded by 1:
# losses far above the threshold give values much larger than 1.
certainty = np.abs(troubled_threshold - lPolicy) / troubled_threshold
pred_correct = (label_pred == label)

accuracy = np.mean(pred_correct)

# One row per test policy, indexed by policy id.
# NOTE(review): index.levels[0] holds the unique values of the first index
# level, not the per-row index; this assumes test_labels / lPolicy come in that
# same order - TODO confirm against data_preparation.
test_result = pd.DataFrame(
    {'id': policy_histories_test.index.levels[0],
     'label': label,
     'loss_policy': lPolicy,
     'label_pred': label_pred,
     'pred_correct': pred_correct,
     'certainty': certainty}
).set_index('id')

print(accuracy)
print(classification_report(label, label_pred, target_names=['False', 'True']))
print(confusion_matrix(label, label_pred))

# Show the misclassified policies.
test_result.loc[~test_result["pred_correct"]]

0.9985
             precision    recall  f1-score   support

      False       1.00      1.00      1.00      1538
       True       0.99      1.00      1.00       462

avg / total       1.00      1.00      1.00      2000

[[1535    3]
 [   0  462]]


Unnamed: 0_level_0,certainty,label,label_pred,loss_policy,pred_correct
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1037,0.14877,False,True,3446.309326,False
1251,0.228918,False,True,3686.755371,False
2461,33.276119,False,True,102828.359375,False


In [35]:
# Borderline policies: loss within 30% of the threshold, on either side.
low_certainty_mask = test_result["certainty"].lt(0.3)
test_result.loc[low_certainty_mask]

Unnamed: 0_level_0,certainty,label,label_pred,loss_policy,pred_correct
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1037,0.14877,False,True,3446.309326,False
1172,0.095845,True,True,3287.534668,True
1251,0.228918,False,True,3686.755371,False
1432,0.146479,False,False,2560.564453,True
2014,0.032981,False,False,2901.057129,True
2863,0.201329,False,False,2396.011719,True
2895,0.050356,False,False,2848.931152,True
2914,0.250855,False,False,2247.433838,True
