In [113]:
import os
import re
import string
import requests
import collections
import io
import csv
import nltk
import random

import numpy as np
import scipy as sp
import tensorflow as tf
import shutil
import tensorflow.contrib.learn as tflearn
import tensorflow.contrib.layers as tflayers
import tensorflow.contrib.metrics as metrics
import tensorflow.contrib.rnn as rnn

from tensorflow.contrib.learn.python.learn import learn_runner
from tensorflow.python.framework import ops

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import pylab as pl 

%matplotlib inline

# Clear the default TensorFlow graph so that re-running the notebook in the
# same kernel does not keep adding ops to a previously built graph.
ops.reset_default_graph()

In [126]:
# Total number of values in each generated sequence (inputs + labels).
SEQ_LEN = 10
# One float default per CSV column; handed to tf.decode_csv as record_defaults.
# `range` (not `xrange`) keeps this cell runnable on both Python 2 and 3.
DEFAULTS = [[0.0] for _col in range(SEQ_LEN)]
# Number of CSV records requested per read.
BATCH_SIZE = 20
# Key under which the input tensor is stored in the features dict.
TIMESERIES_COL = 'rawdata'
N_OUTPUTS = 2  # in each sequence, 1-8 are features, and 9-10 is label
N_INPUTS = SEQ_LEN - N_OUTPUTS

## Generate some sample time series data

In [121]:
def create_time_series(seq_len=None):
    """Generate one sine-wave sequence with random frequency and amplitude.

    Args:
      seq_len: Number of samples to generate. When None (the default), the
        module-level SEQ_LEN is used, which is the original behavior.

    Returns:
      A 1-D numpy array of length seq_len holding
      ampl * sin(k * freq) for k = 0 .. seq_len - 1.
    """
    if seq_len is None:
        seq_len = SEQ_LEN
    # Draw order (frequency first, then amplitude) is kept so that seeded
    # runs produce the same sequences as before.
    freq = (np.random.random() * 0.5) + 0.1  # 0.1 to 0.6
    ampl = np.random.random() + 0.5  # 0.5 to 1.5
    return np.sin(np.arange(0, seq_len) * freq) * ampl

def to_csv(filename, N):
    """Write N generated sequences to a CSV file, one sequence per line.

    Args:
      filename: Path of the file to create; an existing file is overwritten.
      N: Number of sequences (lines) to write.
    """
    with open(filename, 'w') as ofp:
        # `range` instead of `xrange` so the function runs on Python 2 and 3.
        for _ in range(N):
            seq = create_time_series()
            ofp.write(",".join(map(str, seq)) + '\n')

# Materialize the datasets on disk: 1000 training sequences, 50 for validation.
to_csv(filename='train.csv', N=1000)
to_csv(filename='valid.csv', N=50)

In [118]:
# read data and convert to needed format
def read_dataset(filename, mode=tf.contrib.learn.ModeKeys.TRAIN):
    """Build an input function that reads batches of sequences from CSV.

    Args:
      filename: Path to one CSV file, or a file pattern; it is passed to
        tf.train.match_filenames_once.
      mode: A tf.contrib.learn.ModeKeys value. In TRAIN mode the files are
        read for 100 epochs; in any other mode they are read once.

    Returns:
      A zero-argument function that returns (features, label), where
      features is the dict {TIMESERIES_COL: inputs}. `inputs` concatenates
      all decoded CSV columns except the last N_OUTPUTS, and `label`
      concatenates the last N_OUTPUTS columns.
    """
    def _input_fn():
        num_epochs = 100 if mode == tf.contrib.learn.ModeKeys.TRAIN else 1

        # could be a path to one file or a file pattern.
        input_file_names = tf.train.match_filenames_once(filename)

        filename_queue = tf.train.string_input_producer(
            input_file_names, num_epochs=num_epochs, shuffle=True)
        reader = tf.TextLineReader()
        _, value = reader.read_up_to(filename_queue, num_records=BATCH_SIZE)

        # Add a trailing axis so every decoded column comes back as a column
        # vector that can be concatenated along axis 1 below.
        value_column = tf.expand_dims(value, -1)
        # Parenthesized single-argument print behaves the same on Python 2
        # and Python 3 (the bare print statement is a SyntaxError on 3).
        print('readcsv={}'.format(value_column))

        # all_data is a list of tensors
        all_data = tf.decode_csv(value_column, record_defaults=DEFAULTS)
        inputs = all_data[:len(all_data)-N_OUTPUTS]  # first few values
        label = all_data[len(all_data)-N_OUTPUTS : ] # last few values

        # from list of tensors to tensor with one more dimension
        inputs = tf.concat(inputs, axis=1)
        label = tf.concat(label, axis=1)
        print('inputs={}'.format(inputs))

        return {TIMESERIES_COL: inputs}, label   # dict of features, label
    return _input_fn

def get_train_d():
    """Return the input function that reads 'train.csv' in TRAIN mode."""
    train_mode = tf.contrib.learn.ModeKeys.TRAIN
    return read_dataset(filename='train.csv', mode=train_mode)

def get_valid():
    """Return the input function that reads 'valid.csv' in EVAL mode."""
    eval_mode = tf.contrib.learn.ModeKeys.EVAL
    return read_dataset(filename='valid.csv', mode=eval_mode)

## Constructing RNN with the LSTM architecture

In [125]:
LSTM_SIZE = 3  # number of hidden units in the LSTM cell (passed as num_units to BasicLSTMCell)

# create the inference model
def simple_rnn(inputs):
    """Build the inference graph: an LSTM followed by a linear output layer.

    Args:
      inputs: Features dict; the tensor stored under TIMESERIES_COL is split
        along axis 1 into N_INPUTS pieces, one per time step.

    Returns:
      The predictions tensor: the last LSTM output multiplied by an
      [LSTM_SIZE, N_OUTPUTS] weight matrix plus an [N_OUTPUTS] bias.
    """
    # Reshape input shape to become a sequence
    x = tf.split(inputs[TIMESERIES_COL], N_INPUTS, 1)
    # Parenthesized single-argument print behaves the same on Python 2 and
    # Python 3 (the bare print statement is a SyntaxError on 3).
    print('x={}'.format(x))

    # Configure the RNN
    lstm_cell = rnn.BasicLSTMCell(LSTM_SIZE, forget_bias=1.0)

    # We only need the output and can ignore the state
    outputs, _ = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)

    # slice to keep only the last cell of the RNN
    last_output = outputs[-1]
    print('last outputs={}'.format(last_output))

    # output is result of linear activation of last layer of RNN
    weight = tf.Variable(tf.random_normal([LSTM_SIZE, N_OUTPUTS]))
    bias = tf.Variable(tf.random_normal([N_OUTPUTS]))
    predictions = tf.matmul(last_output, weight) + bias

    return predictions

In [1]:
# Define RNN Model with the LSTM architecture
class LSTM_Model(object):
    """LSTM sequence model over integer token ids, with an embedding layer.

    Building an instance adds the whole model to the default TensorFlow
    graph and exposes these attributes:
      x_data, y_output:  int32 placeholders of shape
                         [batch_size, train_seq_len] for inputs and targets.
      initial_state:     Zero state of the LSTM cell for batch_size examples.
      final_state:       State returned by the decoder after the last step.
      logit_output:      Output logits, one row per (example, time step).
      model_output:      Softmax of logit_output.
      cost:              Summed per-example loss divided by
                         batch_size * train_seq_len.
      train_op:          Adam update using gradients clipped to a global
                         norm of 4.5.
    """

    def __init__(self, num_units, batch_size, learning_rate, train_seq_len,
                 vocab_size, infer_sample=False):
        """
        Args:
          num_units:      The number of hidden units in the LSTM cell (same as num_units in BasicLSTMCell)
                          or rather it is the size of the state of the LSTM cell. Note: LSTM cell outputs
                          two things at each time step: the output h_t and the cell state C_t.
                          It is also used as the embedding dimension.

          batch_size:     Number of examples to train on at once. Ignored (forced to 1) when
                          infer_sample is true.

          learning_rate:  Learning rate handed to tf.train.AdamOptimizer.

          train_seq_len:  Number of time steps in each example (the second dimension of the
                          x_data / y_output placeholders). Ignored (forced to 1) when
                          infer_sample is true.

          vocab_size:     Number of distinct token ids; sets the number of rows of the embedding
                          matrix and the width of the output logits.

          infer_sample:   When true, build the graph for sampling: batch size and sequence length
                          are both 1, and the decoder feeds the embedding of the arg-max of each
                          output back in as the next input.
        """
        self.num_units = num_units
        self.vocab_size = vocab_size
        self.infer_sample = infer_sample
        self.learning_rate = learning_rate

        if infer_sample:
            self.batch_size = 1
            self.train_seq_len = 1
        else:
            self.batch_size = batch_size
            self.train_seq_len = train_seq_len

        self.lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units)
        self.initial_state = self.lstm_cell.zero_state(self.batch_size, tf.float32)

        self.x_data = tf.placeholder(tf.int32, [self.batch_size, self.train_seq_len])
        self.y_output = tf.placeholder(tf.int32, [self.batch_size, self.train_seq_len])

        with tf.variable_scope('lstm_vars'):
            # Softmax Output Weights
            W = tf.get_variable('W', [self.num_units, self.vocab_size], tf.float32, tf.random_normal_initializer())
            b = tf.get_variable('b', [self.vocab_size], tf.float32, tf.constant_initializer(0.0))

            # Define Embedding
            embedding_mat = tf.get_variable('embedding_mat', [self.vocab_size, self.num_units],
                                            tf.float32, tf.random_normal_initializer())

            embedding_output = tf.nn.embedding_lookup(embedding_mat, self.x_data)
            # One tensor per time step, each with the length-1 time axis squeezed out.
            rnn_inputs = tf.split(axis=1, num_or_size_splits=self.train_seq_len, value=embedding_output)
            rnn_inputs_trimmed = [tf.squeeze(x, [1]) for x in rnn_inputs]

        # If we are inferring (generating text), we add a 'loop' function
        # Define how to get the i+1 th input from the i th output
        def inferred_loop(prev, count):
            # Apply hidden layer
            prev_transformed = tf.matmul(prev, W) + b
            # Get the index of the output (also don't run the gradient)
            prev_symbol = tf.stop_gradient(tf.argmax(prev_transformed, 1))
            # Get embedded vector
            return tf.nn.embedding_lookup(embedding_mat, prev_symbol)

        decoder = tf.contrib.legacy_seq2seq.rnn_decoder
        outputs, last_state = decoder(rnn_inputs_trimmed,
                                      self.initial_state,
                                      self.lstm_cell,
                                      loop_function=inferred_loop if infer_sample else None)
        # Non inferred outputs
        output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, self.num_units])
        # Logits and output
        self.logit_output = tf.matmul(output, W) + b
        self.model_output = tf.nn.softmax(self.logit_output)

        loss_fun = tf.contrib.legacy_seq2seq.sequence_loss_by_example
        # The fourth positional parameter of sequence_loss_by_example is
        # `average_across_timesteps`, not a vocabulary size. The original
        # call passed vocab_size there, which was only interpreted as a
        # truthy flag; spell the flag out explicitly instead.
        loss = loss_fun([self.logit_output], [tf.reshape(self.y_output, [-1])],
                        [tf.ones([self.batch_size * self.train_seq_len])],
                        average_across_timesteps=True)
        self.cost = tf.reduce_sum(loss) / (self.batch_size * self.train_seq_len)
        self.final_state = last_state
        # Snapshot the variable list once so the gradients and the variables
        # they are paired with are guaranteed to line up.
        train_vars = tf.trainable_variables()
        gradients, _ = tf.clip_by_global_norm(tf.gradients(self.cost, train_vars), 4.5)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(gradients, train_vars))