In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import random
import json
import os,time
from faker import Faker
import babel
from babel.dates import format_date
import tensorflow as tf
import tensorflow.contrib.legacy_seq2seq as seq2seq
from os.path import isfile, isdir, getsize
from tqdm import tqdm
import zipfile
from urllib import urlretrieve
from IPython.display import clear_output, Image, display, HTML
from sklearn.model_selection import train_test_split


In [2]:
# This cell contains helper methods..
class DLProgress(tqdm):
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num


def downloadData(file, url):        
    if not isfile(file):
        with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Fake News Dataset') as pbar:
            urlretrieve(url, file, pbar.hook)

    with zipfile.ZipFile(file) as f:
        f.extractall('./data/')
        

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def


def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [3]:
fake = Faker()
fake.seed(42)
random.seed(42)
FORMATS = ['short',
           'medium',
           'long',
           'full',
           'd MMM YYY',
           'd MMMM YYY',
           'dd MMM YYY',
           'd MMM, YYY',
           'd MMMM, YYY',
           'dd, MMM YYY',
           'd MM YY',
           'd MMMM YYY',
           'MMMM d YYY',
           'MMMM d, YYY',
           'dd.MM.YY',
           ]

# change this if you want it to work with only a single language
LOCALES = babel.localedata.locale_identifiers()
LOCALES = [lang for lang in LOCALES if 'en' in str(lang)]

In [4]:
def create_date():
    """
        Creates some fake dates 
        :returns: tuple containing 
                  1. human formatted string
                  2. machine formatted string
                  3. date object.
    """
    dt = fake.date_object()
    try:
        human = format_date(dt,
                            format=random.choice(FORMATS),
                            locale=random.choice(LOCALES))

        case_change = random.randint(0,3) # 1/2 chance of case change
        if case_change == 1:
            human = human.upper()
        elif case_change == 2:
            human = human.lower()
        machine = dt.isoformat()
    except AttributeError as e:
        return None, None, None

    return human, machine #, dt

data = [create_date() for _ in range(50000)]

In [5]:
data[:5]

[(u'22, OCT 2000', '2000-10-22'),
 (u'wednesday, 17 march 1971', '1971-03-17'),
 (u'APRIL 2, 1983', '1983-04-02'),
 (u'02/10/1980', '1980-10-02'),
 (u'26/06/2005', '2005-06-26')]

In [6]:
#Now extract the src and targets into seperate lists

source_list = [tpl1 for tpl1,tpl_2 in  data]
target_list = [tpl_2 for tpl1,tpl_2 in  data]

In [7]:
# Now we will find the unique characters coming in the date format
#1. To get lookup for src -> num 
unique_chars_src = set(' '.join(source_list))
char_num_dict_src = dict(zip(unique_chars_src,range(len(unique_chars_src))))
char_num_dict_src['<PAD_VAR>']=len(char_num_dict_src)
# Reverse lookup
num_char_dict_src= dict((v, k) for k, v in char_num_dict_src.iteritems())


#2. To get lookup for dest -> num
unique_chars_dest = set(' '.join(target_list))
char_num_dict_dest = dict(zip(unique_chars_dest,range(len(unique_chars_dest))))


In [8]:
# Now we have to do static padding.. We will also demo the same with dynamic padding and bucket later..
max_src_len = max([len(date) for date in source_list])
x = [[char_num_dict_src['<PAD_VAR>']]* (max_src_len -len(date)) + [char_num_dict_src[cur_char] for cur_char in date] for date in source_list]
x = np.array(x)



In [9]:
#Do the STOP pad for target also
char_num_dict_dest['<STOP>'] = len(char_num_dict_dest)
# Reverse lookup
num_char_dict_dest = dict((v, k) for k, v in char_num_dict_dest.iteritems())
y = [[char_num_dict_dest['<STOP>']]+[char_num_dict_dest[cur_char] for cur_char in date] for date in target_list]
y=np.array(y)

In [10]:
x_seq_length = len(x[0])
y_seq_length  = len(y[0])-1 # Stop  pad is added

In [12]:
def batch_data(x,y,batch_size):
    start = 0
    shuffle = np.random.permutation(len(x))
    x = x[shuffle]
    y=y[shuffle]
    while start+batch_size <= len(x):
        yield x[start:start+batch_size], y[start:start+batch_size]
        start += batch_size


In [14]:
epochs = 2
batch_size = 128
nodes = 32
embed_size = 10
tf.reset_default_graph()
sess = tf.InteractiveSession()
learning_rate = 1e-3

# The data feeder placeholders..
inputs = tf.placeholder(tf.int32,shape = (None,x_seq_length),name='inputs')
outputs = tf.placeholder(tf.int32,shape=(None,None),name='outputs')
targets = tf.placeholder(tf.int32,shape=(None,None),name='targets')


#Embedding layers
input_embedding = tf.Variable(tf.random_uniform((len(char_num_dict_src),embed_size),-1.0,1.0),name='enc_embedding')
output_embedding = tf.Variable(tf.random_uniform((len(char_num_dict_dest),embed_size),-1.0,1.0),name='dec_embedding')

# Now look ups

date_input_embed = tf.nn.embedding_lookup(input_embedding,inputs)
date_output_embed = tf.nn.embedding_lookup(output_embedding,outputs)

with tf.variable_scope("encoding") as encoding_scope:
    lstm_enc = tf.nn.rnn_cell.BasicLSTMCell(nodes)
    _,last_state = tf.nn.dynamic_rnn(lstm_enc,inputs=date_input_embed,dtype=tf.float32)

with tf.variable_scope("decoding") as decoding_scope:
    lstm_dec = tf.nn.rnn_cell.BasicLSTMCell(nodes)
    dec_outputs,_ = tf.nn.dynamic_rnn(lstm_dec,inputs=date_output_embed,initial_state=last_state)
    
logits = tf.contrib.layers.fully_connected(dec_outputs,num_outputs=len(char_num_dict_dest),activation_fn=None)

with tf.name_scope("optimization"):
    loss = tf.contrib.seq2seq.sequence_loss(logits,targets,tf.ones([batch_size,y_seq_length]))
    optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)

In [15]:
dec_outputs.get_shape().as_list()

[None, None, 32]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [None]:
#sess.run(tf.global_variables_initializer())
epochs = 10
for epoch_i in range(epochs):
    start_time = time.time()
    for batch_i, (source_batch, target_batch) in enumerate(batch_data(X_train, y_train, batch_size)):
        _, batch_loss, batch_logits = sess.run([optimizer, loss, logits],
            feed_dict = {inputs: source_batch,
             outputs: target_batch[:, :-1],
             targets: target_batch[:, 1:]})
    accuracy = np.mean(batch_logits.argmax(axis=-1) == target_batch[:,1:])
    print('Epoch {:3} Loss: {:>6.3f} Accuracy: {:>6.4f} Epoch duration: {:>6.3f}s'.format(epoch_i, batch_loss, 
                                                                      accuracy, time.time() - start_time))

In [24]:
source_batch, target_batch = next(batch_data(X_test, y_test, batch_size))
dec_input = np.zeros((len(source_batch), 1)) + char_num_dict_dest['<STOP>']
for i in range(y_seq_length):
    batch_logits = sess.run(logits,
                feed_dict = {inputs: source_batch,
                 outputs: dec_input})
    prediction = batch_logits[:,-1].argmax(axis=-1)
    dec_input = np.hstack([dec_input, prediction[:,None]])
    
print('Accuracy on test set is: {:>6.3f}'.format(np.mean(dec_input == target_batch)))



num_preds = 2
source_chars = [[num_char_dict_src[l] for l in sent if num_char_dict_src[l]!="<PAD>"] for sent in source_batch[:num_preds]]
dest_chars = [[num_char_dict_dest[l] for l in sent] for sent in dec_input[:num_preds, 1:]]

for date_in, date_out in zip(source_chars, dest_chars):
    print(''.join(date_in)+' => '+''.join(date_out))

Accuracy on test set is:  0.896
<PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR>26 february, 1995 => 1995-02-02
<PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR><PAD_VAR>January 11 1986 => 1986-01-11
