In [1]:
import numpy as np
import tensorflow as tf
import time
from faker import Faker
import babel
from babel.dates import format_date
import random

In [2]:
DATE_FORMATS = ['short',
           'medium',
           'long',
           'full',
           'd MMM YYY',
           'd MMMM YYY',
           'dd MMM YYY',
           'd MMM, YYY',
           'd MMMM, YYY',
           'dd, MMM YYY',
           'd MM YY',
           'd MMMM YYY',
           'MMMM d YYY',
           'MMMM d, YYY',
           'dd.MM.YY',
           ]

LOCALES = babel.localedata.locale_identifiers()
LOCALES = [lang for lang in LOCALES if 'en' in str(lang)]


In [3]:
class DateGenerator():
    
    def __init__(self, LOCALES,  DATE_FORMATS, size):
        self.LOCALES = LOCALES
        self.DATE_FORMATS = DATE_FORMATS
        self.size = size
        self.x = []
        self.y = []
        
    def load_data(self):
        _fake = Faker()
        random.seed(101)
        random.seed(101)
        
        for i in range(self.size):
            date = _fake.date_object()
            gen_date = format_date(date, format=random.choice(self.DATE_FORMATS), locale = random.choice(self.LOCALES))
            machine_date = date.isoformat()
            self.x.append(gen_date)
            self.y.append(machine_date)
        
        return self.x, self.y

In [4]:
date_gen = DateGenerator(LOCALES, DATE_FORMATS, 50000)
x, y = date_gen.load_data()

In [5]:
x_chars = set(''.join(x))
x_char_to_ix = dict(zip(x_chars, range(len(x_chars))))

y_chars = set(''.join(y))
y_char_to_ix = dict(zip(y_chars, range(len(y_chars))))

In [6]:
# Padding
x_char_to_ix['<PAD>'] = len(x_char_to_ix)
x_ix_to_char = {i:char for char, i in x_char_to_ix.items()}
max_len = max([len(char) for char in x])
x_vec = [[x_char_to_ix['<PAD>']]*(max_len - len(date)) + [x_char_to_ix[char] for char in date] for date in x]
x_vec = np.array(x_vec)

In [7]:
# <GO> Padding
y_char_to_ix['<GO>'] = len(y_char_to_ix)
y_ix_to_char = {i:char for char, i in y_char_to_ix.items()}
y_vec = [[y_char_to_ix['<GO>']] + [y_char_to_ix[char] for char in date] for date in y]
y_vec = np.array(y_vec)

In [8]:
def generate_batch(x, y, batch_size):
    shuffle = np.random.permutation(len(x))
    start = 0
    x = x[shuffle]
    y = y[shuffle]
    
    while start + batch_size <= len(x) :
        yield x[start:start + batch_size], y[start:start + batch_size]
        start = (start + batch_size) % len(x)

In [9]:
x_seq = len(x_vec[0])
y_seq = len(y_vec[0]) - 1
input_len = len(x_char_to_ix)
output_len = len(y_char_to_ix)
lstm_units = 32
batch_size = 128

embed_size = 10

tf.reset_default_graph()


# PLACEHOLDERS
inputs = tf.placeholder(tf.int32, shape = [None, x_seq], name = 'inputs')
dec_inputs = tf.placeholder(tf.int32, shape = [None, None], name = 'dec_inputs')
targets = tf.placeholder(tf.int32, shape = [None, None], name = 'targets')


# EMBEDDINGS
input_embed = tf.Variable(tf.random_uniform([input_len, embed_size], -1, 1), name = 'input_embed')
output_embed = tf.Variable(tf.random_uniform([output_len, embed_size], -1, 1), name = 'output_embed')

input_date_embed = tf.nn.embedding_lookup(input_embed, inputs)
dec_inputs_date_embed = tf.nn.embedding_lookup(output_embed, dec_inputs)


# ENCODER
with tf.variable_scope('encoder'):
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
    _, last_state = tf.nn.dynamic_rnn(lstm_cell, inputs = input_date_embed, dtype = tf.float32)

    
# DECODER
with tf.variable_scope('decoder'):
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
    decoder_outputs, _ = tf.nn.dynamic_rnn(lstm_cell, inputs = dec_inputs_date_embed, initial_state = last_state, dtype = tf.float32)
    

# FINAL LAYER
logits = tf.layers.dense(decoder_outputs, output_len)


# LOSS and OPTIMIZER
loss = tf.contrib.seq2seq.sequence_loss(logits, targets, weights = tf.ones([batch_size, y_seq]))
optimizer = tf.train.AdamOptimizer(learning_rate = 1e-3)
train = optimizer.minimize(loss)

# ACCURACY
y_pred = tf.argmax(logits, axis = -1)
matches = tf.equal(tf.cast(y_pred, dtype = tf.int32), tf.cast(targets, dtype = tf.int32))
accuracy = tf.reduce_mean(tf.cast(matches, dtype=tf.float32))

In [10]:
train_size = int(x_vec.shape[0] * 0.7)
X_train, y_train = x_vec[:train_size], y_vec[:train_size]
X_test, y_test = x_vec[train_size:], y_vec[train_size:]

num_epochs = 10
saver = tf.train.Saver()

In [11]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(num_epochs):
        start_time = time.time()
        training_accuracy = 0
        training_loss = 0
        i = 0
        for batch_x, batch_y in generate_batch(X_train, y_train, batch_size):
            i += 1
            batch_accuracy, batch_loss, _= sess.run([accuracy, loss, train], feed_dict = {inputs: batch_x, dec_inputs: batch_y[:, :-1], targets: batch_y[:, 1:]})
            training_loss += batch_loss
            training_accuracy += batch_accuracy
            
        training_loss /= i
        training_accuracy /= i
        print('Epoch: {} \t Time: {:>6.3f}'.format(epoch+1, time.time() - start_time))
        print('---- Loss: {:>6.3f} \t Training Accuracy: {:>6.3f}'.format(training_loss, training_accuracy))
        print('\n')        
        
    saver.save(sess, 'models/model-{}.ckpt'.format(num_epochs))

Epoch: 1 	 Time:  7.608
---- Loss:  1.487 	 Training Accuracy:  0.505


Epoch: 2 	 Time:  7.101
---- Loss:  0.776 	 Training Accuracy:  0.734


Epoch: 3 	 Time:  7.128
---- Loss:  0.519 	 Training Accuracy:  0.840


Epoch: 4 	 Time:  7.097
---- Loss:  0.357 	 Training Accuracy:  0.895


Epoch: 5 	 Time:  7.114
---- Loss:  0.252 	 Training Accuracy:  0.928


Epoch: 6 	 Time:  7.300
---- Loss:  0.182 	 Training Accuracy:  0.949


Epoch: 7 	 Time:  7.163
---- Loss:  0.137 	 Training Accuracy:  0.961


Epoch: 8 	 Time:  7.198
---- Loss:  0.106 	 Training Accuracy:  0.970


Epoch: 9 	 Time:  7.288
---- Loss:  0.085 	 Training Accuracy:  0.977


Epoch: 10 	 Time:  7.317
---- Loss:  0.069 	 Training Accuracy:  0.982




In [14]:
# Testing

test_batch_x, test_batch_y = next(generate_batch(X_test, y_test, batch_size))

with tf.Session() as sess:
    saver.restore(sess, 'models/model-{}.ckpt'.format(num_epochs))
    dec_input = np.zeros((batch_size, 1)) + y_char_to_ix['<GO>']
    
    for i in range(y_seq):
        batch_logits = sess.run(logits, feed_dict = {inputs: test_batch_x, dec_inputs: dec_input})
        pred = batch_logits[:, -1].argmax(axis = -1).reshape(-1, 1)
        dec_input = np.hstack([dec_input, pred])

    print('Test Accuracy: {:>6.3f}'.format(np.mean(dec_input == test_batch_y)))

INFO:tensorflow:Restoring parameters from models/model-10.ckpt
Test Accuracy:  0.977


In [15]:
######################################################### END ##################################################################