In [1]:
# Imports 
import pandas as pd
import numpy as np
import tensorflow as tf
# from nltk.corpus import stopwords
# import re
from collections import Counter
# import operator
from tensorflow.python.layers.core import Dense

In [2]:
#Read TED talk Dataset
def read_ted_talk():
    """Read the TED talk dataset from ``content.txt`` in the working directory.

    Returns:
        list of str: every line of the file, in file order, with the line
        terminators kept (this is what ``readlines`` produces).
    """
    # The context manager closes the handle even when reading raises; the
    # handle was previously left open for the lifetime of the kernel.
    with open("content.txt","r") as file:
        return file.readlines()

In [3]:
# Read every line of content.txt and keep only the first 2000 entries.
# NOTE(review): presumably one talk transcript per line - confirm against the data file.
raw_talks = read_ted_talk()[:2000]

In [4]:
# Print the first loaded entry to eyeball the raw text before any preprocessing.
test_talk = raw_talks[0]
print(test_talk)

Here are two reasons companies fail: they only do more of the same, or they only do what's new. To me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing. Consider Facit. I'm actually old enough to remember them. Facit was a fantastic company. They were born deep in the Swedish forest, and they made the best mechanical calculators in the world. Everybody used them. And what did Facit do when the electronic calculator came along? They continued doing exactly the same. In six months, they went from maximum revenue ... and they were gone. Gone. To me, the irony about the Facit story is hearing about the Facit engineers, who had bought cheap, small electronic calculators in Japan that they used to double-check their calculators. (Laughter) Facit did too much exploitation. But exploration can go wild, too. A few years back, I worked closely alongside a European 

In [5]:
def preprocess_talks(talk_content):
    """Pad punctuation with spaces so a later whitespace split yields it as tokens.

    Params : talk_content - raw text of one talk (str)
    Return : the text with
             * every "..." removed,
             * each of . ? ! ; , : ( ) " surrounded by one space on both sides,
             * every "--" removed,
             * every newline surrounded by one space on both sides.

    Each symbol is padded exactly once. Earlier the "?" rule was applied twice,
    which only produced extra spaces around question marks.
    """
    # Ellipses have to be dropped before the single-period rule, otherwise
    # "..." would be turned into three separate "." tokens.
    talk_content = talk_content.replace("...","")

    for symbol in (".", "?", "!", ";", ",", ":", "(", ")", "\""):
        talk_content = talk_content.replace(symbol, " " + symbol + " ")

    # Note: "..." and "--" are deleted without inserting a space, so text that
    # has no whitespace around them (e.g. "a--b") ends up joined ("ab").
    talk_content = talk_content.replace("--","")
    talk_content = talk_content.replace("\n"," \n ")
    return talk_content

In [6]:
# Spot-check the punctuation spacing on the first loaded entry.
preprocess_talks(raw_talks[0])

'Here are two reasons companies fail :  they only do more of the same ,  or they only do what\'s new .  To me the real ,  real solution to quality growth is figuring out the balance between two activities :  exploration and exploitation .  Both are necessary ,  but it can be too much of a good thing .  Consider Facit .  I\'m actually old enough to remember them .  Facit was a fantastic company .  They were born deep in the Swedish forest ,  and they made the best mechanical calculators in the world .  Everybody used them .  And what did Facit do when the electronic calculator came along  ?   They continued doing exactly the same .  In six months ,  they went from maximum revenue  and they were gone .  Gone .  To me ,  the irony about the Facit story is hearing about the Facit engineers ,  who had bought cheap ,  small electronic calculators in Japan that they used to double-check their calculators .   ( Laughter )  Facit did too much exploitation .  But exploration can go wild ,  too .

In [7]:
# Apply the punctuation spacing to every loaded entry.
processed_talks = [preprocess_talks(talk) for talk in raw_talks]

In [8]:
def tokenize_talks(talks):
    '''
    Lower-case every talk and split it on whitespace.

    Param : talks - iterable of strings
    Return : list with one token list per talk, in the same order
    '''
    return [text.lower().split() for text in talks]

In [9]:
# Lower-case and whitespace-split each preprocessed entry into a list of tokens.
tokenized_talks = tokenize_talks(processed_talks)

In [10]:
def create_vocab(talks, keep_count = 10):
    '''
    Build the vocabulary and both lookup tables from tokenized talks.

    Param : talks - iterable of token lists
            keep_count = 10 - frequency threshold; a token is kept only when it
                              occurs strictly more than keep_count times overall
    Return : vocab (list of kept tokens),
             vocab_to_int (token -> id),
             int_to_vocab (id -> token)
    '''
    token_counts = Counter()
    for talk in talks:
        # update() adds the counts in place. "counter += Counter(talk)" gave the
        # same totals but re-scanned the whole running counter for every talk.
        token_counts.update(talk)

    # Drop every token whose total count is keep_count or lower.
    vocab = [token for token, count in token_counts.items() if count > keep_count]
    int_to_vocab = dict(enumerate(vocab))
    vocab_to_int = {word: i for i, word in int_to_vocab.items()}

    return vocab, vocab_to_int, int_to_vocab

In [11]:
# Build the vocabulary with the default keep_count threshold of 10.
vocab,vocab_to_int,int_to_vocab = create_vocab(tokenized_talks)

In [12]:
# Display the id -> token mapping (this renders the entire dictionary).
int_to_vocab

{0: 'pretty',
 1: 'feel',
 2: 'fuel',
 3: 'barely',
 4: 'fgm',
 5: 'sports',
 6: 'advocates',
 7: 'tragically',
 8: 'minimize',
 9: 'certainty',
 10: 'stocks',
 11: 'shrinking',
 12: 'never',
 13: 'rounded',
 14: 'correlated',
 15: 'blind',
 16: 'golden',
 17: 'straw',
 18: 'whereas',
 19: 'exquisitely',
 20: 'damaged',
 21: 'tiger',
 22: "you've",
 23: 'stain',
 24: 'grade',
 25: 'cafe',
 26: 'waving',
 27: 'precursor',
 28: 'lid',
 29: 'electromagnetic',
 30: 'joshua',
 31: 'regulate',
 32: 'markers',
 33: 'approval',
 34: 'software',
 35: 'fronts',
 36: "tom's",
 37: 'prozac',
 38: 'harmony',
 39: 'mud',
 40: 'coastal',
 41: 'exploitation',
 42: 'tickets',
 43: 'happens',
 44: 'blinded',
 45: 'gothic',
 46: 'washes',
 47: 'prize',
 48: 'survives',
 49: 'stands',
 50: 'hurricane',
 51: 'forming',
 52: 'honey',
 53: 'recycling',
 54: 'stretches',
 55: 'butt',
 56: 'disaster',
 57: 'distort',
 58: 'performing',
 59: 'existence',
 60: 'davis',
 61: 'accused',
 62: 'spike',
 63: 'laborat

In [13]:
# Create Model Inputs i.e tensorflow placeholders
def get_model_inputs():
    '''
    Create the placeholders that are fed on every session run.

    Return : input_, target, learning_rate
             input_ and target are int32 placeholders with two unknown
             dimensions; learning_rate is a float32 placeholder with no
             declared shape.
    '''
    # Both id matrices leave their two dimensions unspecified.
    id_matrix_shape = (None, None)

    input_ = tf.placeholder(tf.int32, id_matrix_shape, name="input")
    target = tf.placeholder(tf.int32, id_matrix_shape, name="target")
    learning_rate = tf.placeholder(tf.float32, name="learning_rate")

    return input_, target, learning_rate

In [14]:
def convert_text_to_int(talks, vocab_to_int):
    '''
    Flatten all talks into one sequence of vocabulary ids.

    Params : talks - iterable of token lists
             vocab_to_int - token -> id lookup
    Return : 1-D numpy array of ids in reading order; tokens that are not
             keys of vocab_to_int are skipped
    '''
    token_ids = [
        vocab_to_int[token]
        for talk in talks
        for token in talk
        if token in vocab_to_int
    ]
    return np.array(token_ids)

In [15]:
def get_batches(talks, vocab_to_int, batch_size, seq_length):
    '''
    Cut the id stream of all talks into (input, target) training batches.

    Params : talks - iterable of token lists
             vocab_to_int - token -> id lookup
             batch_size - number of rows in each batch
             seq_length - number of time steps in each batch
    Return : batches - numpy array of shape
             (batch_count, 2, batch_size, seq_length); index 0 on the second
             axis is the input, index 1 the target
    Raises : ValueError when there are fewer than batch_size * seq_length ids,
             i.e. not even one full batch can be formed
    '''
    source = convert_text_to_int(talks, vocab_to_int)
    ids_per_batch = batch_size * seq_length
    batch_count = len(source) // ids_per_batch

    if batch_count == 0:
        # Without this guard the function failed further down with an opaque
        # IndexError on an empty array.
        raise ValueError(
            "Not enough data for a single batch: got {} ids, need at least {}".format(
                len(source), ids_per_batch))

    # Drop the tail that does not fill a complete batch.
    source = source[:ids_per_batch * batch_count]

    # The target is the input shifted left by one position; the very last
    # target wraps around to the first input id.
    target = np.roll(source, -1)

    # One long row per batch slot, then walk the columns in seq_length windows.
    source_reshaped = np.reshape(source,(batch_size,-1))
    target_reshaped = np.reshape(target,(batch_size,-1))

    batches = [
        (source_reshaped[:, start:start + seq_length],
         target_reshaped[:, start:start + seq_length])
        for start in range(0, source_reshaped.shape[1], seq_length)
    ]

    return np.array(batches)

In [16]:
# Trial run of the batching with batch_size=500 and seq_length=10.
batches = get_batches(tokenized_talks, vocab_to_int, 500, 10)

In [17]:
# Axes: (number of batches, input/target pair, batch_size, seq_length).
batches.shape

(921, 2, 500, 10)

In [18]:
def build_cell(batch_size,rnn_size,keep_prob,num_layer):
    '''
    Stack num_layer dropout-wrapped LSTM cells and create the zero state.

    Params : batch_size - batch dimension used for the zero state
             rnn_size - number of units in every LSTM layer
             keep_prob - passed to DropoutWrapper as input_keep_prob
             num_layer - number of stacked layers
    Return : cell, initial_state
             initial_state is the zero state passed through tf.identity so it
             can be fetched from the graph by the name "initial_state"
    '''
    layers = []
    for _ in range(num_layer):
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        # Dropout is applied to the inputs of each layer only.
        layers.append(tf.contrib.rnn.DropoutWrapper(cell=lstm, input_keep_prob=keep_prob))

    cell = tf.contrib.rnn.MultiRNNCell(layers)

    zero_state = cell.zero_state(batch_size, tf.float32)
    initial_state = tf.identity(zero_state, name="initial_state")

    print("build_cell -> ", initial_state)
    return cell, initial_state

In [19]:
# Build Model
def build_model(embed, cell, rnn_size, keep_prob=0.5, num_layer=5):
    '''
    Unroll `cell` over the embedded input sequence.

    Params : embed, cell, rnn_size, keep_prob=0.5, num_layer=5
             (rnn_size, keep_prob and num_layer are accepted but not used here)
    Return : outputs, final_state
             final_state is passed through tf.identity so it can be fetched
             from the graph by the name "final_state"

    NOTE(review): no initial_state argument is given to dynamic_rnn, only a
    dtype - confirm that starting every run from the default state is intended.
    '''
    rnn_outputs, last_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    named_final_state = tf.identity(last_state, name="final_state")
    return rnn_outputs, named_final_state

In [20]:
def get_embed(input_data, vocab_size, embed_dim):
    '''
    Create the embedding table and look up the vectors for input_data.

    Params : input_data - tensor of token ids
             vocab_size - number of rows in the embedding table
             embed_dim - length of every embedding vector
    Return : embed - the looked-up embedding vectors
    '''
    # Table starts from a truncated normal with standard deviation 0.1.
    initial_values = tf.truncated_normal(shape=(vocab_size, embed_dim), stddev=0.1)
    embedding_table = tf.Variable(initial_values, name="embeding")
    return tf.nn.embedding_lookup(embedding_table, input_data)
    

In [21]:
def build_network(cell, input_data, rnn_size, keep_prob, num_layers, vocab_size, embed_size=None):
    '''
    Wire embedding lookup -> RNN -> linear projection onto the vocabulary.

    Params : cell, input_data, rnn_size, keep_prob, num_layers, vocab_size
             embed_size - length of the embedding vectors; when left as None
                          the notebook-level `embed_dim` setting is used, which
                          is what this function always did implicitly
    Return : logits, final_state
    '''
    if embed_size is None:
        # Backward-compatible fallback to the global hyper-parameter. Passing
        # embed_size explicitly removes the hidden dependency on cell order.
        embed_size = embed_dim

    embed = get_embed(input_data, vocab_size, embed_size)
    outputs, final_state = build_model(embed, cell, rnn_size, keep_prob, num_layers)

    # activation_fn=None keeps the projection linear, so these are raw logits.
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)
    return logits, final_state

In [22]:
# Number of passes over all training batches
num_epochs = 20
# Number of sequences per training batch
batch_size = 2048
# Number of units in every LSTM layer
rnn_size = 256
# Number of stacked LSTM layers
num_layers = 2
# Length of each embedding vector
embed_dim = 200
# Number of time steps per training sequence
seq_length = 20
# Learning rate fed to the optimizer on every step
lr = 0.01
# Keep probability (not drop probability): passed to DropoutWrapper as input_keep_prob
keep_prob = 0.5
# Print the training loss once every n batches
show_every_n_batches = 10

# Path prefix handed to saver.save() after training
save_dir = './save'

In [23]:
# Confirm how many entries were loaded.
len(raw_talks)

2000

In [24]:
# Build Graph
# Everything below is created inside a dedicated graph so it can be handed to
# its own session for training.
train_graph = tf.Graph()
with train_graph.as_default():
    
    # Number of kept tokens; used for the embedding table and the logits width.
    vocab_size = len(int_to_vocab)
    
    input_, target, learning_rate = get_model_inputs()
    
    # Dynamic shape of the fed input, so the batch size is read at run time.
    input_data_shape = tf.shape(input_)
    
    # NOTE(review): keep_prob is a plain Python float here, so it becomes a fixed
    # part of the graph - dropout presumably stays active when the saved graph is
    # reused for generation. Confirm whether a placeholder was intended.
    cell, initial_state = build_cell(batch_size=input_data_shape[0],rnn_size=rnn_size, 
                                     keep_prob=keep_prob, num_layer=num_layers)
    
    # NOTE(review): initial_state is not passed on to build_network, and
    # build_model calls dynamic_rnn without an initial_state argument, so feeding
    # initial_state appears to have no influence on the RNN - confirm.
    logits, final_state = build_network(cell=cell, input_data=input_,rnn_size=rnn_size,
                                        keep_prob=keep_prob,num_layers=num_layers,
                                        vocab_size=vocab_size)

    # Probabilities for generating words
    probs = tf.nn.softmax(logits,name="probs")
    
    # Loss function
    # All-ones weights: every position of every sequence counts equally.
    masks = tf.ones([input_data_shape[0], input_data_shape[1]])
    cost = tf.contrib.seq2seq.sequence_loss(logits,target,masks)
    
    # Optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    
    # Gradient Clipping
    # Clip every gradient element to [-1, 1]; variables without a gradient are skipped.
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

build_cell ->  Tensor("initial_state:0", shape=(2, 2, ?, 256), dtype=float32)


In [25]:
#Training Network

# Rebuild the batches with the configured batch_size / seq_length; this replaces
# the trial `batches` array that was created earlier with 500 x 10.
batches = get_batches(tokenized_talks, vocab_to_int, batch_size, seq_length)
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(num_epochs):
        print(initial_state)
        # Evaluate the zero-state tensor at the start of every epoch. The first
        # batch's inputs are fed because the state's batch dimension is taken
        # from tf.shape(input_).
        state = sess.run(initial_state,feed_dict={input_:batches[0][0]})
        
        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_: x,
                target: y,
                initial_state:state,
                learning_rate: lr}
            # One optimisation step; the fetched final_state is fed back as
            # initial_state for the next batch.
            train_loss, state, _ = sess.run([cost,final_state,train_op], feed)
            
            # Log every show_every_n_batches steps, counted across epochs.
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                    print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                        epoch_i,
                        batch_i,
                        len(batches),
                        train_loss))
            
    # Save Model
    # Runs once, after the last epoch, while the session is still open.
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Tensor("initial_state:0", shape=(2, 2, ?, 256), dtype=float32)
Epoch   0 Batch    0/112   train_loss = 9.511
Epoch   0 Batch   10/112   train_loss = 6.528
Epoch   0 Batch   20/112   train_loss = 6.413
Epoch   0 Batch   30/112   train_loss = 6.373
Epoch   0 Batch   40/112   train_loss = 6.336
Epoch   0 Batch   50/112   train_loss = 6.333
Epoch   0 Batch   60/112   train_loss = 6.322
Epoch   0 Batch   70/112   train_loss = 6.316
Epoch   0 Batch   80/112   train_loss = 6.280
Epoch   0 Batch   90/112   train_loss = 6.229
Epoch   0 Batch  100/112   train_loss = 6.178
Epoch   0 Batch  110/112   train_loss = 6.158
Tensor("initial_state:0", shape=(2, 2, ?, 256), dtype=float32)
Epoch   1 Batch    8/112   train_loss = 6.113
Epoch   1 Batch   18/112   train_loss = 6.090
Epoch   1 Batch   28/112   train_loss = 6.037
Epoch   1 Batch   38/112   train_loss = 6.003
Epoch   1 Batch   48/112   train_loss = 5.983
Epoch   1 Batch   58/112   train_loss = 5.936
Epoch   1 Batch   68/112   train_loss = 5.887


Epoch  14 Batch   12/112   train_loss = 4.671
Epoch  14 Batch   22/112   train_loss = 4.677
Epoch  14 Batch   32/112   train_loss = 4.656
Epoch  14 Batch   42/112   train_loss = 4.655
Epoch  14 Batch   52/112   train_loss = 4.640
Epoch  14 Batch   62/112   train_loss = 4.668
Epoch  14 Batch   72/112   train_loss = 4.659
Epoch  14 Batch   82/112   train_loss = 4.672
Epoch  14 Batch   92/112   train_loss = 4.655
Epoch  14 Batch  102/112   train_loss = 4.654
Tensor("initial_state:0", shape=(2, 2, ?, 256), dtype=float32)
Epoch  15 Batch    0/112   train_loss = 4.642
Epoch  15 Batch   10/112   train_loss = 4.631
Epoch  15 Batch   20/112   train_loss = 4.654
Epoch  15 Batch   30/112   train_loss = 4.666
Epoch  15 Batch   40/112   train_loss = 4.637
Epoch  15 Batch   50/112   train_loss = 4.634
Epoch  15 Batch   60/112   train_loss = 4.643
Epoch  15 Batch   70/112   train_loss = 4.641
Epoch  15 Batch   80/112   train_loss = 4.643
Epoch  15 Batch   90/112   train_loss = 4.635
Epoch  15 Batch  

### Testing

In [26]:
import random
def pick_word(probabilities, int_to_vocab, top_n=10):
    """
    Pick the next word in the generated text
    :param probabilities: Probabilites of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :param top_n: How many of the most probable words to choose between
                  (default 10, the previously hard-coded value)
    :return: String of the predicted word
    :raises ValueError: if top_n is smaller than 1

    The word is drawn uniformly from the top_n candidates; the probabilities
    only decide which words are candidates, not how likely each one is drawn.
    """
    if top_n < 1:
        # A slice of [-0:] would silently select the whole vocabulary.
        raise ValueError("top_n must be at least 1, got {}".format(top_n))

    # argsort is ascending, so the tail holds the ids of the most probable words.
    candidate_ids = np.argsort(probabilities, axis=None)[-top_n:]
    return int_to_vocab[int(random.choice(candidate_ids))]

In [27]:
def get_tensors(loaded_graph):
    """
    Look up the four named tensors needed for generation in <loaded_graph>
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    # Order matters: callers unpack the tuple positionally.
    tensor_names = ('input:0', 'initial_state:0', 'final_state:0', 'probs:0')
    return tuple(loaded_graph.get_tensor_by_name(tensor_name) for tensor_name in tensor_names)

In [28]:
# Seed word for the generated text; it is looked up in vocab_to_int below, so it
# has to be a token that exists in the vocabulary.
word = "in"
# Number of words to generate after the seed
gen_length = 500

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    # "save.meta" is the graph definition written next to the './save' prefix.
    loader = tf.train.import_meta_graph("save.meta")
    loader.restore(sess, tf.train.latest_checkpoint('./'))
    
    # Note: this rebinds the notebook-level names initial_state, final_state and
    # probs to tensors of the loaded graph.
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)
    
    # Sentences generation setup
    gen_sentences = [word]
    # A 1x1 dummy input is fed only to fix the batch dimension of the state to 1.
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})
    
    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        # A single-row batch holding the ids of the last seq_length generated words.
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])
        
        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        # Use the distribution at the last time step of the only row in the batch.
        pred_word = pick_word(probabilities[0][dyn_seq_length-1], int_to_vocab)
        gen_sentences.append(pred_word)
        
    final_talk = " ".join(gen_sentences)
    
    print(final_talk)

INFO:tensorflow:Restoring parameters from ./save
in , and this was the story in which it would actually get this on and on the streets and make them a little way on the left place . and it can be the , so that i would say that the word could become , and then that i thought we could make them to get to our and more . now there's not some point of the future we need to have an . we can get this , that we can do this for all kinds about . i can look back with one , you need a look for you from this in this country to this point . you need , but the way you're not really interested from " " what ? the world was . the only one that they would go , but it's very interesting at their work or even the best way ? how much are people doing , so they can do their work and not ? how they would . and then what happens , the way i think is to do this because it would take the future of it ? what do we have this kind ? we know the future has been a little sticker , we know about . but i was really e