In [1]:
#################################################################################################
################################# Building a ChatBot with Deep NLP.##############################
#################################################################################################

In [2]:
import numpy as np
import tensorflow as tf
import re
import time

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])


In [3]:
# Importing the datasets.

lines = open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')

In [4]:
lines

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.",
 'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow',
 "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.",
 'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No',
 'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?',
 'L868 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ The "real you".',
 'L867 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ What good stuff?',
 "L866 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ I figured yo

In [5]:
conversations

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L367', 'L368']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L401', 'L402', 'L403']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L404', 'L405', 'L406', 'L407']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L575', 'L576']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L577', 'L578']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L662', 'L663']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L693', 'L69

In [6]:
# Creating a dictionary that maps each line and it's id.

id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [7]:
id2line

{'L406427': "But, my dear Duroc, how can I act without guidance from Vienna?  I haven't the slightest idea of how the Emperor Francis might feel about this.",
 'L128913': 'Three seconds.  One...',
 'L259432': "Hey, it's me.",
 'L137485': 'Forget it.  If that charge goes off before we can reach it, this whole damn crevice will slam shut on us.  This way.',
 'L14518': 'No not yet Agnes, very soon but not yet. How did the baby get into you?',
 'L151290': 'You look different. I mean you look good.',
 'L263037': "No, he hasn't.  If I offended you, I'm sorry.",
 'L227848': "Yeah, I guess. I'm just a little fucked up.",
 'L34286': 'Really?  You seriously want to go with me?',
 'L34580': 'Hey.  I got another question for you.',
 'L463430': 'You could start over again with cash like that.',
 'L321864': 'Now what?',
 'L520924': 'Terribly sorry.',
 'L420426': "This is just what they want, Buddy. Don't you see it?  They want to drive us apart.  To beat us.  We can't let them do it.  We've been thr

In [8]:
# Creating a list of all of the Conversations.

conversations_ids = []
for conversation in conversations[:-1]:
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(_conversation.split(','))

In [9]:
conversations_ids

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366'],
 ['L367', 'L368'],
 ['L401', 'L402', 'L403'],
 ['L404', 'L405', 'L406', 'L407'],
 ['L575', 'L576'],
 ['L577', 'L578'],
 ['L662', 'L663'],
 ['L693', 'L694', 'L695'],
 ['L696', 'L697', 'L698', 'L699'],
 ['L860', 'L861'],
 ['L862', 'L863', 'L864', 'L865'],
 ['L866', 'L867', 'L868', 'L869'],
 ['L870', 'L871', 'L872'],
 ['L924', 'L925'],
 ['L984', 'L985'],
 ['L1044', 'L1045'],
 ['L49', 'L50', 'L51'],
 ['L571', 'L572', 'L573'],
 ['L579', 'L580'],
 ['L595', 'L596', 'L597'],
 ['L598', 'L599', 'L600'],
 ['L659', 'L660'],
 ['L952', 'L953'],
 ['L394', 'L395'],
 ['L396', 'L397'],
 ['L589', 'L590', 'L591'],
 ['L592', 'L593'],
 ['L756', 'L757', 'L758'],
 ['L759', 'L760'],
 ['L164', 'L165'],
 ['L319', 'L320'],
 ['L441', 'L442', 'L443', 'L444', 'L445']

In [10]:
# Getting separatelly the Questions and the Answers

questions = []
answers = []
for conversation in conversations_ids:
    for i in range(len(conversation) - 1):
        questions.append(id2line[conversation[i]])
        answers.append(id2line[conversation[i+1]])

In [11]:
questions

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "You're asking me out.  That's so cute. What's your name again?",
 "No, no, it's my fault -- we didn't have a proper introduction ---",
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Why?',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 'Gosh, if only we could find Kat a boyfriend...',
 "C'esc ma tete. This is my head",
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have n

In [12]:
answers

["Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?",
 'Forget it.',
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Seems like she could get a date easy enough...',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 "That's a shame.",
 'Let me see what I can do.',
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have never in my life had to point out my head to someone.",
 "That's because it's such a nice one.",
 'Forget French.',
 "Well, there's someone I think might be --",
 'Where?',
 "I 

In [13]:
questions[1000]

'Turn that off!  Get the bags.'

In [14]:
answers[1000]

'Why should I carry your bag?  I am not a dog.'

In [15]:
# Performing first cleaning of the text

def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", text)
    return text  

In [16]:
# Cleaning the Questions

clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))

In [17]:
# Cleaning the Answers

clean_answers = []
for answer in answers:
    clean_answers.append(clean_text(answer))

In [18]:
clean_questions

['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again',
 'well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 'you are asking me out  that is so cute what is your name again',
 "no no it's my fault  we didn't have a proper introduction ",
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'why',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'gosh if only we could find kat a boyfriend',
 "c'esc ma tete this is my head",
 'right  see  you are ready for the quiz',
 "i don't want to know how to say that though  i want to know useful things like where the good stores are  how much does champagne cost  stuff like chat  i have never in my life had to point out

In [19]:
clean_answers

['well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 "okay then how 'bout we try out some french cuisine  saturday  night",
 'forget it',
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'seems like she could get a date easy enough',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'that is a shame',
 'let me see what i can do',
 'right  see  you are ready for the quiz',
 "i don't want to know how to say that though  i want to know useful things like where the good stores are  how much does champagne cost  stuff like chat  i have never in my life had to point out my head to someone",
 "that is because it's such a nice one",
 'forget french',
 "well there's someone i think might be ",
 'where',
 "i counted on you to help my cause 

In [20]:
# Creating a dictionary that maps each word to its number of occurences.

word2count = {}
for question in clean_questions:
    for word in question.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1
for answer in clean_answers:
    for word in answer.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

In [21]:
word2count

{'brings': 132,
 "defendin'": 1,
 "woulding'": 2,
 'restructure': 2,
 'arthurs': 1,
 'oppose': 9,
 'pitcher': 21,
 'descent!': 1,
 'pangs': 3,
 'hyphenated': 2,
 'doit': 4,
 'victor': 132,
 "mustn't!": 3,
 'snapped': 24,
 'meris': 1,
 'happier': 49,
 'egg': 92,
 'beatty': 3,
 'scanproof': 1,
 "burns'": 5,
 'sayhow': 1,
 'gauche': 2,
 'moray': 5,
 'tired': 648,
 'farty!': 1,
 'truckcab': 4,
 'psychologists': 6,
 'lamppost': 2,
 'italian': 141,
 'suburbia': 2,
 'receice': 1,
 'ss': 14,
 'politiku': 1,
 'gophers': 2,
 'mentioned': 180,
 'peek': 13,
 'neptune': 9,
 'moodslime': 1,
 'bawdy': 1,
 'cannons!': 2,
 'ensured': 2,
 'fragrances': 4,
 'margaret!': 1,
 'selina': 39,
 'manuring': 2,
 'car!': 82,
 'terry': 121,
 'slug': 20,
 'butbut': 16,
 'finance': 43,
 'badand': 1,
 'oopsy': 1,
 'tumbleweed': 3,
 'cranes': 15,
 'grand!': 7,
 'thethe': 12,
 'diversions': 2,
 'humblest': 2,
 'offensive!': 1,
 'muffin': 27,
 'thoughtthey': 1,
 'mrkastle': 1,
 'watcha': 11,
 'memorable': 11,
 'memorial

In [22]:
# Creating two dictionaries that map the questions words and the answers words to a unique intiger.

threshold_questions = 20
questionswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count >= threshold_questions:
        questionswords2int[word] = word_number
        word_number += 1
threshold_answers = 20
answerswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count >= threshold_answers:
        answerswords2int[word] = word_number
        word_number += 1

In [23]:
questionswords2int

{'comb': 7702,
 'brings': 0,
 'linked': 6652,
 'fought': 7094,
 'across': 3723,
 'pitcher': 1,
 'watergate': 7868,
 'snakes': 7704,
 'legally': 6654,
 'grandfather': 1150,
 'holiday': 6842,
 'horny': 6388,
 'victor': 2,
 'tiger': 8203,
 'immigration': 4357,
 'snapped': 3,
 'monkey': 3380,
 'fed': 1151,
 'happier': 4,
 'egg': 5,
 'provided': 5539,
 'outta': 3381,
 'gwen': 4358,
 'technically': 1152,
 'nick!': 3382,
 'la': 3383,
 'thirsty': 2269,
 'goose': 7903,
 'borrowed': 1153,
 'tired': 6,
 'claim': 1906,
 'slob': 1154,
 'personnel': 5541,
 'code': 3384,
 'mindless': 7707,
 'italian': 7,
 'customers': 4360,
 'ankle': 5543,
 'psychic': 1155,
 'norm': 7708,
 'costs': 852,
 'mentioned': 8,
 "stayin'": 4363,
 'reality': 6681,
 'uncomfortable': 2270,
 "year's": 2301,
 'tricky': 6686,
 'information': 2271,
 'jeanne': 6689,
 'bull': 4364,
 'venice': 1156,
 'selina': 9,
 'car!': 10,
 'terry': 11,
 'slug': 12,
 'jets': 4365,
 'slavery': 4366,
 'finance': 13,
 'prepared': 7711,
 'bargain': 436

In [24]:
answerswords2int

{'comb': 7702,
 'brings': 0,
 'linked': 6652,
 'fought': 7094,
 'across': 3723,
 'pitcher': 1,
 'watergate': 7868,
 'snakes': 7704,
 'legally': 6654,
 'grandfather': 1150,
 'holiday': 6842,
 'horny': 6388,
 'victor': 2,
 'tiger': 8203,
 'immigration': 4357,
 'snapped': 3,
 'monkey': 3380,
 'fed': 1151,
 'happier': 4,
 'egg': 5,
 'provided': 5539,
 'outta': 3381,
 'gwen': 4358,
 'technically': 1152,
 'nick!': 3382,
 'la': 3383,
 'thirsty': 2269,
 'goose': 7903,
 'borrowed': 1153,
 'tired': 6,
 'claim': 1906,
 'slob': 1154,
 'personnel': 5541,
 'code': 3384,
 'mindless': 7707,
 'italian': 7,
 'customers': 4360,
 'ankle': 5543,
 'psychic': 1155,
 'norm': 7708,
 'costs': 852,
 'mentioned': 8,
 "stayin'": 4363,
 'reality': 6681,
 'uncomfortable': 2270,
 "year's": 2301,
 'tricky': 6686,
 'information': 2271,
 'jeanne': 6689,
 'bull': 4364,
 'venice': 1156,
 'selina': 9,
 'car!': 10,
 'terry': 11,
 'slug': 12,
 'jets': 4365,
 'slavery': 4366,
 'finance': 13,
 'prepared': 7711,
 'bargain': 436

In [25]:
# Adding the last tokens to this new dictionaries

tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    questionswords2int[token] = len(questionswords2int) + 1
for token in tokens:
    answerswords2int[token] = len(answerswords2int) + 1   

In [26]:
# Creating the inverse dictionary of the answerwords2int dictionary

answersints2word = {w_i: w for w, w_i in answerswords2int.items()}

In [27]:
answersints2word

{0: 'brings',
 1: 'pitcher',
 2: 'victor',
 3: 'snapped',
 4: 'happier',
 5: 'egg',
 6: 'tired',
 7: 'italian',
 8: 'mentioned',
 9: 'selina',
 10: 'car!',
 11: 'terry',
 12: 'slug',
 13: 'finance',
 14: 'muffin',
 15: 'pressing',
 16: 'week!',
 17: 'sleepy',
 18: 'yonder',
 19: 'sink',
 20: 'bully',
 21: 'shuffleboard',
 22: 'insane!',
 23: 'tenth',
 24: 'masters',
 25: 'slots',
 26: 'knocking',
 27: 'episode',
 28: 'moscow',
 29: 'pritchett',
 30: 'study',
 31: 'branch',
 32: 'multiple',
 33: 'grateful',
 34: 'nell',
 35: 'fergus',
 36: 'highest',
 37: 'negro',
 38: 'hire',
 39: 'taxi',
 40: 'torture',
 41: 'worthwhile',
 42: 'heaven',
 43: 'facilities',
 44: 'williamson',
 45: 'sort',
 46: 'dangerous',
 47: 'knots',
 48: 'warn',
 49: 'scars',
 50: 'also',
 51: 'mechanic',
 52: 'nightmare',
 53: 'became',
 54: 'argue',
 55: 'survive',
 56: 'inject',
 57: 'cities',
 58: 'engineering',
 59: 'complaints',
 60: 'spacecraft',
 61: 'stake',
 62: 'slowed',
 63: 'devil',
 64: 'protest',
 65:

In [28]:
# Adding the end of string token to the end of every answer

for i in range(len(clean_answers)):
    clean_answers[i] += ' <EOS>'

In [29]:
clean_answers

['well i thought we would start with pronunciation if that is okay with you <EOS>',
 'not the hacking and gagging and spitting part  please <EOS>',
 "okay then how 'bout we try out some french cuisine  saturday  night <EOS>",
 'forget it <EOS>',
 'cameron <EOS>',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does <EOS>',
 'seems like she could get a date easy enough <EOS>',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something <EOS>',
 'that is a shame <EOS>',
 'let me see what i can do <EOS>',
 'right  see  you are ready for the quiz <EOS>',
 "i don't want to know how to say that though  i want to know useful things like where the good stores are  how much does champagne cost  stuff like chat  i have never in my life had to point out my head to someone <EOS>",
 "that is because it's such a nice one <EOS>",
 'forget french <EOS>',
 "wel

In [30]:
# Translating all the questions and answers into integers
# and replacing all the words that were filtered out by <OUT>

questions_into_int = []
for question in clean_questions:
    ints = []
    for word in question.split():
        if word not in questionswords2int:
            ints.append(questionswords2int['<OUT>'])
        else:
            ints.append(questionswords2int[word])
    questions_into_int.append(ints)
answers_into_int = []
for answer in clean_answers:
    ints = []
    for word in answer.split():
        if word not in answerswords2int:
            ints.append(answerswords2int['<OUT>'])
        else:
            ints.append(answerswords2int[word])
    answers_into_int.append(ints) 

In [31]:
questions_into_int

[[6552,
  3412,
  5101,
  734,
  3160,
  8824,
  8824,
  884,
  8259,
  8824,
  5412,
  1536,
  8376,
  7213,
  8824,
  7375,
  444,
  5658,
  6100,
  6181,
  8824,
  1472],
 [3988,
  1923,
  5146,
  3412,
  3954,
  4563,
  3131,
  8824,
  2894,
  6877,
  1640,
  7113,
  3131,
  8571],
 [6079, 6181, 8824, 884, 8824, 884, 8824, 859, 3818],
 [8571,
  5412,
  4546,
  7152,
  4395,
  6877,
  1640,
  647,
  4840,
  2760,
  1640,
  3815,
  6207,
  1472],
 [693, 693, 2566, 5257, 5870, 3412, 5093, 4155, 4911, 7714, 8824],
 [6020],
 [6181,
  8712,
  1640,
  6020,
  1923,
  4421,
  2685,
  6181,
  4852,
  3551,
  4911,
  6801,
  8824,
  719,
  3551,
  5952,
  5257,
  1463,
  1923,
  1838,
  1343,
  1451,
  5173,
  3638],
 [738],
 [8824,
  7240,
  5173,
  1801,
  297,
  4632,
  1952,
  5653,
  8538,
  5173,
  7410,
  5024,
  3558,
  2482,
  4935,
  4598,
  3895,
  1015,
  5173,
  6942,
  7895,
  3551,
  4935,
  3292,
  973],
 [3095, 2894, 6119, 3412, 4361, 2082, 1643, 4911, 5753],
 [8824, 7866, 8

In [32]:
answers_into_int

[[3988,
  1923,
  5146,
  3412,
  3954,
  4563,
  3131,
  8824,
  2894,
  6877,
  1640,
  7113,
  3131,
  8571,
  8823],
 [6079, 6181, 8824, 884, 8824, 884, 8824, 859, 3818, 8823],
 [7113, 2482, 815, 8790, 3412, 3054, 4395, 3616, 5963, 8824, 5990, 2006, 8823],
 [2092, 4935, 8823],
 [6020, 8823],
 [6181,
  8712,
  1640,
  6020,
  1923,
  4421,
  2685,
  6181,
  4852,
  3551,
  4911,
  6801,
  8824,
  719,
  3551,
  5952,
  5257,
  1463,
  1923,
  1838,
  1343,
  1451,
  5173,
  3638,
  8823],
 [1474, 1015, 5173, 4361, 2680, 4911, 1343, 6931, 8232, 8823],
 [8824,
  7240,
  5173,
  1801,
  297,
  4632,
  1952,
  5653,
  8538,
  5173,
  7410,
  5024,
  3558,
  2482,
  4935,
  4598,
  3895,
  1015,
  5173,
  6942,
  7895,
  3551,
  4935,
  3292,
  973,
  8823],
 [6877, 1640, 4911, 5790, 8823],
 [4956, 7152, 2197, 2760, 1923, 6552, 500, 8823],
 [6492, 2197, 8571, 5412, 6359, 6856, 6181, 8824, 8823],
 [1923,
  7810,
  1711,
  297,
  8717,
  815,
  297,
  4125,
  6877,
  7933,
  1923,
  1711,


In [33]:
# Sorting questions and answers by the length of questions.

sorted_clean_questions = []
sorted_clean_answers = []
for length in range(1, 25 + 1):
    for i in enumerate(questions_into_int):
        if len(i[1]) == length:
            sorted_clean_questions.append(questions_into_int[i[0]])
            sorted_clean_answers.append(answers_into_int[i[0]])

In [34]:
sorted_clean_questions

[[6020],
 [738],
 [7876],
 [6161],
 [6634],
 [693],
 [7226],
 [693],
 [4588],
 [7554],
 [5169],
 [2760],
 [738],
 [6634],
 [738],
 [4883],
 [6479],
 [5972],
 [1203],
 [5604],
 [2760],
 [1350],
 [4588],
 [7113],
 [738],
 [4588],
 [8824],
 [5228],
 [2144],
 [4588],
 [3974],
 [8824],
 [8824],
 [757],
 [815],
 [8824],
 [8824],
 [2760],
 [693],
 [2760],
 [8824],
 [6161],
 [6492],
 [2760],
 [6896],
 [8824],
 [1071],
 [693],
 [693],
 [3788],
 [8824],
 [1621],
 [693],
 [239],
 [8538],
 [5604],
 [693],
 [7760],
 [8454],
 [5604],
 [5016],
 [5016],
 [5016],
 [5016],
 [5604],
 [5878],
 [8268],
 [7113],
 [6359],
 [757],
 [2760],
 [5447],
 [7443],
 [8824],
 [8824],
 [258],
 [5604],
 [220],
 [2760],
 [2760],
 [5604],
 [4307],
 [4307],
 [4307],
 [4307],
 [4307],
 [4307],
 [7113],
 [757],
 [1952],
 [757],
 [239],
 [815],
 [7597],
 [4307],
 [4307],
 [4307],
 [8824],
 [4307],
 [4307],
 [1952],
 [7850],
 [138],
 [4287],
 [5604],
 [6207],
 [5604],
 [8192],
 [5604],
 [2357],
 [5604],
 [3988],
 [7876],
 [276

In [35]:
sorted_clean_answers

[[6181,
  8712,
  1640,
  6020,
  1923,
  4421,
  2685,
  6181,
  4852,
  3551,
  4911,
  6801,
  8824,
  719,
  3551,
  5952,
  5257,
  1463,
  1923,
  1838,
  1343,
  1451,
  5173,
  3638,
  8823],
 [8824,
  7240,
  5173,
  1801,
  297,
  4632,
  1952,
  5653,
  8538,
  5173,
  7410,
  5024,
  3558,
  2482,
  4935,
  4598,
  3895,
  1015,
  5173,
  6942,
  7895,
  3551,
  4935,
  3292,
  973,
  8823],
 [3042, 8823],
 [5315, 1015, 5273, 4841, 4395, 8268, 6511, 8823],
 [8571, 3997, 7100, 734, 5253, 8823],
 [7113, 8571, 5412, 2283, 5079, 297, 176, 815, 297, 6762, 8823],
 [5249, 8123, 8823],
 [8571, 6683, 1280, 8754, 6312, 4935, 8823],
 [1926, 8823],
 [3954, 8571, 875, 6436, 7152, 4911, 6742, 6020, 8823],
 [5257,
  2151,
  8662,
  1923,
  4155,
  6942,
  4911,
  1527,
  5991,
  2685,
  4931,
  6181,
  8824,
  385,
  999,
  4730,
  8823],
 [189, 8824, 6856, 4911, 3970, 8823],
 [4400, 4598, 1015, 4911, 1616, 8656, 8823],
 [108,
  6877,
  1923,
  2232,
  1923,
  3954,
  4883,
  500,
  1093,

In [36]:
########################################## Buildimg the Sec2Sec model ##########################################

In [37]:
# Creating placeholers for the inputs and the targets.
# In Tensorflow all the variables are used in Tensors(Advanced numpy Array). Calling the 4 Tensors.

def model_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name = 'input')
    targets = tf.placeholder(tf.int32, [None, None], name = 'target')
    lr = tf.placeholder(tf.float32, name = 'learning_rate')
    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')
    return inputs, targets, lr, keep_prob

In [38]:
# Preprocessing the targets.
# Adding batch size rows and 1 column with word2int['<SOS>'] value on left side of each batch and removing the right side last value(EOS token).
# fill is to form a new tensor matrix, strided_slice is to extract a subset from a tensor.(upper left to right corner)

def preprocess_targets(targets, word2int, batch_size):
    left_side = tf.fill([batch_size, 1], word2int['<SOS>'])
    right_side = tf.strided_slice(targets, [0,0], [batch_size, -1], [1,1])
    preprocessed_targets = tf.concat([left_side, right_side], 1)
    return preprocessed_targets

In [39]:
# Creating the Encoder RNN Layer.
# rnn_inputs taken from the model_inputs function.
# rnn_size is number of input tensors of the encoder rnn layer we are making now.
# sequence_length is the list of the lenth of each question in a batch. 
# here encoder_cell is composed of several LSTM RNN Layers with dropout applied on each of them.
# _, encoder_state. this encoder state creates a dynamic version of a bidirectional rnn

def encoder_rnn(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
    encoder_output, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw = encoder_cell,
                                                                    cell_bw = encoder_cell,
                                                                    sequence_length = sequence_length,
                                                                    inputs = rnn_inputs,
                                                                    dtype = tf.float32)

In [40]:
# Decoding the Training set.
# an Embedding is a mapping from discrete objects , such as words to vectors of real numbers.
# decoder only accepts the embedded inputs(decoder_embedded_input).
# output_function is the function used to return the decoder out from the decoder_rnn. 
# attention_states is a three dimentional zero matrix. 
# attention keys are the keys is to be compared with the target states.
# attention values are the values that we will use to construct the context vectors.context is returned by the encoder nad used by the decoder.


def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attention_states, attention_option = "bahdanau", num_units = decoder_cell.output_size)
    training_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(encoder_state[0],
                                                                              attention_keys,
                                                                              attention_values,
                                                                              attention_score_function,
                                                                              attention_construct_function,
                                                                              name = "attn_dec_train")
    decoder_output, decoder_final_state, decoder_final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                                                              training_decoder_function,
                                                                                                              decoder_embedded_input,
                                                                                                              sequence_length,
                                                                                                              scope = decoding_scope)
    decoder_output_dropout = tf.nn.dropout(decoder_output, keep_prob)
    return output_function(decoder_output_dropout)

In [41]:
# Decoding the Test / Validation Set.

def decode_test_set(encoder_state, decoder_cell, decoder_embeddings_matrix, sos_id, eos_id, maximum_length, num_words, decoding_scope, output_function, keep_prob, batch_size):
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attention_states, attention_option = "bahdanau", num_units = decoder_cell.output_size)
    test_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_inference(output_function,
                                                                              encoder_state[0],
                                                                              attention_keys,
                                                                              attention_values,
                                                                              attention_score_function,
                                                                              attention_construct_function,
                                                                              decoder_embeddings_matrix,
                                                                              sos_id,
                                                                              eos_id,
                                                                              maximum_length,
                                                                              num_words,
                                                                              name = "attn_dec_inf")
    test_predictions, decoder_final_state, decoder_final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                                                                test_decoder_function,
                                                                                                                scope = decoding_scope)
    return test_predictions

In [42]:
# Creating the Decoder RNN

def decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix, encoder_state, num_words, sequence_length, rnn_size, num_layers, word2int, keep_prob, batch_size):
    with tf.variable_scope("decoding") as decoding_scope:
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
        decoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
        weights = tf.truncated_normal_initializer(stddev = 0.1)
        biases = tf.zeros_initializer()
        output_function = lambda x: tf.contrib.layers.fully_connected(x,
                                                                      num_words,
                                                                      None,
                                                                      scope = decoding_scope,
                                                                      weights_initializer = weights,
                                                                      biases_initializer = biases)
        training_predictions = decode_training_set(encoder_state,
                                                   decoder_cell,
                                                   decoder_embedded_input,
                                                   sequence_length,
                                                   decoding_scope,
                                                   output_function,
                                                   keep_prob,
                                                   batch_size)
        decoding_scope.reuse_variables()
        test_predictions = decode_test_set(encoder_state,
                                           decoder_cell,
                                           decoder_embeddings_matrix,
                                           word2int['<SOS>'],
                                           word2int['<EOS>'],
                                           sequence_length - 1,
                                           num_words,
                                           decoding_scope,
                                           output_function,
                                           keep_prob,
                                           batch_size)
    return training_predictions, test_predictions

In [43]:
# Building the Sec2Sec model.

def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words, questions_num_words, encoder_embedding_size, decoder_embedding_size, rnn_size, num_layers, questionswords2int):
    encoder_embedded_input = tf.contrib.layers.embed_sequence(inputs,
                                                              answers_num_words + 1,
                                                              encoder_embedding_size,
                                                              initializer = tf.random_uniform_initializer(0, 1))
    encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, sequence_length)
    preprocessed_targets = preprocess_targets(targets, questionswords2int, batch_size)
    decoder_embeddings_matrix = tf.Variable(tf.random_uniform([questions_num_words + 1, decoder_embedding_size], 0, 1))
    decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)
    training_predictions, test_predictions = decoder_rnn(decoder_embedded_input,
                                                         decoder_embeddings_matrix,
                                                         encoder_state,
                                                         questions_num_words,
                                                         sequence_length,
                                                         rnn_size,
                                                         num_layers,
                                                         questionswords2int,
                                                         keep_prob,
                                                         batch_size)
    return training_predictions, test_predictions

In [44]:
# Setting the Hyperparameters.

epochs = 100
batch_size = 64
rnn_size = 512
num_layers = 3
encoding_embedding_size = 512
decoding_embedding_size = 512
learning_rate = 0.01
learning_name_decay = 0.9
min_learning_rate = 0.0001
keep_probability = 0.5

In [45]:
# Defining a session

tf.reset_default_graph()
session = tf.InteractiveSession()

In [46]:
# Loading the model inputs
 
inputs, targets, lr, keep_prob = model_inputs()

In [47]:
# Setting the sequence_length

sequence_length = tf.placeholder_with_default(25, None, name = 'sequence_length')

In [48]:
# Getting the shape of the inputs tensor.

input_shape = tf.shape(inputs)

In [49]:
# Getting the training and test predictions.

training_predictions, test_predictions = seq2seq_model(tf.reverse(inputs, [-1]),
                                                       targets,
                                                       keep_prob,
                                                       batch_size,
                                                       sequence_length,
                                                       len(answerswords2int),
                                                       len(questionswords2int),
                                                       encoding_embedding_size,
                                                       decoding_embedding_size,
                                                       rnn_size,
                                                       num_layers,
                                                       questionswords2int)

TypeError: 'NoneType' object is not subscriptable