<a href="https://colab.research.google.com/github/sanids/hello_world/blob/master/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Keras 
import keras.utils as ku 
# The Layers we will Use
from keras.layers import Embedding, LSTM, Dense, Dropout
# Tokenizer
from keras.preprocessing.text import Tokenizer
# Our model
from keras.models import Sequential
# To address overfitting
from keras.callbacks import EarlyStopping
# To enable sequencing in our data
from keras.preprocessing.sequence import pad_sequences
# Seeds for reproducibility
from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)
import re
import pandas as pd
import numpy as np
import string, os


Using TensorFlow backend.


In [0]:
# Read both movie-dialogue files into lists of raw text lines.
# `with` closes each file handle as soon as its contents have been read;
# errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
with open('movie_lines.txt', encoding='utf-8', errors='ignore') as lines_file:
    lines = lines_file.read().split('\n')
with open('movie_conversations.txt', encoding='utf-8', errors='ignore') as conv_file:
    conv_lines = conv_file.read().split('\n')


In [0]:
# Peek at the first ten raw records to see the ' +++$+++ '-separated layout.
lines[:10]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.",
 'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow',
 "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.",
 'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No',
 'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?']

In [0]:
# Map each line id (first field) to its utterance text (fifth field).
# Records that do not split into exactly five fields are left out.
split_records = (record.split(' +++$+++ ') for record in lines)
id2line = {fields[0]: fields[4] for fields in split_records if len(fields) == 5}

In [0]:
# Turn the last field of every conversation record into a list of line ids.
# The field's first and last characters are cut off (presumably the brackets
# of a list literal -- confirm against the data file), then quotes and spaces
# are removed and the remainder is split on commas.
# The final element of conv_lines is not processed.
convs = []
for record in conv_lines[:-1]:
    id_field = record.split(' +++$+++ ')[-1]
    bare_ids = id_field[1:-1].replace("'", "").replace(" ", "")
    convs.append(bare_ids.split(','))

In [0]:
# Flatten the conversations into one list of utterance texts, in order.
# NOTE(review): the last id of every conversation is never looked up, so the
# closing utterance of each conversation is left out -- confirm this is intended.
conversations = []
for conv in convs:
    conversations.extend(id2line[line_id] for line_id in conv[:-1])


In [0]:
# Show the first two collected utterances as a sanity check.
print(conversations[0])
print(conversations[1])

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Well, I thought we'd start with pronunciation, if that's okay with you.


In [0]:
# Ordered (pattern, replacement) pairs applied by clean_text.
# Order matters: the specific "won't"/"can't" rules must run before the
# generic "n't" rule, and "n't" must run before the trailing "n'" rule.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"'ll", " will"),
        (r"'ve", " have"),
        (r"'re", " are"),
        (r"'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        # Dropped-g forms such as "goin'" -> "going". The lookahead keeps the
        # rule from firing on possessives like "ben's" (previously "bengs").
        (r"n'(?!\w)", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
    )
)

# Punctuation characters that are deleted outright; the apostrophe is not
# part of this set, so any apostrophe left after the rules above survives.
_PUNCTUATION = re.compile(r"[-()\"#/@;:<>{}`+=~|.!?,]")


def clean_text(text):
    '''Lower-case `text`, expand common English contractions and strip punctuation.

    The contraction rules are applied in the order listed in
    _CONTRACTION_RULES, then every character matched by _PUNCTUATION is
    removed. Whitespace is left untouched, so removing punctuation can leave
    double spaces behind.

    Returns the cleaned string.
    '''
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return _PUNCTUATION.sub("", text)

In [0]:
# Normalise every utterance with clean_text, keeping the original order.
clean_conversations = [clean_text(utterance) for utterance in conversations]

In [0]:
for i in range(0, 3):
    print(clean_conversations[i])

can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again
well i thought we would start with pronunciation if that is okay with you
not the hacking and gagging and spitting part  please


In [0]:
# Single module-level Tokenizer shared by get_sequences (which fits it) and
# generate_text (which uses it to encode the seed and decode predictions).
tokenizer = Tokenizer()


In [0]:
def get_sequences(text):
    '''Fit the shared tokenizer on `text` and build next-word training prefixes.

    A sentence that encodes to the token ids [t0, t1, ..., tn] contributes the
    prefixes [t0, t1], [t0, t1, t2], ..., [t0, ..., tn]. Sentences that encode
    to fewer than two tokens contribute nothing.

    Returns a (sequences, total_words) tuple, where total_words is the size of
    the tokenizer's word_index plus one.
    '''
    # Updates the vocabulary of the module-level tokenizer.
    tokenizer.fit_on_texts(text)
    # Keras' Tokenizer numbers words from 1, so the extra slot accounts for
    # index 0, which pad_sequences uses as the padding value.
    vocabulary_size = len(tokenizer.word_index) + 1
    prefixes = []
    for token_ids in tokenizer.texts_to_sequences(text):
        # Every prefix of length >= 2: the last id is the word to predict,
        # the ids before it are the context.
        prefixes.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))
    return prefixes, vocabulary_size

In [0]:
# Only the first 35000 cleaned utterances are used to fit the tokenizer and
# build the prefixes. `total_words` is later read as a global by
# padded_sequences and by the model definition.
token_conv, total_words = get_sequences(clean_conversations[:35000])

In [0]:
# Show the first five token-id prefixes next to the first cleaned utterance.
print(token_conv[:5])
print(clean_conversations[0])

[[50, 18], [50, 18, 99], [50, 18, 99, 19], [50, 18, 99, 19, 885], [50, 18, 99, 19, 885, 9384]]
can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again


In [0]:
def padded_sequences(sequences, num_classes=None):
    '''Left-pad `sequences` to a common length and split them into inputs and one-hot labels.

    For the phrase "hello new world" the inputs and labels are:
        hello     -> new
        hello new -> world
    i.e. the label is always the last token of a sequence and the input is
    everything before it.

    Args:
        sequences: non-empty collection of token-id lists.
        num_classes: width of the one-hot label vectors. When omitted, the
            module-level `total_words` is used, as it always was.

    Returns:
        (data, label, max_sequence_length) where data is the padded array
        without its last column, label is the one-hot encoding of that last
        column, and max_sequence_length is the length of the longest input
        sequence.

    Raises:
        ValueError: if `sequences` is empty.
    '''
    if len(sequences) == 0:
        raise ValueError("padded_sequences() needs at least one sequence")
    if num_classes is None:
        num_classes = total_words

    # Pad with zeros on the left up to the longest sequence.
    max_sequence_length = max(len(sequence) for sequence in sequences)
    # pad_sequences already returns a NumPy array, so no extra copy is needed.
    padded = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')

    data = padded[:, :-1]
    # One-hot labels, so a larger token id does not read as a larger target value.
    label = ku.to_categorical(padded[:, -1], num_classes=num_classes)
    return data, label, max_sequence_length

In [0]:
# Only the first 30000 prefixes are padded and one-hot encoded for training.
padded_conv, label_conv, max_sequence_length = padded_sequences(token_conv[:30000])

In [0]:
# Show the first padded input row (zeros on the left are padding).
print(padded_conv[0])

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
 50]


In [0]:
# Next-word prediction network: embedding -> LSTM -> dropout -> softmax.
model = Sequential([
    # 10-dimensional embeddings. Inputs are max_sequence_length - 1 tokens long
    # because the last token of every padded sequence was split off as the label.
    Embedding(total_words, 10, input_length=max_sequence_length - 1),
    # LSTM layer with 100 units, followed by 20% dropout.
    LSTM(100),
    Dropout(0.2),
    # Output layer: one softmax probability per vocabulary index.
    Dense(total_words, activation='softmax'),
])
# Labels are one-hot vectors, hence categorical cross-entropy; Adam optimizer.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [0]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 217, 10)           184970    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 18497)             1868197   
Total params: 2,097,567
Trainable params: 2,097,567
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Train for 100 epochs on the padded inputs and one-hot labels.
# NOTE(review): EarlyStopping is imported at the top of the file but no
# callback is passed here -- confirm whether it was meant to be used.
model.fit(padded_conv, label_conv, epochs=100, verbose=2)

Epoch 1/100
 - 217s - loss: 6.5688
Epoch 2/100


In [0]:
def generate_text(seed_text, model, max_sequence_len, num_words=50):
    '''Extend `seed_text` by repeatedly appending the model's most likely next word.

    Args:
        seed_text: text to start from; it is encoded with the module-level tokenizer.
        model: trained model whose predict() returns one probability per vocabulary index.
        max_sequence_len: sequence length used when building the training
            data; inputs are padded/truncated to max_sequence_len - 1.
        num_words: how many words to append (defaults to the previous fixed value, 50).

    Returns:
        The seed text followed by the generated words, title-cased.
    '''
    # Build the reverse lookup once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # argmax over the softmax output selects the same class that
        # predict_classes returned, and also works on Keras versions that no
        # longer provide predict_classes.
        probabilities = model.predict(token_list, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # An index with no word (e.g. 0, the padding value) appends an empty
        # word, exactly as before.
        seed_text += " " + index_to_word.get(predicted_index, "")
    return seed_text.title()

In [0]:
# Generate a continuation for a sample question using the trained model.
print (generate_text("What is the capital of Peru?", model, max_sequence_length))

NameError: ignored