# NLP The Office

This notebook trains an NLP model on transcripts of episodes of The Office to classify the speaker of a line as either Jim or Dwight. A second NLP model is then trained to generate lines for Michael.

Steps:

    - WebScrape all episodes of the office
    
    - Use TensorFlow to transform lines into padded sequences amenable to a neural net
    
    - Train a Classification model to determine the speaker based on a line
    
    - Transform all of Michael's lines for a speech generation model
    
    - Train a prediction model to generate Michael's lines

## Libraries

In [None]:
import os
import json
import requests
from bs4 import BeautifulSoup
import urllib3
import shutils
import tqdm
import random
import PIL
import wget

import tensorflow as tf
import numpy as np
import pandas as pd
padder = tf.keras.preprocessing.sequence.pad_sequences


# Scrape Data

In order to train an NLP model, we first need to develop our corpus of text. For that, we will parse through the scripts looking for lines from Jim and Dwight. The individual Office transcripts are listed across 7 different web pages. The first function, urlFinder, generates a URL for each individual script from those 7 web pages. The second function, buildCorpus, builds the actual corpus of text for training.

In [None]:
def urlFinder(urls):
    """Collect the transcript URLs linked from each forum index page.

    urls: iterable of forum index page URLs.
    Returns a list of absolute transcript URLs; the first 'topictitle' link
    on every page is left out.
    """
    base = 'https://transcripts.foreverdreaming.org'
    urlList = []

    for url in urls:
        # Download the index page and parse its HTML
        soup = BeautifulSoup(requests.get(url).content, "html.parser")

        # Each href has its first character dropped before the site base is
        # prepended
        pageLinks = [base + anchor["href"][1:]
                     for anchor in soup.find_all("a", class_='topictitle')]

        # The first link on each page is not kept
        urlList.extend(pageLinks[1:])

    return urlList
    

In [None]:
# Forum index pages that link to the individual transcripts: the first page
# has no offset, the remaining six use start=25, 50, ..., 150
urls = ["https://transcripts.foreverdreaming.org/viewforum.php?f=574"]
urls += [urls[0] + "&start=" + str(offset) for offset in range(25, 151, 25)]

# Resolve every index page into the URLs of its individual scripts
urlList = urlFinder(urls)

# Sanity check: how many scripts were found, and what the first one looks like
print(len(urlList))
print(urlList[0])

In [None]:
def buildCorpus(urlList, characters):
    """Scrape every script in urlList and collect the lines spoken by characters.

    urlList:    iterable of transcript URLs.
    characters: list of speaker tags including the colon, e.g. ["Jim:", "Dwight:"].
    Returns a list of [label, sentence] lists, one per spoken line. An entry
    may have a single element if the passage contains no colon.
    """
    lines = []

    for url in urlList:
        # Get all the text for a given url
        webpage = requests.get(url)
        soup = BeautifulSoup(webpage.content, "html.parser")
        lines.extend(_extractLines(soup.text, characters))

    return lines


def _extractLines(text, characters):
    """Split one script's text into [label, sentence] pairs for the given speakers."""
    found = []
    if not characters:
        return found

    # indexFinder returns -1 once no labeled speaker remains. Checking before
    # the first iteration keeps scripts without any match from contributing a
    # junk entry.
    startInd = indexFinder(text, characters)
    while startInd != -1:

        # Select the portion of the string where a labeled character is speaking
        text = text[startInd:]
        stopInd = text.find("\n")
        if stopInd == -1:
            # Last line of the page has no trailing newline: take all of it
            stopInd = len(text)
        passage = text[:stopInd]

        # Action text in [brackets] is deliberately kept for now
        # (actionRemover is not applied).

        # Split on the first colon only, so colons inside the dialogue stay
        # part of the sentence
        found.append(passage.split(":", 1))

        # Find next occurrence of a labeled speaker
        text = text[stopInd:]
        startInd = indexFinder(text, characters)

    return found


# Identify the next time a labeled character speaks
def indexFinder(text, characters):
    """Return the earliest index in text at which any of characters occurs.

    text:       string to search.
    characters: iterable of speaker tags, e.g. ["Jim:", "Dwight:"].
    Returns the smallest index among the tags that are present, or -1 when
    none of them occurs (including when characters is empty).
    """
    # Only tags that actually occur take part in the minimum; a missing tag
    # must not hide a later line spoken by another character
    positions = [pos
                 for pos in (text.find(character) for character in characters)
                 if pos != -1]

    return min(positions, default=-1)

def actionRemover(passage):
    """Replace every [bracketed action] in passage with a single space.

    passage: string, one line of dialogue.
    Returns the passage with each "[...]" span (brackets included) replaced
    by " ". An opening bracket without a closing one is left untouched.
    """
    start = passage.find("[")
    while start != -1:
        end = passage.find("]", start)
        if end == -1:
            # Unclosed bracket: nothing more can be removed safely
            break

        # Cut out the brackets and everything between them
        passage = passage[:start] + " " + passage[end + 1:]

        # Continue after the replacement
        start = passage.find("[", start)

    return passage
    

## Save / Load Data

Now that the corpus has been built, let's store it as a pickled file so that it can be loaded again quickly without having to scrape it again

In [None]:
# Speaker tags (colon included) whose lines make up the classification corpus
characters = ["Jim:", "Dwight:"]
lines = buildCorpus(urlList, characters)

# Spot-check two of the scraped entries
print(lines[3], lines[9], sep="\n")

In [None]:
import pickle

# Serialise the scraped corpus so it can be reloaded without scraping again
with open('labeledData.txt', 'wb') as fh:
    fh.write(pickle.dumps(lines))

In [None]:
import pickle

# Reload the corpus saved by the previous cell. The context manager closes the
# file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open('labeledData.txt', 'rb') as pickledfile:
    lines = pickle.load(pickledfile)

# Spot-check two of the loaded entries
print(lines[3])
print(lines[9])

## Train and Test splitting

Split the data into train and test arrays.


In [None]:
import random

# Number of entries held out for testing (10% of the corpus)
test_train_split = round(len(lines) * 0.1)

# Shuffle, then slice: the first 10% becomes the test set, the rest the train set
lines = random.sample(lines, len(lines))
test_lines = lines[:test_train_split]
train_lines = lines[test_train_split:]

# Entries with fewer than two parts (no colon in the passage) have no sentence
# and are skipped; element 0 is the speaker label, element 1 the sentence
train_sentences = [entry[1] for entry in train_lines if len(entry) > 1]
train_labels = [entry[0] for entry in train_lines if len(entry) > 1]

test_sentences = [entry[1] for entry in test_lines if len(entry) > 1]
test_labels = [entry[0] for entry in test_lines if len(entry) > 1]

# Show one labeled example and the size of each split
print(train_labels[0])
print(train_sentences[0])

print(len(train_sentences))
print(len(test_sentences))

## Preprocessing Inputs

Before training, the sentences/lines need to be processed before being fed to the NLP model. We will use the Tokenizer and pad_sequences from TensorFlow to create tokenized sequences of equal length. The speaker label will also be tokenized.

In [None]:
# Tokenizer and padding hyper-parameters
max_length = 40
vocab = 80000
oov_token = "<OOV>"
embedding_dim = 100
padding = "post"

padder = tf.keras.preprocessing.sequence.pad_sequences
textTokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab, oov_token=oov_token)

# The vocabulary is learned from the training sentences only
textTokenizer.fit_on_texts(train_sentences)

# Training inputs: token ids, padded/truncated at the end to max_length
train_sequences = np.array(
    padder(textTokenizer.texts_to_sequences(train_sentences),
           maxlen=max_length, padding=padding, truncating="post"))

# Test inputs go through the same tokenizer and padding
test_sequences = np.array(
    padder(textTokenizer.texts_to_sequences(test_sentences),
           maxlen=max_length, padding=padding, truncating="post"))

# Labels: a separate tokenizer turns each speaker name into an integer id
labelTokenizer = tf.keras.preprocessing.text.Tokenizer()
labelTokenizer.fit_on_texts(train_labels)
train_labels_seq = np.array(labelTokenizer.texts_to_sequences(train_labels))
test_labels_seq = np.array(labelTokenizer.texts_to_sequences(test_labels))

# One-hot encode the ids and keep only column 1, giving a single 0/1 target
# that is 1 where the label id is 1
train_labels_enc = tf.keras.utils.to_categorical(train_labels_seq)[:, 1]
test_labels_enc = tf.keras.utils.to_categorical(test_labels_seq)[:, 1]

# Inspect one example and the resulting array shapes
print(train_sequences[1])
print(train_labels_seq[1])

print(train_sequences.shape)
print(train_labels_enc.shape)
print(test_sequences.shape)
print(test_labels_enc.shape)

## Data Cleaning

A lot of lines tend to be on the order of 5 or so words, which is challenging for a classifier model to handle. Although there are dead-giveaway words, like when Jim calls Dwight by his name or vice versa, 5 words isn't enough information to accurately classify a character. shortSentenceRemover removes all sentences that are shorter than a specified length. Although this greatly reduces the amount of training data, its quality greatly increases.

In [None]:
def shortSentenceRemover(labels, sequences, sentenceLength):
    """Drop every example whose sequence has a 0 at position sentenceLength.

    labels:         array with one entry (row) per example.
    sequences:      2-D integer array of token ids, one row per example.
                    Assumes rows are padded with 0 at the end, so a 0 at
                    column sentenceLength means the row holds at most
                    sentenceLength tokens.
    sentenceLength: column index that must be non-zero for a row to be kept.
    Returns (labels, sequences) restricted to the kept rows, in order.
    """
    labels = np.asarray(labels)
    sequences = np.asarray(sequences)

    # Nothing to filter
    if len(sequences) == 0:
        return labels, sequences

    # One boolean mask replaces the row-by-row np.delete loop, which copied
    # both arrays on every removal
    keep = sequences[:, sentenceLength] != 0

    return labels[keep], sequences[keep]


In [None]:
# Column index that must hold a token for an example to be kept
minLength = 5

# Filter the train and test sets with the same threshold
train_labels_enc, train_sequences = shortSentenceRemover(
    labels=train_labels_enc, sequences=train_sequences, sentenceLength=minLength)
test_labels_enc, test_sequences = shortSentenceRemover(
    labels=test_labels_enc, sequences=test_sequences, sentenceLength=minLength)

# Shapes after filtering
print(train_sequences.shape, train_labels_enc.shape,
      test_sequences.shape, test_labels_enc.shape, sep="\n")

## Define Model

For our classifier, we will use a relatively simple NLP model. I wanted to use the Stanford GloVe embedding weights, but the website wouldn't respond when trying to download the weights. Instead, the embedding layer will train itself. The embedding layer is followed by a Bidirectional LSTM to handle the transfer of meaning forwards and backwards in the sentence. Lastly, two dense layers connect to the output. I have the code written such that the model can operate as a binary or multiclass classifier. If the cell containing the characters list is updated to include more characters, the notebook and model will update the preprocessing and output layer of the model.

In [None]:
# Glorot (Xavier) normal initializer shared by the LSTM, Conv1D and dense layers
XavierInit = tf.keras.initializers.GlorotNormal()

def buildModel(max_length, embedding_dim, vocab, characters):
    """Build the (uncompiled) speaker classifier.

    max_length:    number of token ids per input sequence.
    embedding_dim: width of the trainable embedding vectors.
    vocab:         tokenizer vocabulary size; the embedding has vocab+1 rows.
    characters:    list of speaker tags. Currently unused: the output layer
                   is always a single sigmoid unit.
    Returns a tf.keras.Model mapping (batch, max_length) token ids to one
    sigmoid output per example.
    """
    # Embedding Layer
    # shape must be a tuple; "(max_length)" without the comma is a plain int
    sequence = tf.keras.layers.Input(shape = (max_length,))
    x = tf.keras.layers.Embedding(vocab+1, embedding_dim, input_length=max_length)(sequence)

    # Bidirectional LSTM, returning the full sequence for the convolution
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, kernel_initializer=XavierInit))(x)

    # Conv1D over the LSTM outputs, then flatten for the dense layers
    x = tf.keras.layers.Conv1D(128, kernel_size = 3 ,kernel_initializer=XavierInit)(x)
    x = tf.keras.layers.Flatten()(x)

    # Dense Layers
    # NOTE(review): Dropout's argument is the fraction of units dropped, so
    # 0.9 discards 90% of activations -- confirm this is intended.
    x = tf.keras.layers.Dense(64, activation = 'relu', kernel_initializer=XavierInit)(x)
    x = tf.keras.layers.Dropout(0.9)(x)
    x = tf.keras.layers.Dense(64, activation = 'relu', kernel_initializer=XavierInit)(x)
    x = tf.keras.layers.Dropout(0.9)(x)

    # Output Layer
    x = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)

    model = tf.keras.Model(inputs = sequence, outputs=x)

    return model


In [None]:
# Build the classifier and compile it for a single 0/1 target
Adam = tf.keras.optimizers.Adam
model = buildModel(max_length, embedding_dim, vocab, characters)
model.compile(
    optimizer=Adam(learning_rate=0.00009),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

## Training

In [None]:
# Setup callback to stop training if accuracy reaches 95%
class mycallback(tf.keras.callbacks.Callback):
    """Stop training once the epoch's training accuracy exceeds 0.95."""

    def on_epoch_end(self, epoch, logs=None):
        # logs may be None, and 'accuracy' is absent when the metric is not tracked
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > 0.95:
            # Keras checks this flag on the attached model after each epoch
            self.model.stop_training = True
            
acc_Callback = mycallback()

# Stop when the validation loss has not improved for 15 epochs (a sign of
# overfitting) and roll back to the best weights seen
earlyStop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience = 15,
    restore_best_weights = True,
)

# Train on the filtered training set, validating on the filtered test set
history = model.fit(
    train_sequences,
    train_labels_enc,
    validation_data=(test_sequences, test_labels_enc),
    epochs = 500,
    steps_per_epoch = 16,
    batch_size = 128,
    shuffle = True,
    callbacks = [acc_Callback, earlyStop],
)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit
acc = history.history['accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
val_accuracy = history.history['val_accuracy']
epochs = list(range(1, len(acc) + 1))

# Figure 1: training vs validation accuracy
fig, ax = plt.subplots()
ax.plot(epochs, acc)
ax.plot(epochs, val_accuracy)
fig.legend(["Accuracy" ,"Validation Accuracy"])
ax.set(ylabel="Percent Correct", xlabel="Epochs")

# Figure 2: training vs validation loss
fig, ax = plt.subplots()
ax.plot(epochs, loss)
ax.plot(epochs, val_loss)
fig.legend(["Loss" ,"Validation Loss"])
ax.set(ylabel="Loss Function", xlabel="Epochs")


# Line Generation

Now we will build a model that takes a given sequence and predicts the next word. When called repeatedly, the model will generate a sentence. To ensure there is enough data, we will train the model on Michael's lines, and all of the web scraping and data preprocessing from before will be helpful.

## WebScraping

Pull all of Michael's lines and save them to a pickled file.

In [None]:
import pickle

# Only Michael's lines are needed for the generation model
characters = ["Michael:"]
lines = buildCorpus(urlList, characters)

print(lines[3])

# Persist the scraped lines so the scrape does not have to be repeated
with open('Michael.txt', 'wb') as fh:
    fh.write(pickle.dumps(lines))

In [None]:
import pickle

# Reload Michael's labeled lines; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open('Michael.txt', 'rb') as pickledfile:
    label_lines = pickle.load(pickledfile)

# Show an entry from the data that was just loaded (not the stale `lines`)
label_lines[3]

## Input Data Structuring

To train this model, all of Michael's lines need to be broken down so that the preceding words are the inputs and the next word is the label. For example, let's take his infamous line: “If I had a gun with two bullets and I was in a room with Hitler, Bin Laden, and Toby, I would shoot Toby twice.”

This sentence could then be broken up into a number of training examples:

        Input: "If"          Output = "I"
        
        Input: "If I"        Output = "had"
        
        Input: "If I had"    Output = "a"
        
        Input: "If I had a"  Output = "gun"

        etc.


In [None]:
# Keep only the sentence part of each [label, sentence] entry; entries without
# a sentence are skipped
lines = [entry[1] for entry in label_lines if len(entry) > 1]

# Tokenizer hyper-parameters for the generation model
max_length = 20
vocab = 5000
oov_token = "<OOV>"
embedding_dim = 40
padding = "post"
mikeTokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = vocab, oov_token=oov_token)

# Learn the vocabulary and convert every sentence to token ids
mikeTokenizer.fit_on_texts(lines)
mikeSequences = mikeTokenizer.texts_to_sequences(lines)

# Every prefix of length >= 2 becomes one training example: all tokens but the
# last are the input, the last token is the label
input_sequences = [tokens[:end]
                   for tokens in mikeSequences
                   for end in range(2, len(tokens) + 1)]

# Left-pad (and left-truncate) to max_length inputs + 1 label
input_sequences = np.array(
    padder(input_sequences, maxlen=max_length+1, padding="pre", truncating="pre"))

# Split each row into its inputs and its label
labels = input_sequences[:,-1]
inputs = input_sequences[:,:-1]

# Check that the contents and sizes look right
print(inputs[0:3])
print(labels[0:3])
print(inputs.shape)
print(labels.shape)

## Model Creation & Training

In [None]:
def buildInferenceModel(vocab, max_length, embedding_dim=100):
    """Build the (uncompiled) next-word prediction model.

    vocab:         tokenizer vocabulary size; embedding and output layers
                   both have vocab+1 entries.
    max_length:    number of token ids per input sequence.
    embedding_dim: width of the trainable embedding vectors (default 100,
                   the value previously hard-coded).
    Returns a tf.keras.Model mapping (batch, max_length) token ids to a
    softmax over vocab+1 token ids.
    """
    # Input Embedding layer
    # shape must be a tuple; "(max_length)" without the comma is a plain int
    sequence = tf.keras.layers.Input(shape = (max_length,))
    inputs = tf.keras.layers.Embedding(vocab+1, embedding_dim, input_length=max_length)(sequence)

    # Two stacked Bidirectional LSTMs, both returning full sequences
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences = True))(inputs)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences = True))(x)

    # Convolution over the LSTM outputs, flattened for the dense layers
    x = tf.keras.layers.Conv1D(128, kernel_size=5)(x)
    x = tf.keras.layers.Flatten()(x)

    # Dense layers with dropout
    x = tf.keras.layers.Dense(512, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Dense(512, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.5)(x)

    # Output: one probability per token id
    output = tf.keras.layers.Dense(vocab+1, activation = "softmax")(x)

    # Create & return model
    model = tf.keras.Model(sequence,output)

    return model

In [None]:
# Build the generation model and compile it with integer (sparse) labels
mikeModel = buildInferenceModel(vocab, max_length)
Adam = tf.keras.optimizers.Adam
# `learning_rate` is the supported keyword (the `lr` alias is deprecated) and
# matches the classifier's compile call
mikeModel.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.00001), metrics=['accuracy'])

mikeModel.summary()

In [None]:
# Stop when the validation loss has not improved for 15 epochs (a sign of
# overfitting) and roll back to the best weights seen
earlyStop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience = 15,
    restore_best_weights = True,
)

# Train the generation model, holding out 5% of the examples for validation
history = mikeModel.fit(
    inputs,
    labels,
    epochs=500,
    shuffle = True,
    validation_split =  0.05,
    callbacks = [earlyStop],
)

In [None]:
# Per-epoch metrics recorded by mikeModel.fit
acc = history.history['accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
val_accuracy = history.history['val_accuracy']
epochs = list(range(1, len(acc) + 1))

# Figure 1: training vs validation accuracy
fig, ax = plt.subplots()
ax.plot(epochs, acc)
ax.plot(epochs, val_accuracy)
fig.legend(["Accuracy" ,"Validation Accuracy"])
ax.set(ylabel="Percent Correct", xlabel="Epochs")

# Figure 2: training vs validation loss
fig, ax = plt.subplots()
ax.plot(epochs, loss)
ax.plot(epochs, val_loss)
fig.legend(["Loss" ,"Validation Loss"])
ax.set(ylabel="Loss Function", xlabel="Epochs")

## Line Generation

With an NLP model that generates the next word in a sequence, we can start a line and let the model finish it.

In [None]:
def mikeTalk(start, model, tokenizer, num_words):
    """Extend a starting phrase by num_words greedily predicted words.

    start:     one-element list holding the opening text (a bare string is
               also accepted). Only the first text is extended.
    model:     next-word model; model.input_shape[-1] gives the window length
               and model.predict returns one score per token id.
    tokenizer: fitted tokenizer providing texts_to_sequences,
               sequences_to_texts, word_index and oov_token.
    num_words: number of words to append.
    Prints the input and the generated text, and returns the token-id
    sequences (list of lists) with the generated ids appended to the first.
    """
    # A bare string would otherwise be tokenized character by character
    texts = [start] if isinstance(start, str) else list(start)

    # Format input string into something the model can evaluate
    Sequence = tokenizer.texts_to_sequences(texts)

    # Take the window length from the model itself rather than from a
    # notebook-level global that is reassigned between sections
    window = model.input_shape[-1]

    # Token ids that must never be generated: 0 is the padding id and has no
    # word; the OOV id stands for "unknown word"
    blocked = [0]
    oov_token = getattr(tokenizer, "oov_token", None)
    if oov_token is not None and oov_token in tokenizer.word_index:
        blocked.append(tokenizer.word_index[oov_token])

    # Generate one word per iteration up to the desired length
    for _ in range(num_words):

        # Keep the most recent `window` ids and left-pad with zeros
        recent = Sequence[0][-window:]
        Sequence_input = np.zeros((1, window), dtype="int32")
        if recent:
            Sequence_input[0, window - len(recent):] = recent

        # Predict scores for every token id; copy so the model's output is
        # not modified in place
        word_liklihood = np.array(model.predict(Sequence_input), dtype=float)[0]

        # Rule out the blocked ids only, not the whole prediction row
        for token_id in blocked:
            if token_id < word_liklihood.shape[0]:
                word_liklihood[token_id] = -1

        # Pick word with the highest likelihood
        word = int(np.argmax(word_liklihood))

        # Append word
        Sequence[0].append(word)

    # Return and print the input and output for a line
    print("Input: ")
    print(start)
    print("\n")
    print("Output: ")
    print(tokenizer.sequences_to_texts(Sequence))

    return Sequence

In [None]:
# Let the model add four words to the opening phrase
num_words = 4
line = mikeTalk(
    start=["Dwight and Jim"],
    model=mikeModel,
    tokenizer=mikeTokenizer,
    num_words=num_words,
)

