In [1]:
import numpy as np
import pandas as pd

## Preprocess the data
Data from [Kaggle](https://www.kaggle.com/wcukierski/the-simpsons-by-the-data).

## Load the data

In [2]:
# Load the script CSV, keep the `raw_text` column and join all of its entries
# into one string, with a blank line between consecutive entries.
# Rows with an unexpected number of fields are skipped with a warning.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines='warn'` is its skip-and-warn replacement.
try:
    script_lines = pd.read_csv('data/simpsons_script_lines.csv', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`; use the legacy keyword
    script_lines = pd.read_csv('data/simpsons_script_lines.csv', error_bad_lines=False)
text = script_lines['raw_text'].str.cat(sep='\n\n')

b'Skipping line 8084: expected 13 fields, saw 20\nSkipping line 52607: expected 13 fields, saw 21\nSkipping line 59910: expected 13 fields, saw 21\n'
b'Skipping line 71801: expected 13 fields, saw 20\nSkipping line 73539: expected 13 fields, saw 21\nSkipping line 77230: expected 13 fields, saw 21\nSkipping line 78953: expected 13 fields, saw 21\nSkipping line 81138: expected 13 fields, saw 20\nSkipping line 86746: expected 13 fields, saw 22\nSkipping line 101154: expected 13 fields, saw 21\nSkipping line 115438: expected 13 fields, saw 20\nSkipping line 117573: expected 13 fields, saw 22\nSkipping line 130610: expected 13 fields, saw 22\n'
b'Skipping line 152970: expected 13 fields, saw 22\nSkipping line 153017: expected 13 fields, saw 20\nSkipping line 153018: expected 13 fields, saw 30\nSkipping line 154080: expected 13 fields, saw 20\nSkipping line 154082: expected 13 fields, saw 20\nSkipping line 154084: expected 13 fields, saw 20\nSkipping line 154086: expected 13 fields, saw 20\n

# Tokenize

In [3]:
# Square brackets are reserved for token markers, so any that occur in the
# script itself are mapped to round parentheses.
text = text.translate(str.maketrans('[]', '()'))

## Tokenize characters 

In [4]:
# characters = pd.read_csv('data/simpsons_characters.csv')[['name', 'normalized_name']]
# characters = characters.set_index('name')
# characters = characters.str.replace(' ', '_')
# characters = characters[characters.index != characters.values]

# print('Before:', text[:100])
# for key, token in characters.items():
#     text = text.replace(key, token)
# print('After:',text[:100])

## Tokenize the capital letters

In [5]:
# Disabled alternative: mark every capital letter with a '[capital]' token
# placed before its lower-case form, instead of discarding case information.
# import string
# token = '[capital]'

# letters = string.ascii_uppercase
# for letter in letters:
#     text = text.replace(letter, token + letter.lower())

# Active behaviour: lower-case the whole script.
text = text.lower()

### Tokenize punctuation
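Punctuation and whitespace characters are replaced with bracketed tokens (for example `[period]` or `[space]`) to help the neural network distinguish between words like "bye" and "bye!". The script is later split into a word array using these tokens as delimiters. Replacing character names with their normalized names is currently disabled (see the commented-out cell above).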

In [6]:
# Lookup table: each punctuation / whitespace character -> its bracketed token
tokens = {
    '.': '[period]',
    ',': '[comma]',
    '"': '[quotation_mark]',
    "'": '[apostrophe]',
    ':': '[colon]',
    ';': '[semicolon]',
    '!': '[exclamation_mark]',
    '?': '[question_mark]',
    '(': '[left_parentheses]',
    ')': '[right_parentheses]',
    '-': '[dash]',
    '/': '[fslash]',
    '\\': '[bslash]',
    '\n': '[return]',
    '\t': '[tab]',
    ' ': '[space]',
}
# NOTE: tokens for characters from the show are not defined here

# will add locations later
# locations = pd.read_csv('data/simpsons_locations.csv')['normalized_name']

print('Before:', text[:100])
# Every key is a single character and no token contains a key, so a single
# translate() pass yields the same result as replacing the keys one by one.
text = text.translate(str.maketrans(tokens))
print('After:',text[:100])

Before: miss hoover: no, actually, it was a little of both. sometimes when a disease is in all the magazines
After: miss[space]hoover[colon][space]no[comma][space]actually[comma][space]it[space]was[space]a[space]litt


In [9]:
import pickle
from collections import Counter
import re

# A word is kept only if it occurs more than this many times in the script
MIN_WORD_COUNT = 10

# Split on bracketed tokens; the capturing group keeps every token in the
# result as its own element, next to the words found between tokens.
text_list = re.split(r"(\[.*?\])", text)
# The split result contains empty strings, so drop them
text_list = [piece for piece in text_list if piece]

words_count = pd.Series(Counter(text_list))
print('Unique words before filter: ', len(words_count))
# Drop uncommon words to make the model smaller
words_count = words_count[words_count > MIN_WORD_COUNT]
print('Unique words after filter: ', len(words_count))

words = list(words_count.index)
ids = range(len(words))

# Two-way lookup between words and their integer ids
word_to_id = dict(zip(words, ids))
id_to_word = dict(zip(ids, words))

# Encode the script as a list of ids; words removed by the filter are skipped
text_with_ids = [word_to_id[word] for word in text_list if word in word_to_id]

# Use a context manager so the output file is flushed and closed
# (the handle was previously left open after pickle.dump).
with open('data/preprocess.p', 'wb') as preprocess_file:
    pickle.dump((text_with_ids, word_to_id, id_to_word, tokens), preprocess_file)

Unique words before filter:  41833
Unique words after filter:  8493
