In [1]:
import numpy as np
import pandas as pd
from tokenizer import *

In [2]:
ls data

[0m[01;32mpreprocess.p[0m*             [01;32msimpsons_episodes.csv[0m*   [01;32msimpsons_script_lines.csv[0m*
[01;32msimpsons_characters.csv[0m*  [01;32msimpsons_locations.csv[0m*  vectors.h5


## Preprocess the data
The data comes from the Kaggle dataset [The Simpsons by the Data](https://www.kaggle.com/wcukierski/the-simpsons-by-the-data).

## Load the data

In [3]:
# Read the script lines and join the `raw_text` column into one string,
# with a blank line between consecutive script lines.
#
# Some rows of the CSV have more fields than the header declares; they are
# skipped. `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was
# deprecated in pandas 1.3 and removed in pandas 2.0. 'warn' (rather than
# 'skip') keeps reporting each skipped row, as the old default did.
text = (
    pd.read_csv('data/simpsons_script_lines.csv', on_bad_lines='warn')['raw_text']
    .str.cat(sep='\n\n')
)

b'Skipping line 8084: expected 13 fields, saw 20\nSkipping line 52607: expected 13 fields, saw 21\nSkipping line 59910: expected 13 fields, saw 21\n'
b'Skipping line 71801: expected 13 fields, saw 20\nSkipping line 73539: expected 13 fields, saw 21\nSkipping line 77230: expected 13 fields, saw 21\nSkipping line 78953: expected 13 fields, saw 21\nSkipping line 81138: expected 13 fields, saw 20\nSkipping line 86746: expected 13 fields, saw 22\nSkipping line 101154: expected 13 fields, saw 21\nSkipping line 115438: expected 13 fields, saw 20\nSkipping line 117573: expected 13 fields, saw 22\nSkipping line 130610: expected 13 fields, saw 22\n'
b'Skipping line 152970: expected 13 fields, saw 22\nSkipping line 153017: expected 13 fields, saw 20\nSkipping line 153018: expected 13 fields, saw 30\nSkipping line 154080: expected 13 fields, saw 20\nSkipping line 154082: expected 13 fields, saw 20\nSkipping line 154084: expected 13 fields, saw 20\nSkipping line 154086: expected 13 fields, saw 20\n

In [4]:
# Peek at a 900-character window of the joined script (offsets 8100 to 9000).
text[8100:9000]

"'ve invited us to their homes.\n\nLisa Simpson: But Mom, I want to hear the witty banter of sophisticated adults.\n\nBart Simpson: Yeah, you can't have any fun in bed.\n\nHomer Simpson: (KNOWING CHUCKLE) Oh son, when you're older, you'll know better.\n\nHomer Simpson: Hmmm. (SMACKS HIS LIPS) Oh, baby! Mmmm. Yeah.\n\nMarge Simpson: (FLUSTERED) Oh! They're here! How does everything look?\n\nHomer Simpson: How do I look?\n\nMarge Simpson: Do we have enough glasses?\n\nHomer Simpson: Do we have enough gag ice cubs?\n\nMarge Simpson: Homer! Homer! Put a record on!\n\nHomer Simpson: What are all our friends names again?\n\nMarge Simpson: Children! Go!\n\nNed Flanders: Hey, anybody mind if I serve as bartender? You know, I have a Ph.D in Mixology. (LAUGHS)\n\nMoe Szyslak: (UNDER BREATH) College boy.\n\nNed Flanders: Hey, Homer! Care to try some of my Flanders Planters punch?\n\nHomer Simpson: Why not? I paid for it.\n\nHomer "

## Tokenize

In [5]:
# Show the head of the raw script, tokenize it, then show the head of the result.
# NOTE: `text` is rebound from the raw string to whatever text_to_tokens returns,
# so re-running this cell without re-running the load cell would tokenize tokens.
preview_len = 100
print('Before:', text[:preview_len])
text = text_to_tokens(text)
print('After:', text[:preview_len])

Before: Miss Hoover: No, actually, it was a little of both. Sometimes when a disease is in all the magazines
After: ['[capital]', 'miss', '[space]', '[capital]', 'hoover', '[colon]', '[space]', '[capital]', 'no', '[comma]', '[space]', 'actually', '[comma]', '[space]', 'it', '[space]', 'was', '[space]', 'a', '[space]', 'little', '[space]', 'of', '[space]', 'both', '[period]', '[space]', '[capital]', 'sometimes', '[space]', 'when', '[space]', 'a', '[space]', 'disease', '[space]', 'is', '[space]', 'in', '[space]', 'all', '[space]', 'the', '[space]', 'magazines', '[space]', 'and', '[space]', 'all', '[space]', 'the', '[space]', 'news', '[space]', 'shows', '[comma]', '[space]', 'it', '[apostrophe]', 's', '[space]', 'only', '[space]', 'natural', '[space]', 'that', '[space]', 'you', '[space]', 'think', '[space]', 'you', '[space]', 'have', '[space]', 'it', '[period]', '[return]', '[return]', '[capital]', 'lisa', '[space]', '[capital]', 'simpson', '[colon]', '[space]', '[left_parentheses]', '[ca

## Create word ids

In [6]:
import pickle
from collections import Counter
import spacy

# Build the vocabulary, one embedding vector per vocabulary entry, and the
# id-encoded corpus, then persist all of them under data/.

MIN_WORD_COUNT = 5               # tokens occurring this many times or fewer are dropped
UNKNOWN_TOKEN = 'not_in_vocab'   # stand-in for every token outside the vocabulary
VECTOR_DIM = 300                 # presumably the width of the en_core_web_md vectors - TODO confirm
RANDOM_SEED = 0                  # keeps the random fallback vectors identical between runs

words_count = pd.Series(Counter(text))
print('Unique words before filter: ', len(words_count))

# Filter out uncommon words to keep the model small.
words_count = words_count[words_count > MIN_WORD_COUNT]
print('Unique words after filter: ', len(words_count))

words = list(words_count.index)
words.append(UNKNOWN_TOKEN)

ids = range(len(words))

word_to_id = dict(zip(words, ids))
id_to_word = dict(zip(ids, words))

en = spacy.load('en_core_web_md')

# A dedicated, seeded generator: the fallback vectors no longer depend on the
# state of the global NumPy RNG, so vectors.h5 is reproducible.
rng = np.random.RandomState(RANDOM_SEED)

no_vect_counter = 0  # number of vocabulary entries that got a random vector
vectors = {}
for word in words:
    word_id = word_to_id[word]
    # Tokens listed in token_to_value are looked up under the value they map to.
    if word in token_to_value:
        word = token_to_value[word]
    lexeme = en.vocab[word]  # single vocab lookup instead of one per attribute access
    if lexeme.has_vector:
        vectors[word_id] = lexeme.vector
    else:
        no_vect_counter += 1
        vectors[word_id] = rng.uniform(-2, 2, VECTOR_DIM)

# One row per word id, ordered by id.
vectors = pd.DataFrame(vectors).T.sort_index()

# Encode the corpus; anything outside the vocabulary maps to the unknown-token id.
unknown_id = word_to_id[UNKNOWN_TOKEN]
text_with_ids = [word_to_id.get(token, unknown_id) for token in text]

assert len(vectors) == len(words)

vectors.to_hdf(key='data', path_or_buf='data/vectors.h5')
# Use a context manager so the pickle file is flushed and closed deterministically.
with open('data/preprocess.p', 'wb') as preprocess_file:
    pickle.dump((text_with_ids, word_to_id, id_to_word), preprocess_file)

Unique words before filter:  41818
Unique words after filter:  12664
