In [1]:
from tensorflow.python.client import device_lib
# Print every compute device TensorFlow reports for this machine
# (the recorded output of this cell lists a single CPU device).
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 16406291302387452936
xla_global_id: -1
]


### Reference
- https://github.com/WillKoehrsen/recurrent-neural-networks/blob/master/notebooks/Deep%20Dive%20into%20Recurrent%20Neural%20Networks.ipynb

In [2]:
import pandas as pd
import numpy as np

# Load the dataset from the CSV file (path is relative to the notebook's working directory).
data = pd.read_csv('data/mpst_full_data.csv')

# Keep only the plot synopsis column.
synopsis = data['plot_synopsis']

# Only the last expression of a cell is rendered, so this cell reports the
# number of synopses. To preview the frame, put `data.head()` as the last
# line of its own cell.
len(synopsis)

14828

In [3]:
from keras.preprocessing.text import Tokenizer
# Small sample containing a figure reference "(1)", commas, periods and a hyphenated word.
sampleText = "This is a short sentence (1) with one reference to an image. This next sentence, while non-sensical, does not have image and has two commas."

# Baseline tokenizer: every character in `filters` is stripped before splitting,
# including '.', ',' and '-'.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts([sampleText])

# Encode the sample as integer ids, then print the ids decoded back to words
# to show what survived tokenization.
s = tokenizer.texts_to_sequences([sampleText])[0]
print(*(tokenizer.index_word[token_id] for token_id in s))

# Vocabulary learned from the sample (displayed as the cell output).
tokenizer.word_index.keys()

this is a short sentence 1 with one reference to an image this next sentence while non sensical does not have image and has two commas


dict_keys(['this', 'sentence', 'image', 'is', 'a', 'short', '1', 'with', 'one', 'reference', 'to', 'an', 'next', 'while', 'non', 'sensical', 'does', 'not', 'have', 'and', 'has', 'two', 'commas'])

In [4]:
import re

def format_text(input):
    """Prepare text so punctuation can be tokenized as separate tokens.

    Steps, applied in order:

    1. Insert a space *before* each of ``. , ; ?`` unless the mark is at the
       start of the string or is preceded by whitespace or a digit (so
       "3.5" is left untouched).
    2. Remove parenthesised numeric references such as "(1)" or "(12)".
    3. Collapse every run of two or more whitespace characters into a
       single space.

    Args:
        input: The raw text to format (str).

    Returns:
        The formatted text (str).
    """

    # Put a space in front of punctuation that directly follows a
    # non-space, non-digit character.
    input = re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', input)

    # Remove references to figures, e.g. "(1)".
    input = re.sub(r'\((\d+)\)', r'', input)

    # Collapse whitespace runs of any length. Matching exactly two characters
    # (r'\s\s') would leave a double space behind for runs of three or more,
    # e.g. after a reference surrounded by extra spaces has been removed.
    input = re.sub(r'\s{2,}', ' ', input)

    return input

# Apply the formatter to the sample sentence; `f` is reused by the next cell.
f = format_text(sampleText)
# Display the formatted string as the cell output.
f

'This is a short sentence with one reference to an image . This next sentence , while non-sensical , does not have image and has two commas .'

In [5]:
# Second tokenizer: the filter list no longer contains ',', '-' or '.', so
# those characters are kept and (after format_text) become tokens of their own.
tokenizer = Tokenizer(filters='!"#$%&()*+/:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts([f])

# Encode the formatted sample and print it decoded back to tokens.
s = tokenizer.texts_to_sequences([f])[0]
print(*(tokenizer.index_word[token_id] for token_id in s))

# Vocabulary learned from the formatted sample (displayed as the cell output).
tokenizer.word_index.keys()

this is a short sentence with one reference to an image . this next sentence , while non-sensical , does not have image and has two commas .


dict_keys(['this', 'sentence', 'image', '.', ',', 'is', 'a', 'short', 'with', 'one', 'reference', 'to', 'an', 'next', 'while', 'non-sensical', 'does', 'not', 'have', 'and', 'has', 'two', 'commas'])

In [6]:
def remove_spaces(input):
    """Attach punctuation back onto the preceding word.

    Any run of whitespace immediately followed by one of ``. , ; ?`` is
    dropped, leaving only the punctuation mark.

    Args:
        input: Text in which punctuation is separated by whitespace (str).

    Returns:
        The text with the whitespace before each of those marks removed (str).
    """
    spaced_punctuation = re.compile(r'\s+([.,;?])')
    # Replace "<whitespace><mark>" with just the captured mark.
    return spaced_punctuation.sub(r'\1', input)

# Decode the token ids from the previous cell back into words, join them with
# spaces, and strip the whitespace in front of punctuation marks.
remove_spaces(' '.join(tokenizer.index_word[i] for i in s))

'this is a short sentence with one reference to an image. this next sentence, while non-sensical, does not have image and has two commas.'