## Single Document

Tokenisation of a single document in `string`format

In [47]:
# Splits words by space (split=” “).
# Filters out punctuation (filters=’!”#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n’).
# Converts text to lowercase (lower=True).

from keras.preprocessing.text import text_to_word_sequence

# Corpus
corpus = ['With Leonard, Howard, Raj, and Amy accomplishing so much on their respective projects, Sheldon is forced to admit he has nothing important upon which to work.', 
          'He makes Amy leave the apartment for a few days so he can focus, but cannot come up with any ideas and calls his mother as a distraction.',
          'Leonard and Amy have fun recreating experiments from when they were growing up, boring Penny, so she eats with Sheldon as he mulls over his scientific studies.',
          'Penny helps him realize that his study of dark matter is his rebound science from string theory, which Sheldon admits he never truly disregarded, but explaining string theory to her inspires Sheldon, helping him discover a potential breakthrough in the field.',
          'Meanwhile, Howard is too busy with his family to be in the band with Raj, so Raj brings in Bert.',
          'But when Howard annoys Bernadette by writing an astronaut-themed musical while she is on bed rest, she makes him rejoin the band.',
          "The three are poorly received at a Bar mitzvah after singing Bert's original song about the boulder from Raiders of the Lost Ark."]

# Just tokenisation
words = text_to_word_sequence(corpus[0])
print(words)
print(len(words),'\n')

# Estimate size of vocabulary
unique_words = set(text_to_word_sequence(corpus[0]))
print(unique_words)

vocab_size = len(words)
print(vocab_size)

['with', 'leonard', 'howard', 'raj', 'and', 'amy', 'accomplishing', 'so', 'much', 'on', 'their', 'respective', 'projects', 'sheldon', 'is', 'forced', 'to', 'admit', 'he', 'has', 'nothing', 'important', 'upon', 'which', 'to', 'work']
26 

{'respective', 'to', 'important', 'howard', 'their', 'has', 'leonard', 'work', 'on', 'accomplishing', 'amy', 'projects', 'forced', 'nothing', 'admit', 'is', 'with', 'much', 'so', 'and', 'upon', 'which', 'sheldon', 'he', 'raj'}
26


In [48]:
from keras.preprocessing.text import one_hot

# integer encode the document
result = one_hot(corpus[0], round(vocab_size*1.3))
print(result)

[33, 2, 12, 24, 16, 6, 18, 13, 6, 31, 15, 3, 21, 32, 33, 22, 31, 9, 27, 10, 14, 22, 33, 28, 31, 15]


## Multiple Documents

Standard Word Level Tokenisers for multiple documents in a list corpus

In [155]:
# Corpus
corpus = ['With Leonard, Howard, Raj, and Amy accomplishing so much on their respective projects, Sheldon is forced to admit he has nothing important upon which to work.', 
          'He makes Amy leave the apartment for a few days so he can focus, but cannot come up with any ideas and calls his mother as a distraction.',
          'Leonard and Amy have fun recreating experiments from when they were growing up, boring Penny, so she eats with Sheldon as he mulls over his scientific studies.',
          'Penny helps him realize that his study of dark matter is his rebound science from string theory, which Sheldon admits he never truly disregarded, but explaining string theory to her inspires Sheldon, helping him discover a potential breakthrough in the field.',
          'Meanwhile, Howard is too busy with his family to be in the band with Raj, so Raj brings in Bert.',
          'But when Howard annoys Bernadette by writing an astronaut-themed musical while she is on bed rest, she makes him rejoin the band.',
          "The three are poorly received at a Bar mitzvah after singing Bert's original song about the boulder from Raiders of the Lost Ark."]

labels = np.array([0,0,1,1,0,1,1])

### Tokenise Documents (Word Level)

tokeniser in `preprocessing.text` 


In [147]:
'''

GENERATE TOKENS

'''

from keras.preprocessing.text import Tokenizer

t = Tokenizer(oov_token='') # create the tokenizer
t.fit_on_texts(corpus) # fit the tokenizer on the documents

# Dictionary
print(t.word_index,'\n')


'''

GENERATE FEATURE MATRIX
tokenise.text_to_matrix

'''

# ‘binary‘: Whether or not each word is present in the document. This is the default. OHE
# ‘count‘: The count of each word in the document. BOW
# ‘tfidf‘: The Text Frequency-Inverse DocumentFrequency (TF-IDF) scoring for each word in the document.
# ‘freq‘: The frequency of each word as a ratio of words within each document.

encoded_docs = t.texts_to_matrix(corpus, mode='binary')
print(encoded_docs.shape)
encoded_docs[0] # one of the documents 

'''

PADDING
Convert all encoded documents to the same length

'''

# Check length of feature matrix

import numpy as np

encoded_docs = t.texts_to_sequences(corpus)

print('tokenised document data:')
for ii,i in enumerate(encoded_docs):
    print(ii,f'document length: {len(i)}')


from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 4
# Pad documents to a max length of 4 words

padded_docs = pad_sequences(encoded_docs, 
                            maxlen=68, 
                            padding='post')
print(f'\npadded documents:\n {padded_docs}\n')
print(padded_docs.shape)

{'': 1, 'the': 2, 'with': 3, 'he': 4, 'his': 5, 'so': 6, 'sheldon': 7, 'is': 8, 'to': 9, 'a': 10, 'howard': 11, 'raj': 12, 'and': 13, 'amy': 14, 'but': 15, 'from': 16, 'she': 17, 'him': 18, 'in': 19, 'leonard': 20, 'on': 21, 'which': 22, 'makes': 23, 'up': 24, 'as': 25, 'when': 26, 'penny': 27, 'of': 28, 'string': 29, 'theory': 30, 'band': 31, 'accomplishing': 32, 'much': 33, 'their': 34, 'respective': 35, 'projects': 36, 'forced': 37, 'admit': 38, 'has': 39, 'nothing': 40, 'important': 41, 'upon': 42, 'work': 43, 'leave': 44, 'apartment': 45, 'for': 46, 'few': 47, 'days': 48, 'can': 49, 'focus': 50, 'cannot': 51, 'come': 52, 'any': 53, 'ideas': 54, 'calls': 55, 'mother': 56, 'distraction': 57, 'have': 58, 'fun': 59, 'recreating': 60, 'experiments': 61, 'they': 62, 'were': 63, 'growing': 64, 'boring': 65, 'eats': 66, 'mulls': 67, 'over': 68, 'scientific': 69, 'studies': 70, 'helps': 71, 'realize': 72, 'that': 73, 'study': 74, 'dark': 75, 'matter': 76, 'rebound': 77, 'science': 78, 'adm

In [163]:
'''

Classification Model w/ Embedding Layer

'''

vocab_size = 127
max_length = 68

# define the model
print('\nBinary Classification Model\n')
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])
print(model.summary())

# # fit the model
print('\nTraining:\n')
model.fit(padded_docs, 
          labels, 
          epochs=5, 
          verbose=1)

# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels,verbose=1)

print('\nEvaluation\n')
print('Accuracy: %f' % (accuracy*100))



Binary Classification Model

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_37 (Embedding)     (None, 68, 8)             1016      
_________________________________________________________________
flatten_35 (Flatten)         (None, 544)               0         
_________________________________________________________________
dense_44 (Dense)             (None, 1)                 545       
Total params: 1,561
Trainable params: 1,561
Non-trainable params: 0
_________________________________________________________________
None

Training:

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Evaluation

Accuracy: 85.714287


### Generating Dataset

If we need to utilise tensorflow `dataset`

In [164]:
import tensorflow as tf
import pandas as pd

ldf = pd.DataFrame(encoded_docs)
ldf.fillna(0,inplace=True)
ldf = ldf.astype('int')

text_encoded = ldf.values
labels = np.array([0,1,1,0,0,0,1])
print(text_encoded.shape)
print(labels.shape)

# tf.convert_to_tensor(encoded_docs)
dataset = tf.data.Dataset.from_tensor_slices((text_encoded,labels))
dataset

(7, 41)
(7,)


<TensorSliceDataset shapes: ((41,), ()), types: (tf.int64, tf.int64)>