# Notes

- two fundamental DL algos for sequence processing are RNNs and 1D Convnets
- vectorising is the process of transforming text into numeric tensors
- the units used to break down text are called tokens
- there are multiple ways to associate a vector with a token, here we use:
    - one-hot encoding
    - token embedding


# One-hot encoding - word level

In [3]:
import numpy as np

# Input corpus: one entry per sample (each sample here is a sentence).
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Vocabulary: every distinct whitespace-separated token gets the next free
# integer, in order of first appearance. Numbering starts at 1, so index 0
# is never assigned to a token.
token_index = {}
for sentence in samples:
    for token in sentence.split():
        token_index.setdefault(token, len(token_index) + 1)

# Only the first max_length tokens of each sample are encoded.
max_length = 10

# Output tensor of shape (samples, max_length, highest token index + 1).
vocab_size = max(token_index.values()) + 1
results = np.zeros(shape=(len(samples), max_length, vocab_size))

# Set a single 1 per (sample, position) at the token's vocabulary index.
for row, sentence in enumerate(samples):
    tokens = sentence.split()[:max_length]
    for col, token in enumerate(tokens):
        results[row, col, token_index[token]] = 1.

# One-hot encoding - character level

In [5]:
import string

# initial data = one entry per sample (here the sample is a sentence)
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable # all printable ascii characters

# Map character -> index (starting at 1, so index 0 is never assigned).
# The mapping must go in this direction: the encoding loop below looks up
# characters, and a failed lookup would return None, which NumPy treats as
# np.newaxis and would set the whole feature row to 1.
token_index = dict(zip(characters, range(1, len(characters) + 1)))

max_length = 50 # only the first max_length characters of each sample are encoded

# Output tensor of shape (samples, max_length, highest character index + 1).
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1)) # store results
for i, sample in enumerate(samples):
    # Truncate so samples longer than max_length do not index out of bounds.
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        if index is None:
            # Character outside the vocabulary (e.g. non-ASCII): leave the row all zeros.
            continue
        results[i, j, index] = 1.


# Using Keras for word-level One-hot encoding

In [6]:
from keras.preprocessing.text import Tokenizer

ImportError: No module named keras.preprocessing.text