## One-hot encoding of characters
----

One-hot encoding is the most common and most basic way to turn a token into a vector. It consists of associating a unique integer index with every character, then
turning this integer index i into a binary vector of size N (the size of the vocabulary) that is all zeros except for the i-th
entry, which is 1.

In [1]:
# Load packages
import string
import numpy as np

In [2]:
# Example sentences to encode, one string per sample
samples = [
    "The cat sat on the mat.",
    "The dog ate my homework.",
    "I am writing code in Python.",
]

In [3]:
# Vocabulary: the printable ASCII characters (digits, letters, punctuation, whitespace)
characters = string.printable

# Report the vocabulary size
print("Number of characters = ", len(characters))

Number of characters =  100


In [4]:
# Peek at the first, the eleventh and the last entry of the vocabulary
(characters[0], characters[10], characters[-1])

('0', 'a', '\x0c')

In [5]:
# Map every character to its position in the vocabulary string
token_index = {char: position for position, char in enumerate(characters)}

print(token_index)

{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'q': 26, 'r': 27, 's': 28, 't': 29, 'u': 30, 'v': 31, 'w': 32, 'x': 33, 'y': 34, 'z': 35, 'A': 36, 'B': 37, 'C': 38, 'D': 39, 'E': 40, 'F': 41, 'G': 42, 'H': 43, 'I': 44, 'J': 45, 'K': 46, 'L': 47, 'M': 48, 'N': 49, 'O': 50, 'P': 51, 'Q': 52, 'R': 53, 'S': 54, 'T': 55, 'U': 56, 'V': 57, 'W': 58, 'X': 59, 'Y': 60, 'Z': 61, '!': 62, '"': 63, '#': 64, '$': 65, '%': 66, '&': 67, "'": 68, '(': 69, ')': 70, '*': 71, '+': 72, ',': 73, '-': 74, '.': 75, '/': 76, ':': 77, ';': 78, '<': 79, '=': 80, '>': 81, '?': 82, '@': 83, '[': 84, '\\': 85, ']': 86, '^': 87, '_': 88, '`': 89, '{': 90, '|': 91, '}': 92, '~': 93, ' ': 94, '\t': 95, '\n': 96, '\r': 97, '\x0b': 98, '\x0c': 99}


In [6]:
# Create the array that holds the one-hot vectors, with shape
# (number of samples, character positions per sample, vocabulary size)
max_num_chars = 100  # maximum number of characters encoded per sample
one_hot_encoded = np.zeros(shape=(len(samples), max_num_chars, len(token_index)))

print("Shape of character embedding matrix {}".format(one_hot_encoded.shape))

# Iterate over each sample
for i, sample in enumerate(samples):
    # Only the first max_num_chars characters fit in the array; truncating
    # here avoids an IndexError on longer samples. Positions past the end of
    # a shorter sample stay all-zero.
    for j, char in enumerate(sample[:max_num_chars]):
        # Get the index of the character (raises KeyError if the character
        # is not in token_index)
        idx = token_index[char]
        # Set the entry for this character at this position to 1
        one_hot_encoded[i, j, idx] = 1

Shape of character embedding matrix (3, 100, 100)


In [7]:
# Show the one-hot vector of the second character of the first sample
# ("h", vocabulary index 17); the same vector is printed twice
print(one_hot_encoded[0, 1])
print(one_hot_encoded[0, 1])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


### Thus, you have your one-hot character encoding!