In [1]:
# load packages
import os
import sys
import string
import numpy as np

# One-hot encoding of characters
----

One-hot encoding is the most common and most basic way to turn a token into a vector. It consists of associating a unique integer index with every character, then 
turning this integer index i into a binary vector of size N (the size of the vocabulary) that is all zeros except for the i-th 
entry, which is 1.

In [2]:
# small toy corpus of sentences used throughout the notebook
samples = [
    "The cat sat on the mat.",
    "The dog ate my homework.",
    "I am writing code in Python.",
]

In [3]:
characters = string.printable # printable ASCII characters only (digits, letters, punctuation, whitespace), not the full ASCII table
len(characters)

100

In [4]:
# spot-check the vocabulary: first entry, entry at index 10, and last entry
characters[0], characters[10], characters[-1]

('0', 'a', '\x0c')

In [5]:
# vocabulary lookup: each character maps to its position in `characters`
token_index = {char: position for position, char in enumerate(characters)}
len(token_index)

100

In [6]:
max_len = 100 # max number of characters allowed for each sample (a sample is a whole sentence, not a single word)

## Character embedding

In [7]:
# create the one-hot character encoding
# result axes: (sample, character position, vocabulary index); filled with zeros to start
one_hot_encoded = np.zeros(shape=(len(samples), max_len, len(token_index)))
for i, sample in enumerate(samples):
    # keep only the first max_len characters: the array has just max_len
    # position slots, so a longer sample would otherwise raise an IndexError
    for j, char in enumerate(sample[:max_len]):
        # look up the vocabulary index of the character
        # (a character missing from token_index raises KeyError)
        encoding_value = token_index[char]
        # switch on the single entry that identifies this character
        one_hot_encoded[i, j, encoding_value] = 1

In [8]:
# axes of the array, in order: (len(samples), max_len, len(token_index))
"number of samples, number total characters allowed per sample, char embedding dim = ", one_hot_encoded.shape

('number of samples, number total characters allowed per sample, char embedding dim = ',
 (3, 100, 100))

In [9]:
one_hot_encoded[0][1] # first sample, second char ('h'): positions are 0-based, so index 1 is the second character

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [10]:
one_hot_encoded[0][2] # first sample, third char ('e'): positions are 0-based, so index 2 is the third character

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

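Thus, you have your character embedding: each sample is represented by a matrix with one one-hot row per character position, and rows beyond the end of the sample stay all-zero.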