In [None]:
# Tokenization: learn the word -> integer index mapping from the training texts only.
# NOTE(review): no out-of-vocabulary token is configured here; if this is the Keras
# Tokenizer, words unseen during fitting are presumably dropped when converting the
# test texts -- confirm that this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)

# One extra slot on top of the learned words: index 0 is treated as padding
# further down in this notebook, so it never stands for a real word.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Map every text to its list of word indices using the fitted tokenizer
# (training texts first, then test texts).
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_data, test_data)
)

In [None]:
# Pad every sequence to a common length so the data forms a rectangular array.
# The target length is the longest *training* sequence.
max_length = max(map(len, train_sequences))

# padding='post' appends the fill value after the real tokens.
# NOTE(review): a test sequence longer than max_length cannot fit and will be cut
# by pad_sequences; if this is the Keras implementation its default is
# truncating='pre' (leading tokens are dropped) -- confirm that this is intended.
train_padded, test_padded = (
    pad_sequences(seqs, maxlen=max_length, padding='post')
    for seqs in (train_sequences, test_sequences)
)

In [None]:
# Sanity check: show the first padded training row.
# Real word indices come first; the remaining positions hold the padding value.
print("Sample padded train sequence:\n", train_padded[0], sep=" ")

Sample padded train sequence:
 [   1  168 3003 3986 4515 9420 3276 1671  422  555  278  113    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [None]:
# Same sanity check for the first padded test row.
print("Sample padded test sequence:\n", test_padded[0], sep=" ")

Sample padded test sequence:
 [1724  932 8336   15  282 2252   34  216 4849    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [None]:
# Report the shape of the padded training array:
# (number of sequences, padded sequence length).
print(
    f'shape of padded sequences: {train_padded.shape}'
)

shape of padded sequences: (22802, 106)


In [None]:
# One-hot encoding
def _one_hot_encode(padded, num_classes):
    """Expand padded index sequences into a dense one-hot tensor.

    Parameters
    ----------
    padded : array-like of int, shape (n_sequences, sequence_length)
        Word indices per position; the value 0 marks padding.
    num_classes : int
        Length of the one-hot axis. Every index in ``padded`` must be
        smaller than this value.

    Returns
    -------
    numpy.ndarray, dtype float32, shape (n_sequences, sequence_length, num_classes)
        ``result[i, j, padded[i, j]]`` is 1.0 for every non-padding position;
        padding positions are left as all-zero vectors.

    Raises
    ------
    IndexError
        If any index in ``padded`` is out of bounds for ``num_classes``.
    """
    padded = np.asarray(padded)
    encoded = np.zeros(padded.shape + (num_classes,), dtype=np.float32)

    # Coordinates of every real (non-padding) token. One vectorised assignment
    # replaces the nested Python loop over sequences and positions.
    rows, cols = np.nonzero(padded)
    encoded[rows, cols, padded[rows, cols]] = 1.0
    return encoded


# NOTE(review): the result is a dense tensor. For the shapes printed in this
# notebook (22802 x 106 x 25734, float32) that is roughly 249 GB for the training
# set alone. Consider keeping the integer sequences and using an embedding layer
# or a sparse representation instead.
train_one_hot = _one_hot_encode(train_padded, vocab_size)
test_one_hot = _one_hot_encode(test_padded, vocab_size)

In [None]:
# Inspect the one-hot encoded data.
#
# Axis layout of both tensors:
#   axis 0 -- one entry per sequence
#   axis 1 -- position inside the sequence (up to max_length)
#   axis 2 -- vocabulary slot; the one-hot vector for the word at that position

# Shapes first, to verify the dimensions of the training and test sets.
print("Shape of train_one_hot:", train_one_hot.shape)
print("Shape of test_one_hot:", test_one_hot.shape)

# Then the first encoded sequence of each set.
print("Sample one-hot encoded train sequence:\n", train_one_hot[0])
print("Sample one-hot encoded test sequence:\n", test_one_hot[0])


Shape of train_one_hot: (22802, 106, 25734)
Shape of test_one_hot: (5701, 106, 25734)
Sample one-hot encoded train sequence:
 [[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Sample one-hot encoded test sequence:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
