# **Tokenizer**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Sample corpus. The inconsistent casing ("i" / "I") and the stray punctuation
# are deliberate: they show the normalisation the tokenizer applies by default
# (lower-casing and punctuation filtering).
# Every entry needs its own trailing comma; without it Python silently joins
# adjacent string literals, leaving three sentences in the list instead of four.
sentences = [
    'i love my dog',
    'I, love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# Learn the vocabulary from the corpus, then show the word -> integer id map.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

# **Padding**

In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Corpus for the padding demo: four sentences of different lengths, so the
# resulting sequences need padding/truncation to share a common length.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

In [None]:
# Fit a tokenizer that reserves the "<OOV>" token for words it has not seen.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

# Encode every sentence as a list of word ids, then bring all rows to
# length 5 (shorter rows are padded, longer rows are cut down).
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=5)
word_index = tokenizer.word_index

# Report the vocabulary, the raw sequences and the padded matrix.
print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:", padded, sep="\n")

In [None]:
# Encode sentences containing words that are absent from the fitting corpus
# ("really", "loves", "house", "much") to see how the already-fitted
# tokenizer handles them.
test_data = [
    "i really love my dog",
    "my dog loves my house much",
]

test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)

# Pad at the front of each row up to a common length of 10.
padded = pad_sequences(test_seq, padding="pre", maxlen=10)
print("\nPadded Test Sequence: ", padded, sep="\n")

# **Full code with Embeddings**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
import numpy as np

# End-to-end demo: raw sentences -> integer sequences -> fixed-length padded
# matrix -> dense vectors looked up from an (untrained) Embedding layer.

# Step 1: Prepare sentences
sentences = [
    'i love my dog',
    'I, love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# Step 2: Tokenization
# Single source of truth for the sequence length, so the padding step and the
# shapes reported below cannot drift apart.
max_len = 5

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=max_len)

print("\nWord Index = " , word_index)
print("\nSequences = " , sequences)
print("\nPadded Sequences:")
print(padded)

# Step 3: Embedding Layer
# word_index ids start at 1; id 0 is what pad_sequences fills with, hence +1.
vocab_size = len(word_index) + 1
embedding_dim = 8  # Size of each word vector; free to change

model = Sequential()
# `input_length` is intentionally not passed: it is deprecated in recent Keras
# releases and the layer takes the sequence length from the data it receives.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.compile('adam', 'mse')

# Step 4: Get embeddings
# No training happens in this cell, so the vectors printed below are the
# layer's initial weights, not learned word representations.
embeddings = model.predict(padded)

print("\nWord Embeddings:")
print(embeddings.shape)  # (num_sentences, max_len, embedding_dim)
print("\nEmbedding for first sentence (shape):", embeddings[0].shape)
print("\nEmbedding vector for first word of first sentence:")
print(embeddings[0][0])
