In [66]:
# Text preprocessing and sequence padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [67]:
# Neural network layers (Embedding, Input, Dense, Flatten)
from tensorflow.keras.layers import Embedding, Input, Dense, Flatten

In [68]:
# Model construction (functional API, sequential model)
from tensorflow.keras.models import Model, Sequential

In [69]:
import numpy as np

In [70]:
# Training corpus: the single sentence the vocabulary will be learned from
train_text = "The earth is an awesome place live"
# Fresh Keras Tokenizer created with default settings; it has no vocabulary until it is fitted
tokenizer = Tokenizer()

In [71]:
# Build the vocabulary: scan the training text and give every distinct word its own integer index
tokenizer.fit_on_texts(texts=[train_text])

In [72]:
# New sentence to encode with the fitted tokenizer
sub_text = "The earth is an great place live"
# texts_to_sequences returns one integer list per input text; exactly one
# text is passed, so unpack that single list directly
[sequences] = tokenizer.texts_to_sequences([sub_text])

In [73]:
# The new sentence differs from the training text in one word ("awesome" is replaced by "great")
# to show how the tokenizer handles a word that is not in the learned vocabulary:
# no oov_token was configured, so "great" is simply dropped from the encoded sequence.

In [74]:
# Integer encoding -> show the new sentence as the sequence of word indices produced by the tokenizer
print("integer encoding : ",sequences)
# Show the word -> index mapping the tokenizer learned from the training text
print("vocab : ",tokenizer.word_index)

integer encoding :  [1, 2, 3, 4, 6, 7]
vocab :  {'the': 1, 'earth': 2, 'is': 3, 'an': 4, 'awesome': 5, 'place': 6, 'live': 7}


In [75]:
# pad_sequences: pads or truncates integer sequences so they all have the same length

In [76]:
# Force every sequence to length 3: 'pre' padding adds zeros on the left of
# short sequences, and 'pre' truncation drops leading items from long ones
pad_sequences(
    [[1, 2, 3], [3, 4, 5, 6], [7, 8]],
    maxlen=3,
    padding='pre',
    truncating='pre',  # the default, spelled out: [3, 4, 5, 6] -> [4, 5, 6]
)

array([[1, 2, 3],
       [4, 5, 6],
       [0, 7, 8]], dtype=int32)

In [77]:
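# Word embedding: every word index is mapped to a dense vector (its embedding vector)
# Embedding() is the Keras layer that stores these vectors and looks them up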

In [78]:
# Tokenization: each sentence is represented as the list of its words (tokens)
tokenized_text = [
    ['Hope', 'to', 'see', 'you', 'soon'],
    ['Nice', 'to', 'see', 'you', 'again'],
]


In [79]:
# Integer encoding: Each word is mapped to a unique integer
encoded_text = [[0, 1, 2, 3, 4],[5, 1, 2, 3, 6]]

In [80]:
# Embedding layer input parameters
vocab_size = 7        # Total number of unique words (indices 0 to 6)
embedding_dim = 2     # Each word will be represented by a 2-dimensional vector


# Define an input layer that expects a sequence of 5 word indices per sample
input_seq = Input(shape=(5,))

# Apply an Embedding layer that maps each integer (word index) to a
# 2-dimensional dense vector. Despite its name, `embedding_layer` holds the
# layer's OUTPUT tensor (the layer is called on `input_seq` right away).
# The sequence length is already fixed by `input_seq`, so the `input_length`
# argument (deprecated in Keras 3) is intentionally not passed.
embedding_layer = Embedding(vocab_size, embedding_dim)(input_seq)

# Model construction: takes the input sequences and outputs the corresponding
# sequence of embedding vectors
model = Model(inputs=input_seq, outputs=embedding_layer)

In [81]:
# A batch of size 1 holding a single sequence of 5 word indices -> shape (1, 5)
example = np.asarray([[0, 1, 2, 3, 4]])

# predict() replaces every integer of the input sequence by its 2-dimensional
# embedding vector, so the result is a 3D array of shape (1, 5, 2):
# one batch, five words, two features per word.
output = model.predict(example)
print(output)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[[[ 0.01385099  0.02003599]
  [-0.02630205  0.02985834]
  [-0.04212302  0.02316945]
  [-0.00268266 -0.04340204]
  [-0.00978275  0.02498985]]]


In [82]:
# Modeling: build a small binary classifier on top of an Embedding layer

In [99]:
# NOTE(review): these two imports repeat names already imported near the top
# of the notebook; they are kept so this cell brings the same names into scope.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


# Initialize an empty Sequential model
model = Sequential()

# Declare the input explicitly: each sample is a sequence of 10 word indices.
# Passing `input_shape=` to the first layer is deprecated in Keras 3; an
# Input object as the first entry of the model is the supported replacement.
model.add(Input(shape=(10,)))

# Add an embedding layer
# input_dim=100 -> the size of the vocabulary (valid word indices are 0..99)
# output_dim=8  -> each word index is mapped to an 8-dimensional dense vector
# The sequence length (10) comes from the Input above, so the layer's output
# for one sample has shape (10, 8).
model.add(Embedding(input_dim=100, output_dim=8))

In [100]:
#flatten each sample's (10, 8) embedding output into a single vector of 10 * 8 = 80 values
model.add(Flatten())

#adds a fully connected (Dense) layer with 32 neurons and ReLU activation.
model.add(Dense(32, activation='relu')) # fully connected layer

#the output layer with 1 neuron and sigmoid activation, which is commonly used for binary classification tasks
model.add(Dense(1, activation='sigmoid')) # output layer

In [101]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the Sequential model built above
model.summary()