In [None]:
# Tokenization
# Build the word -> integer-id vocabulary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)

# One extra slot on top of the learned vocabulary: later cells treat
# index 0 as padding, so ids and matrix rows need room for it.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Convert text to sequences
# Encode both splits with the same fitted tokenizer (train first, then test).
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_data, test_data)
)

In [None]:
# Padding sequences
# The longest *training* sequence defines the common length for both splits.
max_length = max(map(len, train_sequences))

# Pad at the end ('post') so every row has exactly max_length entries.
train_padded, test_padded = (
    pad_sequences(seqs, maxlen=max_length, padding='post')
    for seqs in (train_sequences, test_sequences)
)

In [None]:
# Train Word2Vec model
from gensim.models import Word2Vec

# Build the training corpus from the tokenizer's own output instead of a raw
# `str.split()`. The embedding matrix is later filled by looking up the keys of
# `tokenizer.word_index` in the Word2Vec vocabulary, so both must use the same
# tokens. A whitespace split keeps case and punctuation ("Hello," vs "hello"),
# which would leave those words without a vector if the tokenizer normalises
# text (NOTE(review): assumes the Keras text Tokenizer defaults - confirm).
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
sentences = [
    [index_to_word[index] for index in sequence]
    for sequence in train_sequences
]

# min_count=1 keeps every token, so each word seen in training gets a vector.
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
# Prepare embedding matrix
# Read the dimensionality from the trained model rather than repeating the
# literal passed to Word2Vec, so the two can never drift apart.
embedding_dim = word2vec_model.wv.vector_size

# Row i holds the vector of the word whose tokenizer id is i. Rows that are
# never assigned (ids with no Word2Vec vector, and any id absent from
# word_index) remain all zeros.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

In [None]:
# Convert sequences to Word2Vec embeddings
def convert_to_word2vec(sequences, embedding_matrix, max_length):
    """Replace every token id in ``sequences`` with its embedding vector.

    Parameters
    ----------
    sequences : sequence of integer sequences
        Token-id sequences (a 2-D array or a list of lists). Each one may be
        shorter than ``max_length``; id ``0`` is treated as padding.
    embedding_matrix : array-like of shape (vocab_size, embedding_dim)
        Row ``k`` is the vector for token id ``k``.
    max_length : int
        Number of time steps in the output.

    Returns
    -------
    numpy.ndarray of shape (len(sequences), max_length, embedding_dim)
        Positions holding padding (id 0), and positions past the end of a
        short sequence, are all zeros.

    Raises
    ------
    IndexError
        If a sequence is longer than ``max_length`` or contains an id that is
        outside ``embedding_matrix``.
    """
    embedding_matrix = np.asarray(embedding_matrix)
    # Take the vector width from the matrix that was passed in, so the
    # function is self-contained and does not rely on a notebook-level global.
    vector_dim = embedding_matrix.shape[1]
    word2vec_sequences = np.zeros((len(sequences), max_length, vector_dim))

    for row, sequence in enumerate(sequences):
        indices = np.asarray(sequence, dtype=np.intp)
        if indices.size > max_length:
            raise IndexError(
                f"sequence {row} has length {indices.size}, "
                f"which exceeds max_length={max_length}"
            )
        # One gather per sequence instead of one Python-level lookup per token.
        # Integer-array indexing returns a copy, so zeroing below is safe.
        vectors = embedding_matrix[indices]
        vectors[indices == 0] = 0  # Skip padding
        word2vec_sequences[row, :indices.size] = vectors

    return word2vec_sequences

In [None]:
# Encode both padded splits with the shared embedding matrix (train, then test).
train_word2vec, test_word2vec = [
    convert_to_word2vec(padded, embedding_matrix, max_length)
    for padded in (train_padded, test_padded)
]

In [None]:
# Inspect Word2Vec encoded sequences
# Report the array shapes first, then the first encoded example of each split.
for label, value in (
    ("Shape of train_word2vec:", train_word2vec.shape),
    ("Shape of test_word2vec:", test_word2vec.shape),
    ("Sample Word2Vec encoded train sequence:\n", train_word2vec[0]),
    ("Sample Word2Vec encoded test sequence:\n", test_word2vec[0]),
):
    print(label, value)

Shape of train_word2vec: (22802, 106, 100)
Shape of test_word2vec: (5701, 106, 100)
Sample Word2Vec encoded train sequence:
 [[-0.72729731  1.41205347  0.6908955  ... -1.16739058  0.14696582
   0.66753608]
 [-0.19192421  0.36437336  0.18654716 ... -0.30756003  0.04675481
   0.16850248]
 [-0.02013207  0.05818575  0.03467382 ... -0.04615525 -0.00236784
   0.02701318]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]
Sample Word2Vec encoded test sequence:
 [[-0.04658628  0.0851171   0.04052103 ... -0.06102676  0.01058459
   0.02753298]
 [-0.06167693  0.11658745  0.04834437 ... -0.09984828  0.02152216
   0.04867287]
 [-0.00674185 -0.00117609  0.00085781 ... -0.00459011  0.00280366
   0.00420339]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0. 