In [13]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Masking, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


In [14]:
df = pd.read_csv('Downloads/shuffled_reviews.csv')

In [15]:
def simple_tokenize(text):
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)          # remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)           # remove non-letter characters
    return text.split()                           # basic whitespace tokenization

df['tokens'] = df['review'].apply(simple_tokenize)

In [16]:
print("Training Word2Vec on entire dataset...")
w2v_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=2, workers=4, sg=1)
embedding_dim = w2v_model.vector_size
print("Word2Vec training complete. Vocabulary size:", len(w2v_model.wv))


Training Word2Vec on entire dataset...
Word2Vec training complete. Vocabulary size: 56173


In [17]:
def tokens_to_vectors(tokens):
    return [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]

vector_sequences = [tokens_to_vectors(tokens) for tokens in df['tokens']]

In [18]:
max_len = 200
padded_sequences = pad_sequences(vector_sequences, maxlen=max_len, dtype='float32', 
                                 padding='post', truncating='post', value=0.0)

In [19]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['sentiment'])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [21]:
model = Sequential([
    Input(shape=(max_len, embedding_dim)),
    Masking(mask_value=0.0),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [22]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)
]

In [23]:
history = model.fit(X_train, y_train, 
                    epochs=10, 
                    batch_size=64, 
                    validation_split=0.1, 
                    callbacks=callbacks)

Epoch 1/10
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 559ms/step - accuracy: 0.6161 - loss: 0.6278



[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 585ms/step - accuracy: 0.6164 - loss: 0.6276 - val_accuracy: 0.7270 - val_loss: 0.5171
Epoch 2/10
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 599ms/step - accuracy: 0.8086 - loss: 0.4376



[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 618ms/step - accuracy: 0.8086 - loss: 0.4375 - val_accuracy: 0.8195 - val_loss: 0.4681
Epoch 3/10
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 597ms/step - accuracy: 0.8143 - loss: 0.4255



[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 616ms/step - accuracy: 0.8143 - loss: 0.4254 - val_accuracy: 0.8330 - val_loss: 0.3959
Epoch 4/10
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 583ms/step - accuracy: 0.8472 - loss: 0.3647



[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 608ms/step - accuracy: 0.8472 - loss: 0.3647 - val_accuracy: 0.8435 - val_loss: 0.3755
Epoch 5/10
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 597ms/step - accuracy: 0.8554 - loss: 0.3487



[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 618ms/step - accuracy: 0.8554 - loss: 0.3487 - val_accuracy: 0.8485 - val_loss: 0.3452
Epoch 6/10
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 591ms/step - accuracy: 0.8656 - loss: 0.3258 - val_accuracy: 0.8330 - val_loss: 0.3675
Epoch 7/10
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 570ms/step - accuracy: 0.8641 - loss: 0.3274



[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 591ms/step - accuracy: 0.8641 - loss: 0.3274 - val_accuracy: 0.8660 - val_loss: 0.3283
Epoch 8/10
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 591ms/step - accuracy: 0.8747 - loss: 0.3031 - val_accuracy: 0.8590 - val_loss: 0.3284
Epoch 9/10
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 586ms/step - accuracy: 0.8737 - loss: 0.3052 - val_accuracy: 0.8520 - val_loss: 0.3460
Epoch 10/10
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 554ms/step - accuracy: 0.8821 - loss: 0.2917



[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 577ms/step - accuracy: 0.8821 - loss: 0.2916 - val_accuracy: 0.8740 - val_loss: 0.3272


In [24]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:}%')

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 125ms/step - accuracy: 0.8855 - loss: 0.2960
Test Accuracy: 87.73999810218811%


In [26]:
model.save("best_model.h5")



In [27]:
w2v_model.save('word2vec_model.model')
