In [1]:
import numpy as np
from gensim.models import Word2Vec

# Load your trained Word2Vec model
w2v_model = Word2Vec.load('word2vec.model')

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
token=Tokenizer()

In [4]:
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
df=pd.read_csv("all_kindle_review .csv")
df=df[['rating','reviewText']]
document=df.reviewText.astype(str).apply(simple_preprocess)

In [7]:
token.fit_on_texts(df['reviewText'].astype(str))
seq=token.texts_to_sequences(df['reviewText'].astype(str))

In [8]:
df['rating']=df.rating.apply(lambda x: 0 if x<3 else 1 )

In [9]:
df['rating'].value_counts()

rating
1    8000
0    4000
Name: count, dtype: int64

In [11]:
# Pad sequences
MAX_LENGTH = 100
X = pad_sequences(seq, maxlen=MAX_LENGTH)
# Labels
y = df['rating'].values

In [13]:
# Embedding matrix
vocab_size = len(token.word_index) + 1
embedding_dim = w2v_model.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in token.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
    else:
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
              input_length=MAX_LENGTH, trainable=False),
    LSTM(128, return_sequences=False),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # For binary classification
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [15]:
model.fit(X, y, epochs=5, batch_size=32, validation_split=0.2)


Epoch 1/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 95ms/step - accuracy: 0.7020 - loss: 0.5637 - val_accuracy: 0.7621 - val_loss: 0.4727
Epoch 2/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 92ms/step - accuracy: 0.7757 - loss: 0.4528 - val_accuracy: 0.8029 - val_loss: 0.4233
Epoch 3/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 100ms/step - accuracy: 0.7910 - loss: 0.4312 - val_accuracy: 0.7825 - val_loss: 0.4511
Epoch 4/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 100ms/step - accuracy: 0.8109 - loss: 0.4100 - val_accuracy: 0.8163 - val_loss: 0.4042
Epoch 5/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 95ms/step - accuracy: 0.8152 - loss: 0.3990 - val_accuracy: 0.8304 - val_loss: 0.3875


<keras.src.callbacks.history.History at 0x2259daec410>

In [16]:
from sklearn.metrics import classification_report

# Predict on training data (or use a test set)
y_pred = (model.predict(X) > 0.5).astype("int32")
print(classification_report(y, y_pred))


[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 41ms/step
              precision    recall  f1-score   support

           0       0.76      0.75      0.76      4000
           1       0.88      0.88      0.88      8000

    accuracy                           0.84     12000
   macro avg       0.82      0.82      0.82     12000
weighted avg       0.84      0.84      0.84     12000



In [17]:
def predict_sentiment(review):
    seq1 = token.texts_to_sequences([review])
    padded = pad_sequences(seq1, maxlen=MAX_LENGTH)
    pred = model.predict(padded)
    return 'Positive' if pred > 0.5 else 'Negative'

# Example
print(predict_sentiment("The Kindle is amazing and works great!"))
print(predict_sentiment("Battery life is terrible and I hate it."))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 639ms/step
Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
Negative
