In [3]:
import pandas as pd
import numpy as np

In [4]:
df=pd.read_csv("flipkart_product_cleaned_final_sentiment.csv")

In [5]:
df.head()

Unnamed: 0,ProductName,Price,Rate,Review,Summary,sentiment
0,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,3999,1,super !,great cooler excel air flow price amaz unbelie...,0.65
1,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,3999,1,awesom,best budget fit cooler nice cool,0.5875
2,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,3999,1,fair,qualiti good power air decent,0.433333
3,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,3999,0,useless product,bad product fan,-0.7
4,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,3999,1,fair,ok ok product,0.5


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [7]:
df['Summary'] = df['Summary'].fillna('')

In [8]:
x_train, x_test, y_train, y_test = train_test_split(df['Summary'], df['Rate'], test_size=0.2, random_state=7)

In [9]:
# Tokenize and pad sequences for LSTM
max_words = 5000
max_len = 100

In [10]:
tokenizer = Tokenizer(num_words=max_words)  # Tokenizer(num_words=max_words): Creates a tokenizer object that keeps the top max_words (5000) most frequent words in the training data.
tokenizer.fit_on_texts(x_train)
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)
x_train_pad = pad_sequences(x_train_seq, maxlen=max_len)  # pad_sequences(x_train_seq, maxlen=max_len): Pads each sequence to ensure that all sequences in x_train_seq have the same length (max_len = 100). If a sequence is shorter than max_len, it is padded with zeros.
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len)

In [11]:
# Build the LSTM model
lstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=True),Dropout(0.5), # LSTM(64, return_sequences=True): First LSTM layer with 64 units. return_sequences=True means that it returns the full sequence of outputs for each input (required for stacking another LSTM layer on top).
                                                  # LSTM(64): Second LSTM layer with 64 units. This time return_sequences is not specified, so it returns only the last output in the output sequence.
    LSTM(64),Dropout(0.5), # Dropout(0.5): Applies dropout to the layer's output. Dropout rate is 0.5, meaning 50% of the neurons will be randomly set to zero during training to prevent overfitting.
    Dense(1, activation='sigmoid') # Dense(1, activation='sigmoid'): A dense (fully connected) layer with a single neuron. The sigmoid activation function outputs a value between 0 and 1, suitable for binary classification.
])

# Define the optimizer with a specified learning rate
optimizer = Adam(learning_rate=0.001)

lstm_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])



In [12]:
# Train the LSTM model
lstm_model.fit(x_train_pad, y_train, epochs=3, batch_size=32, validation_split=0.2)

Epoch 1/3
[1m3301/3301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 114ms/step - accuracy: 0.9124 - loss: 0.2473 - val_accuracy: 0.9348 - val_loss: 0.1771
Epoch 2/3
[1m3301/3301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m366s[0m 111ms/step - accuracy: 0.9412 - loss: 0.1682 - val_accuracy: 0.9361 - val_loss: 0.1781
Epoch 3/3
[1m3301/3301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m374s[0m 113ms/step - accuracy: 0.9447 - loss: 0.1566 - val_accuracy: 0.9344 - val_loss: 0.1798


<keras.src.callbacks.history.History at 0x18693c2be90>

In [11]:
# Evaluate the LSTM model
lstm_y_pred = (lstm_model.predict(x_test_pad) > 0.5).astype("int32")
print(f'LSTM Accuracy: {accuracy_score(y_test, lstm_y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, lstm_y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, lstm_y_pred)}')

[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 51ms/step
LSTM Accuracy: 0.9309153713298791
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.72      0.76      5134
           1       0.95      0.97      0.96     27869

    accuracy                           0.93     33003
   macro avg       0.88      0.85      0.86     33003
weighted avg       0.93      0.93      0.93     33003

Confusion Matrix:
[[ 3701  1433]
 [  847 27022]]


In [None]:
# import joblib
# import pickle
# Save the model
# joblib.dump(lstm_model, 'lstm_model.h5') 
# h5 file --> The full form of an H5 file is Hierarchical Data Format version 5, 
# commonly abbreviated as HDF5. This file format is designed to store and organize large amounts of data, 
# particularly in complex hierarchical structures. HDF5 is widely used in various fields such as scientific 
# computing, data analysis, and machine learning due to its ability to efficiently manage large datasets 
# and support complex data relationships.

# Save the tokenizer to a file
# with open('tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
import pickle
lstm_model.save('lstm_model.keras')
# Save the tokenizer to a file
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

now write streamlit_lstm.py file 
and on cmd run that --> streamlit run streamlit_lstm.py