In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, GRU, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Bidirectional
import seaborn as sns
import matplotlib.pyplot as plt
import gensim.downloader as api
import gensim
import numpy as np
from tensorflow import keras
import tensorflow as tf
from textblob import TextBlob
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU

In [2]:
# Load the Apparel configuration of the Amazon US reviews corpus.
dataset = load_dataset(path="amazon_us_reviews", name="Apparel_v1_00")

Found cached dataset amazon_us_reviews (C:/Users/Arshad's pc/.cache/huggingface/datasets/amazon_us_reviews/Apparel_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Convert the train split to a pandas DataFrame with the dataset's own
# Arrow-backed exporter, rather than letting pandas iterate the split
# row by row through DataFrame.from_dict.
# NOTE(review): later cells index columns named 'overall', 'reviewText',
# 'reviewerName', 'reviewTime' and 'score_pos_neg_diff' -- confirm the loaded
# split really exposes these names (print df.columns) before running them.
df = dataset["train"].to_pandas()

In [None]:
# Persist the frame as CSV in the working directory, without the index column.
df.to_csv(path_or_buf="amazon_us_reviews_apparel.csv", index=False)

In [None]:
# Keep a handle on the raw train split.
# NOTE(review): `reviews` is not referenced by any later cell visible here.
reviews = dataset["train"]

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Bar chart of how many reviews carry each 'overall' rating.
# The plot, its title/labels and plt.show() live in ONE cell and target one
# explicit Axes: with the inline backend a figure is rendered and closed at the
# end of the cell that created it, so decorating it from a later cell would
# only produce a second, empty figure.
fig, ax = plt.subplots()
sns.countplot(x='overall', data=df, ax=ax)
ax.set_title('Distribution of Overall Ratings')
ax.set_xlabel('Overall Rating')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pull out the review text and the 'score_pos_neg_diff' column as X / y.
# NOTE(review): neither X nor y is used by the later cells visible here; the
# train/test split reads df['reviewText'] and df['sentiment'] directly.
X, y = df["reviewText"], df["score_pos_neg_diff"]

In [None]:
# Pie chart of the share of each 'overall' rating value.
tags = df['overall'].value_counts()

# Per-wedge styling; colors and explode each supply five entries, one per wedge.
colors = ("red", "gold", "yellowgreen", "cyan", "orange")
explode = (0.1, 0.1, 0.2, 0.3, 0.2)
wp = dict(linewidth=2, edgecolor='black')

fig = plt.figure(figsize=(7, 7))
tags.plot.pie(
    autopct='%1.1f',
    colors=colors,
    shadow=True,
    startangle=0,
    wedgeprops=wp,
    explode=explode,
    label='',  # suppress the default y-axis label
)
plt.title('Distribution of the different ratings')
plt.show()

In [None]:
# Remove the reviewer-name and review-time columns, then discard every row that
# still contains a missing value.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
df = df.drop(columns=['reviewerName', 'reviewTime'], errors='ignore').dropna()

In [None]:
# Binary label: ratings of 3 and above become 1, everything below becomes 0.
df['sentiment'] = df['overall'].ge(3).astype('int64')
# The raw rating is no longer needed once the label exists.
df.drop(columns=['overall'], inplace=True)

In [None]:
# Hold out 20% of the reviews for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['reviewText'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# NOTE(review): this cell repeats the split from the preceding cell with the
# same inputs, test_size and random_state, so it reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'], df['sentiment'], test_size=0.2, random_state=42)


In [None]:
# Build the vocabulary from the training texts only (num_words=10000), then
# encode both splits as lists of integer word ids with that same tokenizer.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)


In [None]:
# Encode every review in the frame with the fitted tokenizer.
# texts_to_sequences expects an iterable of documents; the bare string
# "reviewText" would be iterated character by character, producing ten
# near-empty sequences instead of the encoded reviews.
text_seq = tokenizer.texts_to_sequences(df["reviewText"])
# Pad/truncate to 100 tokens at the end of each sequence, the same settings
# used for the train/test matrices.
text_seq = pad_sequences(text_seq, maxlen=100, padding='post', truncating='post')

In [None]:
# Bring every encoded review to exactly 100 ids: shorter ones are padded at
# the end, longer ones are cut at the end.
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=100, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)

In [None]:
# Load the 'word2vec-google-news-300' vectors through gensim's downloader API.
# The embedding-matrix cell below looks words up in this object and stores
# each vector in a 300-wide row.
word2vec_model = api.load('word2vec-google-news-300')

In [None]:
# Build the (10000, 300) weight matrix for the frozen Embedding layers:
# row i holds the word2vec vector of the word with tokenizer index i.
# Rows for words missing from word2vec (and row 0, which word_index never
# assigns) stay all-zero.
embedding_matrix = np.zeros((10000, 300))
for word, i in tokenizer.word_index.items():
    # word_index lists EVERY word seen during fitting; num_words only limits
    # which ids texts_to_sequences emits. Skip ids that fall outside the
    # matrix, otherwise the assignment below raises IndexError.
    if i >= embedding_matrix.shape[0]:
        continue
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

In [None]:
# Bidirectional-LSTM classifier on top of the frozen word2vec embeddings.
model = Sequential()
model.add(
    Embedding(
        input_dim=10000,
        output_dim=300,
        weights=[embedding_matrix],
        input_length=100,
        trainable=False,  # keep the pretrained vectors fixed
    )
)
model.add(Bidirectional(LSTM(64, dropout=0.2)))
model.add(Dense(1, activation='sigmoid'))  # single probability output

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs in batches of 64.
# NOTE(review): the test split doubles as validation data here.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test_padded, y_test),
)

In [None]:
# Score the bidirectional-LSTM model on the held-out split and report both metrics.
loss, accuracy = model.evaluate(x=X_test_padded, y=y_test)
for label, value in (('Test loss:', loss), ('Test accuracy:', accuracy)):
    print(label, value)

In [None]:
# Training vs. validation loss per epoch for the bidirectional-LSTM run.
for key, label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[key], label=label)
plt.title('Training and Validation Loss over Time')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch for the bidirectional-LSTM run.
for key, label in (('accuracy', 'Training Accuracy'), ('val_accuracy', 'Validation Accuracy')):
    plt.plot(history.history[key], label=label)
plt.title('Training and Validation Accuracy over Time')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Partial CNN stack: frozen embedding -> Conv1D -> MaxPooling1D, with no
# output layer.
# NOTE(review): `model_cnn` is reassigned by the next cell before it is
# compiled or trained, so this definition is never used.
model_cnn = Sequential([
    Embedding(input_dim=10000, output_dim=300, weights=[embedding_matrix], input_length=100, trainable=False),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D()
    ])

In [None]:
# 1-D convolutional classifier on top of the frozen word2vec embeddings.
model_cnn = Sequential()
model_cnn.add(
    Embedding(
        input_dim=10000,
        output_dim=300,
        weights=[embedding_matrix],
        input_length=100,
        trainable=False,  # keep the pretrained vectors fixed
    )
)
model_cnn.add(Conv1D(64, 3, activation='relu'))  # 64 filters, window of 3 tokens
model_cnn.add(MaxPooling1D(3))
model_cnn.add(Flatten())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(1, activation='sigmoid'))  # single probability output

In [None]:
# Binary cross-entropy objective, Adam optimiser, accuracy as the tracked metric.
model_cnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the CNN for 10 epochs in batches of 64.
# NOTE(review): the test split doubles as validation data here.
history_cnn = model_cnn.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test_padded, y_test),
)

In [None]:
# Score the CNN on the held-out split; accuracy is printed as a percentage.
loss, accuracy = model_cnn.evaluate(x=X_test_padded, y=y_test)
print("CNN Model - Loss: {:.2f}, Accuracy: {:.2f}%".format(loss, 100 * accuracy))

In [None]:
# Single-layer GRU classifier on top of the frozen word2vec embeddings.
model_gru = Sequential()
model_gru.add(
    Embedding(
        input_dim=10000,
        output_dim=300,
        weights=[embedding_matrix],
        input_length=100,
        trainable=False,  # keep the pretrained vectors fixed
    )
)
model_gru.add(GRU(64, dropout=0.2))
model_gru.add(Dense(1, activation='sigmoid'))  # single probability output

In [None]:
# Binary cross-entropy objective, Adam optimiser, accuracy as the tracked metric.
model_gru.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the GRU model for 10 epochs in batches of 64.
# NOTE(review): the test split doubles as validation data here.
history_gru = model_gru.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test_padded, y_test),
)

In [None]:
# Score the GRU model on the held-out split; accuracy is printed as a percentage.
loss, accuracy = model_gru.evaluate(x=X_test_padded, y=y_test)
print("GRU Model - Loss: {:.2f}, Accuracy: {:.2f}%".format(loss, 100 * accuracy))

In [None]:
# Single-layer (unidirectional) LSTM classifier on top of the frozen word2vec embeddings.
model_lstm = Sequential()
model_lstm.add(
    Embedding(
        input_dim=10000,
        output_dim=300,
        weights=[embedding_matrix],
        input_length=100,
        trainable=False,  # keep the pretrained vectors fixed
    )
)
model_lstm.add(LSTM(64, dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))  # single probability output

In [None]:
# Binary cross-entropy objective, Adam optimiser, accuracy as the tracked metric.
model_lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the LSTM model for 10 epochs in batches of 64.
# NOTE(review): the test split doubles as validation data here.
history_lstm = model_lstm.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test_padded, y_test),
)

In [None]:
# Score the LSTM model on the held-out split; accuracy is printed as a percentage.
loss, accuracy = model_lstm.evaluate(x=X_test_padded, y=y_test)
print("LSTM Model - Loss: {:.2f}, Accuracy: {:.2f}%".format(loss, 100 * accuracy))