In [None]:
import pandas as pd

# Load the Amazon reviews CSV; the path is relative to the notebook's working directory.
REVIEWS_CSV = "datasets/amazon_reviews.csv"
data = pd.read_csv(REVIEWS_CSV)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Count how many rows carry each class_index value (class balance check).
data['class_index'].value_counts()

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Replace every missing value (in all columns) with an empty string, then
# show the per-column missing counts again to confirm none remain.
data = data.fillna('')
data.isna().sum()

In [None]:
# Shift the labels down by one so they become 0-based, matching the single
# sigmoid output / binary_crossentropy loss used by the model further down.
# The shift is guarded on the current minimum so that re-running this cell
# does not subtract a second time and produce -1/0 labels.
# NOTE(review): assumes the labels in the CSV start at 1 — confirm against the
# value_counts() output above.
if data['class_index'].min() == 1:
    data['class_index'] = data['class_index'] - 1
data.head()


In [None]:
# Build one text field per row: the title, a single space, then the body.
title_col = data['review_title']
body_col = data['review_text']
data['review'] = title_col + ' ' + body_col
data.head()

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fetch the NLTK corpora used below: the stopword list and WordNet
# (the latter backs the lemmatizer).
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared helpers read by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set for fast membership tests

def preprocess_text(text):
    """Normalize one review string for tokenization.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter, digit or whitespace, drop tokens found in the
    module-level ``stop_words`` set, and lemmatize the remaining tokens with
    the module-level ``lemmatizer``.

    Args:
        text: the raw review string.

    Returns:
        The surviving lemmatized tokens joined by single spaces (an empty
        string when nothing survives).
    """
    lowered = text.lower()
    # Punctuation is deleted, not replaced by a space, so "well-made"
    # becomes "wellmade" and "don't" becomes "dont".
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)

    kept = []
    for token in stripped.split():
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Run preprocess_text over every combined review and overwrite the column
# with the normalized text.
cleaned_reviews = data['review'].apply(preprocess_text)
data['review'] = cleaned_reviews


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# Labels, plus a train/test partition of the row positions. Splitting the
# positions first (same test_size and random_state as before, so the same
# rows land in each side) lets the tokenizer learn its vocabulary from the
# training rows only instead of from the whole dataset.
y = np.array(data['class_index'])
train_idx, test_idx = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42
)

# Tokenization: fit on training reviews only so that no test-set vocabulary
# or word-frequency information leaks into the features. Words outside the
# 5000 most frequent training words map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(data['review'].iloc[train_idx])
sequences = tokenizer.texts_to_sequences(data['review'])

# Padding sequences: every review becomes exactly max_length token ids,
# zero-padded at the end.
max_length = 100  # Define a max sequence length
X = pad_sequences(sequences, maxlen=max_length, padding='post')

# Split into training and testing sets using the positions chosen above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]


In [None]:
import keras_tuner as kt

def build_model(hp):
    """Build and compile a stacked-LSTM binary classifier for KerasTuner.

    Args:
        hp: hyperparameter object supplied by the tuner. It is queried for
            'embedding_dim', 'lstm_units', 'dropout', 'lstm_units_2' and
            'learning_rate'.

    Returns:
        A compiled Sequential model: Embedding -> LSTM (sequence output) ->
        Dropout -> LSTM -> Dense(1, sigmoid), optimized with Adam on binary
        cross-entropy and reporting accuracy.
    """
    # Sample the search space up front so each tunable value has a name.
    embedding_dim = hp.Choice('embedding_dim', [64, 128, 256])
    first_lstm_units = hp.Int('lstm_units', min_value=32, max_value=256, step=32)
    dropout_rate = hp.Float('dropout', 0.2, 0.5, step=0.1)
    second_lstm_units = hp.Int('lstm_units_2', min_value=32, max_value=128, step=32)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])

    layers = [
        # input_dim is the vocabulary size; input_length is the padded
        # sequence length defined at module level.
        Embedding(input_dim=5000, output_dim=embedding_dim, input_length=max_length),
        # return_sequences=True hands the full sequence to the second LSTM.
        LSTM(first_lstm_units, return_sequences=True),
        Dropout(dropout_rate),
        LSTM(second_lstm_units),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)

    optimizer = tf.keras.optimizers.Adam(learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Bayesian search over the space defined in build_model, maximizing
# validation accuracy across at most 10 trials. Trial results are written
# under bayesian_search/lstm_tuning.
tuner = kt.BayesianOptimization(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    directory='bayesian_search',
    project_name='lstm_tuning'
)
# Validate on a slice held out from the training data rather than on
# (X_test, y_test): picking hyperparameters on the test set would make any
# later test-set score optimistically biased. Extra keyword arguments to
# search() are forwarded to model.fit, which carves the validation slice
# from the end of X_train / y_train.
tuner.search(X_train, y_train, epochs=5, validation_split=0.2)

In [None]:
# Report the best trial's value for every hyperparameter that build_model
# tunes (previously the embedding size and second LSTM width were omitted).
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best LSTM Units: {best_hps.get('lstm_units')}")
print(f"Best Dropout Rate: {best_hps.get('dropout')}")
print(f"Best Learning Rate: {best_hps.get('learning_rate')}")
print(f"Best Embedding Dim: {best_hps.get('embedding_dim')}")
print(f"Best LSTM Units (layer 2): {best_hps.get('lstm_units_2')}")
