# Amazon Reviews Sentiment Modeling 


<b>Author:</b> Przemyslaw Niedziela (przemyslaw.niedziela98@gmail.com) <br> 
<b>Date:</b> Nov 2024 <br>
<br> <br> 

TL;DR <br>
WIP

<br> <br> 
Table of contents: 

In [2]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Bidirectional, GlobalMaxPooling1D, Dropout
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [3]:
# Location of the Amazon reviews CSV inside the kagglehub cache directory.
# Built from the current user's home directory instead of a hard-coded
# absolute path so the notebook is not tied to one machine/account.
DATASET_PATH = os.path.join(
    os.path.expanduser("~"),
    ".cache", "kagglehub", "datasets", "arhamrumi",
    "amazon-product-reviews", "versions", "1", "Reviews.csv",
)

# Contraction suffix -> expanded replacement text.
# Order matters: "n't" is listed before "'t" so that the more specific suffix
# is rewritten first when the entries are applied in sequence.
# NOTE(review): "'s" is always expanded to " is", which also rewrites
# possessives (e.g. "amazon's") — confirm this is acceptable.
CONTRACTIONS = {
    "n't": " not", "'re": " are", "'s": " is", "'d": " would", "'ll": " will",
    "'t": " not", "'ve": " have", "'m": " am"
}

### Preprocessing 

Expanding contractions, removing stopwords, lemmatization and tokenization.

In [4]:
# Load the reviews CSV into a DataFrame, using the first CSV column as the index.
dataset = pd.read_csv(DATASET_PATH, index_col = [0])

In [None]:
# English stop-word set and WordNet lemmatizer used by preprocess_text.
# NOTE(review): presumably requires the NLTK 'stopwords' and 'wordnet' data to
# be downloaded beforehand (e.g. via nltk.download) — confirm for this environment.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
def expand_contractions(text):
    """Return *text* with every suffix listed in CONTRACTIONS expanded.

    The entries are applied one after another, in the dictionary's own
    order, each one rewriting all of its occurrences in the running result.
    """
    result = text
    for pattern in CONTRACTIONS:
        replacement = CONTRACTIONS[pattern]
        result = re.sub(pattern, replacement, result)
    return result

def preprocess_text(text):
    """
    Turn a raw review string into a cleaned, space-separated token string.

    Steps, in order:
    1. Lowercase the text.
    2. Expand contractions via expand_contractions.
    3. Drop every character that is not a lowercase letter or whitespace
       (this removes punctuation, digits and special characters).
    4. Tokenize with NLTK's word_tokenize.
    5. Discard tokens found in stop_words (checked before lemmatization).
    6. Lemmatize the remaining tokens and join them with single spaces.
    """
    lowered = text.lower()
    expanded = expand_contractions(lowered)
    letters_only = re.sub(r'[^a-z\s]', '', expanded)

    kept = []
    for word in word_tokenize(letters_only):
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))

    return ' '.join(kept)

# Clean every review and keep the result in a new 'clean_text' column.
dataset['clean_text'] = dataset['Text'].apply(preprocess_text)

# Fit a Keras tokenizer on the cleaned reviews (num_words=20000) and convert
# each review into a sequence of integer word ids.
text_data = dataset['clean_text']
tokenizer = Tokenizer(num_words=20000)  
tokenizer.fit_on_texts(text_data)
sequences = tokenizer.texts_to_sequences(text_data)
word_index = tokenizer.word_index

# Pad/truncate every sequence to a fixed length of 100 ids.
data = pad_sequences(sequences, maxlen=100)


In [9]:
# TF-IDF representation of the cleaned reviews, limited to 5000 features.
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(text_data)

# Unsupervised pseudo-labels: each review is assigned to one of two K-Means clusters.
# NOTE(review): the clusters are not tied to any sentiment annotation here —
# confirm they actually separate positive from negative reviews before
# treating them as sentiment targets.
n_clusters = 2  
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
pseudo_labels = kmeans.fit_predict(X_tfidf)

# Wrap each cluster id in a single-element list so MultiLabelBinarizer emits
# one indicator column per cluster id.
mlb = MultiLabelBinarizer()
cluster_labels = mlb.fit_transform([[label] for label in pseudo_labels])


In [10]:
def create_model(vocab_size: int, max_seq_len: int, num_labels: int) -> Model:
    """Build the (uncompiled) bidirectional-LSTM classifier.

    Architecture: Embedding(128) -> Bidirectional(LSTM(64)) over the full
    sequence -> global max pooling over time -> Dropout(0.5) -> Dense(64, relu)
    -> Dense(num_labels, sigmoid).

    Args:
        vocab_size: Size of the embedding vocabulary (``input_dim``).
        max_seq_len: Length of each input sequence of word ids.
        num_labels: Number of output units.

    Returns:
        A Keras ``Model`` mapping a ``(max_seq_len,)`` input to ``num_labels``
        sigmoid outputs. The caller is responsible for compiling it.
    """
    inputs = Input(shape=(max_seq_len,))
    # The sequence length is already fixed by the Input layer, so the
    # deprecated ``input_length`` argument of Embedding is not passed.
    x = Embedding(input_dim=vocab_size, output_dim=128)(inputs)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dropout(0.5)(x)
    x = Dense(64, activation='relu')(x)
    outputs = Dense(num_labels, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

# One output unit per indicator column of the binarized cluster labels.
num_labels = cluster_labels.shape[1]
# Cap the embedding vocabulary at 20000, the tokenizer's num_words limit;
# the +1 leaves room for index 0.
vocab_size = min(len(word_index) + 1, 20000) 
# Sequence length is taken from the padded input matrix; previously
# max_seq_len was referenced without ever being defined (NameError).
max_seq_len = data.shape[1]
model = create_model(vocab_size, max_seq_len, num_labels)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [None]:
# Hold out 20% of the padded sequences and cluster labels for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(data, cluster_labels, test_size=0.2, random_state=42)

# Train for 10 epochs with batches of 32, passing the held-out split as validation data.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/10
14212/14212 ━━━━━━━━━━━━━━━━━━━━ 11697s 823ms/step - accuracy: 0.8797 - loss: 0.1157 - val_accuracy: 0.9399 - val_loss: 0.0585
Epoch 2/10
 5076/14212 ━━━━━━━━━━━━━━━━━━━━ 8:45 58ms/step - accuracy: 0.9508 - loss: 0.0496