# Homework 2

Name: Xiaohong Liu

ID: 002727174

In [None]:
'''
!pip install setuptools==65.5.0 "wheel<0.40.0"

In [None]:
'''
!pip install d2l==0.17.6
!pip install torch==1.12.0
!pip install torchvision==0.13.0

In [1]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

#### import data 

In [2]:
colnames = [
    "ProductID",
    "ReviewID",
    "ReviewTitle",
    "ReviewTime",
    "Verified",
    "ReviewContent",
    "ReviewRating",
]
amazon = pd.read_csv("Amazon_Comments.csv", sep="^", header=None, names=colnames)


In [4]:
amazon.head(5)

Unnamed: 0,ProductID,ReviewID,ReviewTitle,ReviewTime,Verified,ReviewContent,ReviewRating
0,1,1,These are hands down the best quality bands fo...,2016-01-16,False,These are hands down the best quality bands f...,5.0
1,1,2,High Quality Bands,2016-01-22,False,I just got this set yesterday as well as a se...,5.0
2,1,3,Five Stars,2015-12-27,False,My husband uses these and finds them to be go...,5.0
3,1,4,The resistance is great. I would agree that th...,2016-01-13,False,I got these for Christmas and have been using...,4.0
4,1,5,Good quality product,2016-01-20,False,Haven\t had it long enough to use all of the ...,5.0


#### preprocess data 

In [None]:
import re


def pre_process(x):
    return re.sub(
        "'ve",
        " have",
        re.sub(
            "'s",
            " is",
            re.sub(
                "'m",
                " am",
                re.sub(
                    "'re", " are", re.sub("n't", " not", re.sub(r"\\", "'", x.lower()))
                ),
            ),
        ),
    )


In [5]:
# preprocessing
amazon['ReviewContent']= amazon['ReviewContent'].apply(lambda x: pre_process()) 

In [6]:
# remove missing value
amazon = amazon[amazon['ReviewContent'] != ' ']

#### prepare data 

In [7]:
# separate positive and negative review
positive_reviews = amazon[amazon["ReviewRating"].isin([4, 5])]["ReviewContent"].tolist()
negative_reviews = amazon[amazon["ReviewRating"].isin([1, 2, 3])][
    "ReviewContent"
].tolist()

In [8]:
# Tokenize the reviews and build a vocabulary
tokens = [review.split() for review in negative_reviews + positive_reviews]
vocab = {
    word: idx
    for idx, word in enumerate(set(word for review in tokens for word in review))
}

# Convert tokens to indices and pad sequences
indexed_reviews = [
    [vocab[word] for word in review.split()]
    for review in negative_reviews + positive_reviews
]
# padded_reviews = [torch.tensor(review) for review in indexed_reviews]
padded_reviews = pad_sequence(
    [torch.tensor(review) for review in indexed_reviews], batch_first=True
)

# Create input and target sequences
input_sequences = padded_reviews[:, :-1]
target_sequences = padded_reviews[:, 1:]

# Create DataLoader for training data
train_data = TensorDataset(input_sequences, target_sequences)
batch_size = 2  # Adjust as needed
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)


#### define LSTM and train the model 

In [9]:
# Define a custom LSTM-based text generation model
# num_epoch is set at 100 and thus this might take a while to train
class LSTMTextGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(LSTMTextGenerator, self).__init__()
        # Define the layers of the model
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Word embedding layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)  # LSTM layer
        self.fc = nn.Linear(hidden_dim, vocab_size)  # Fully connected layer (output layer)

    def forward(self, x, hidden):
        # Define the forward pass of the model
        embedded = self.embedding(x)  # Embed the input sequence
        output, (hidden, cell) = self.lstm(embedded, hidden)  # Pass through the LSTM layer
        output = self.fc(output)  # Pass through the output layer
        return output, (hidden, cell)

# Set model hyperparameters
vocab_size = len(vocab)
embedding_dim = 256
hidden_dim = 128
num_layers = 2
num_epochs = 100
learning_rate = 0.01

# Initialize the LSTMTextGenerator model
model = LSTMTextGenerator(vocab_size, embedding_dim, hidden_dim, num_layers)

# Define the loss function (cross-entropy) and the optimizer (Adam)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    for input_seq, target_seq in train_loader:
        optimizer.zero_grad()
        hidden = None
        output, _ = model(input_seq, hidden)
        loss = criterion(output.view(-1, vocab_size), target_seq.view(-1))
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

print("Training completed.")

Epoch [1/100], Loss: 0.5786888599395752
Epoch [2/100], Loss: 0.7087135910987854
Epoch [3/100], Loss: 0.5375621914863586
Epoch [4/100], Loss: 0.2806489169597626
Epoch [5/100], Loss: 0.3185948431491852
Epoch [6/100], Loss: 0.6932854056358337
Epoch [7/100], Loss: 0.21020928025245667
Epoch [8/100], Loss: 0.3772299289703369
Epoch [9/100], Loss: 0.06534944474697113
Epoch [10/100], Loss: 0.5631225109100342
Epoch [11/100], Loss: 0.18681462109088898
Epoch [12/100], Loss: 0.028448201715946198
Epoch [13/100], Loss: 0.1722036451101303
Epoch [14/100], Loss: 0.06374713778495789
Epoch [15/100], Loss: 0.07380242645740509
Epoch [16/100], Loss: 0.0585782565176487
Epoch [17/100], Loss: 0.023339075967669487
Epoch [18/100], Loss: 0.014055788516998291
Epoch [19/100], Loss: 0.010937402956187725
Epoch [20/100], Loss: 0.013546172529459
Epoch [21/100], Loss: 0.00719092832878232
Epoch [22/100], Loss: 0.007199824322015047
Epoch [23/100], Loss: 0.01557792630046606
Epoch [24/100], Loss: 0.014306537806987762
Epoch [

#### generate text using the trained model 

In [10]:
# Generate text using the trained model (case-insensitive)
def generate_text(model, seed_text, vocab, max_length=50):
    with torch.no_grad():
        model.eval()  # Set the model in evaluation mode to disable gradient computation
        seed_tokens = (
            seed_text.lower().split()
        )  # Convert seed text to lowercase and split it into tokens
        input_seq = [
            vocab[word] for word in seed_tokens
        ]  # Convert seed tokens to their corresponding indices in the vocabulary
        input_seq = torch.tensor(input_seq).view(
            1, -1
        )  # Convert the input sequence to a tensor and reshape it to match model input
        hidden = None  # Initialize the hidden state (typically used in RNNs)
        generated_text = seed_text  # Initialize the generated text with the seed text

        for _ in range(max_length):  # Generate text up to the specified maximum length
            output, hidden = model(
                input_seq, hidden
            )  # Get model's output and update hidden state
            predicted_idx = torch.multinomial(
                torch.exp(output[0, -1, :]), 1
            )  # Sample a word index from the model's output
            predicted_word = [
                word for word, idx in vocab.items() if idx == predicted_idx.item()
            ][
                0
            ]  # Map the index back to a word
            generated_text += (
                " " + predicted_word
            )  # Append the predicted word to the generated text
            input_seq = torch.cat(
                (input_seq, predicted_idx.view(1, -1)), dim=1
            )  # Append the predicted index to the input sequence

    return generated_text  # Return the generated text


##### a negative review 

In [24]:
# Example of generating text for poor rating
seed_text = "they tore"
generated_text = generate_text(model, seed_text, vocab)
# print("Generated Text:", generated_text)

# Remove the remaining repeated words
generated_words = generated_text.split()
filtered_words = [generated_words[0]]
for i in range(1, len(generated_words)):
    if generated_words[i] != generated_words[i - 1]:
        filtered_words.append(generated_words[i])

filtered_generated_text = " ".join(filtered_words)
print("Generated Text:", filtered_generated_text)

Generated Text: they tore up after a little over a year, and i am not very strong... poor quality, also one of the handle got bent (but you can still work out with it). when they tear it can be painful, so i suggest if you must buy this brand, replace every year. measure


##### a positive review 

In [26]:
# Example of generating text for high rating
seed_text = "I just got this set yesterday"
generated_text = generate_text(model, seed_text, vocab)
#print("Generated Text:", generated_text)

# Remove repeated words
generated_words = generated_text.split()
filtered_words = [generated_words[0]]
for i in range(1, len(generated_words)):
    if generated_words[i] != generated_words[i - 1]:
        filtered_words.append(generated_words[i])

filtered_generated_text = " ".join(filtered_words)
print("Generated Text:", filtered_generated_text)

Generated Text: I just got this set yesterday as well as a set from another company so i could compare the quality. the other set seems to be made pretty well, but these from black mountain are higher quality. the nylon webbing and hardware are heavier and have an overall better quality look. i have been using resistance
