In [7]:
# prompt: import google drive
# Mount Google Drive at /content/drive (Colab only). The os.chdir() call in a
# later cell points into this mount, so this cell has to run first.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!pip install keras-preprocessing



In [9]:
import tensorflow as tf


In [10]:
import os
# Work from the project folder on the mounted Drive so that the relative path
# "Amazon_Comments.csv" used when loading the data resolves.
# NOTE(review): this absolute path matches one particular Drive layout —
# adjust it when running the notebook from another account or machine.
os.chdir('/content/drive/MyDrive/Text Analytics/HW2')

In [11]:
# Import the 'random' module
import random

# Import the 'numpy' module with the alias 'np'
import numpy as np

# Import the 'keras' library
import keras

# Import the 'pad_sequences' function from 'keras_preprocessing.sequence'
from keras_preprocessing.sequence import pad_sequences

# Import the 'Sequential' class from 'keras.models'
from keras.models import Sequential

# Import necessary layers from 'keras.layers'
from keras.layers import Embedding, LSTM, Dense

# Import the 'to_categorical' function from 'keras.utils'
from keras.utils import to_categorical

# Import the 'Tokenizer' class from 'keras.preprocessing.text'
from keras.preprocessing.text import Tokenizer

# Import the 'drive' module from 'google.colab'
from google.colab import drive

# Import the 'pandas' library with the alias 'pd'
import pandas as pd

# Mount the Google Drive to access files.
# NOTE(review): Drive is already mounted by the first cell (the recorded output
# of this cell reports "Drive already mounted"), so on a top-to-bottom run this
# call does nothing new.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Load the Amazon reviews; the export uses "^" as its field separator.
amazon_reviews = pd.read_csv("Amazon_Comments.csv", delimiter="^")

# Give the columns descriptive names (the list must match the file's 7 columns).
amazon_reviews.columns = [
    "ProductID",
    "ReviewID",
    "ReviewTitle",
    "ReviewTime",
    "Verified",
    "ReviewContent",
    "ReviewRating",
]

# Label every review: a rating below 3 is "negative", anything else is
# "positive". This is the same rule as a per-row `x < 3` test (a missing
# rating compares False and therefore lands in "positive"), but vectorised.
amazon_reviews["Sentiment"] = np.where(
    amazon_reviews["ReviewRating"] < 3, "negative", "positive"
)

# Split the frame by sentiment
negative = amazon_reviews[amazon_reviews["Sentiment"] == "negative"]
positive = amazon_reviews[amazon_reviews["Sentiment"] == "positive"]

# Plain Python lists of review bodies for the tokenizers. Rows with no review
# text are dropped and the rest coerced to str, because the tokenizer expects
# strings and a stray NaN (a float) would make it fail.
negative_review_texts = negative["ReviewContent"].dropna().astype(str).tolist()
positive_review_texts = positive["ReviewContent"].dropna().astype(str).tolist()

# Show the class sizes and a short preview only — printing every review floods
# the notebook output and bloats the saved file.
print(
    f"{len(negative_review_texts)} negative / "
    f"{len(positive_review_texts)} positive reviews"
)
print("Negative review: ", negative_review_texts[:3])
print("Positive review: ", positive_review_texts[:3])




In [14]:
# One vocabulary per sentiment class: each tokenizer is fitted only on the
# reviews of its own class.
tokenizer_negative_reviews, tokenizer_positive_reviews = Tokenizer(), Tokenizer()
tokenizer_negative_reviews.fit_on_texts(negative_review_texts)
tokenizer_positive_reviews.fit_on_texts(positive_review_texts)

# Vocabulary sizes; the extra 1 leaves room for index 0, which appears in the
# padded sequences as the padding value.
total_words_negative_reviews = 1 + len(tokenizer_negative_reviews.word_index)
total_words_positive_reviews = 1 + len(tokenizer_positive_reviews.word_index)

# Report both sizes, negative first, one per line
print(total_words_negative_reviews, total_words_positive_reviews, sep="\n")


1695
4805


In [15]:
# Define a function to generate sequences
def generate_sequence_list(tokenizer, data):
    """Expand every text into its growing token prefixes.

    Each text is encoded with ``tokenizer.texts_to_sequences`` and then
    contributes its prefixes of length 2, 3, ..., up to the full token list.
    Texts that encode to fewer than two tokens contribute nothing.

    Args:
        tokenizer: object exposing ``texts_to_sequences(list_of_texts)``.
        data: iterable of texts.

    Returns:
        A flat list of token-id lists, in the order the texts were given.
    """
    prefixes = []
    for review in data:
        encoded = tokenizer.texts_to_sequences([review])[0]
        # The shortest useful prefix has 2 tokens: one of context, one to predict
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes


# Fixed row width: every prefix sequence is brought to exactly this many tokens
max_sequence_length = 30

# Negative reviews: expand into token prefixes, then left-pad to the fixed width
sequence_list_negative = generate_sequence_list(
    tokenizer_negative_reviews, negative_review_texts
)
padded_sequences_negative = pad_sequences(
    sequence_list_negative, padding="pre", maxlen=max_sequence_length
)

# Positive reviews: same two steps with the positive-review tokenizer
sequence_list_positive = generate_sequence_list(
    tokenizer_positive_reviews, positive_review_texts
)
padded_sequences_positive = pad_sequences(
    sequence_list_positive, padding="pre", maxlen=max_sequence_length
)

# Show both padded matrices, negative first
print(padded_sequences_negative, padded_sequences_positive, sep="\n")


[[   0    0    0 ...    0   34  693]
 [   0    0    0 ...   34  693   41]
 [   0    0    0 ...  693   41   31]
 ...
 [   0    0    0 ...   18  277 1694]
 [   0    0  277 ...  277 1694   78]
 [   0  277  381 ... 1694   78  404]]
[[   0    0    0 ...    0    2   35]
 [   0    0    0 ...    2   35  109]
 [   0    0    0 ...   35  109    7]
 ...
 [   0    0    0 ...   36 4804   25]
 [   0    0    0 ...    0   27   12]
 [   0    0    0 ...    0   31   24]]


In [16]:
# Split every padded row into the model input (all tokens but the last) and
# the prediction target (the last token).
X_negative, y_negative = (
    padded_sequences_negative[:, :-1],
    padded_sequences_negative[:, -1],
)
X_positive, y_positive = (
    padded_sequences_positive[:, :-1],
    padded_sequences_positive[:, -1],
)

# One-hot encode the targets over each class's own vocabulary size
y_negative = to_categorical(y_negative, num_classes=total_words_negative_reviews)
y_positive = to_categorical(y_positive, num_classes=total_words_positive_reviews)

In [17]:
# Import necessary modules from TensorFlow/Keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define a function to create an LSTM model
def create_lstm_model(total_words, sequence_length, embedding_dim=100, lstm_units=100):
    """Build and compile a next-word-prediction LSTM.

    The network is Embedding -> LSTM -> Dense(softmax) and is compiled with
    categorical cross-entropy and the Adam optimizer, so it expects one-hot
    encoded targets of width ``total_words``.

    Args:
        total_words: vocabulary size; used as the embedding input dimension
            and as the number of softmax outputs.
        sequence_length: length of a full training sequence including the
            target token; the model input is one token shorter.
        embedding_dim: size of each word embedding vector (default 100, the
            value that used to be hard-coded).
        lstm_units: number of LSTM units (default 100, the value that used to
            be hard-coded).

    Returns:
        The compiled, untrained model.
    """
    model = Sequential()
    # The last token of every sequence is the target, hence sequence_length - 1
    model.add(Embedding(total_words, embedding_dim, input_length=sequence_length - 1))
    model.add(LSTM(lstm_units))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Seed Python, NumPy and TensorFlow before any weights are initialised so that
# a Restart & Run All gives repeatable models (GPU kernels may still introduce
# small nondeterminism).
tf.keras.utils.set_random_seed(42)

# Create one LSTM language model per sentiment class, each sized to its own vocabulary
model_negative_reviews = create_lstm_model(total_words_negative_reviews, max_sequence_length)
model_positive_reviews = create_lstm_model(total_words_positive_reviews, max_sequence_length)

# Train each model on the prefix -> next-word pairs of its own class
model_negative_reviews.fit(X_negative, y_negative, epochs=10, verbose=1)
model_positive_reviews.fit(X_positive, y_positive, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e64045e1d50>

In [18]:
# Define a function to generate text using a given model
def generate_text_from_model(seed_text, num_words_to_generate, model, max_seq_length, tokenizer):
    """Greedily extend ``seed_text`` one word at a time with a next-word model.

    On every step the text generated so far is tokenized, left-padded to
    ``max_seq_length - 1`` tokens and passed to ``model.predict``; the word
    with the highest predicted probability is appended.

    Args:
        seed_text: starting text.
        num_words_to_generate: maximum number of words to append.
        model: object whose ``predict(batch, verbose=0)`` returns next-word
            probabilities for a batch of one padded sequence.
        max_seq_length: full training sequence length (input plus target token).
        tokenizer: object exposing ``texts_to_sequences`` and ``word_index``.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Fewer than ``num_words_to_generate`` words are appended if the
        model predicts an index that has no word (such as the padding index 0).
    """
    # Build the index -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    generated_text = seed_text
    for _ in range(num_words_to_generate):
        # Encode everything generated so far and left-pad it to the model input width
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_length - 1, padding='pre')
        # Greedy decoding: take the single most probable next word
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_index = int(np.argmax(predicted_probs))
        predicted_word = index_to_word.get(predicted_index)
        if not predicted_word:
            # No word for this index. Appending nothing would leave the model
            # input unchanged, so the remaining steps would repeat the same
            # prediction and only pile up trailing spaces — stop here.
            break
        generated_text += " " + predicted_word
    return generated_text

In [19]:
# Seed phrases that steer each model towards its own sentiment
initial_seed_negative = "The purchased product is bad because"
initial_seed_positive = "The purchased product is good because"

# Extend the negative seed by 10 words using the negative-review model
fake_negative_review = generate_text_from_model(
    seed_text=initial_seed_negative,
    num_words_to_generate=10,
    model=model_negative_reviews,
    max_seq_length=max_sequence_length,
    tokenizer=tokenizer_negative_reviews,
)

# Extend the positive seed by 10 words using the positive-review model
fake_positive_review = generate_text_from_model(
    seed_text=initial_seed_positive,
    num_words_to_generate=10,
    model=model_positive_reviews,
    max_seq_length=max_sequence_length,
    tokenizer=tokenizer_positive_reviews,
)

# Print each generated review under its heading, negative first
print("Negative Review:", fake_negative_review, sep="\n")
print("\nPositive Review:", fake_positive_review, sep="\n")


Negative Review:
The purchased product is bad because the water is i have a water is i have

Positive Review:
The purchased product is good because of the price is great for the price i have
