In [137]:
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

In [138]:
# Anchor every path on the directory the notebook is executed from
current_dir = Path.cwd()

# ../data/data.csv relative to that directory, normalised to an absolute path
data_path = current_dir.joinpath('..', 'data', 'data.csv').resolve()

In [139]:
# Report, one value per line: whether CUDA is usable, the CUDA version of this
# PyTorch build, and the cuDNN version
print(
    torch.cuda.is_available(),
    torch.version.cuda,
    torch.backends.cudnn.version(),
    sep='\n',
)

True
12.1
8907


In [140]:
# Load the reviews and keep only the columns used for summarisation.
# Built as one non-mutating chain: calling dropna(inplace=True) or assigning a
# column on a frame obtained by column selection can raise SettingWithCopyWarning
# and makes the cell unsafe to re-run.
review_columns = ["primaryCategories", "reviews.text", "reviews.rating"]
data = (
    pd.read_csv(data_path)[review_columns]
    # Drop rows with a missing value in any of the selected columns
    .dropna(subset=review_columns)
    # Ratings become strings so they group and compare consistently (e.g. with '5')
    .astype({"reviews.rating": str})
)

data.head()

Unnamed: 0,primaryCategories,reviews.text,reviews.rating
0,Health & Beauty,I order 3 of them and one of the item is bad q...,3
1,Health & Beauty,Bulk is always the less expensive way to go fo...,4
2,Health & Beauty,Well they are not Duracell but for the price i...,5
3,Health & Beauty,Seem to work as well as name brand batteries a...,5
4,Health & Beauty,These batteries are very long lasting the pric...,5


In [141]:
# Show the full text of the review whose index label is 3
print(data.loc[3, "reviews.text"])

Seem to work as well as name brand batteries at a much better price


In [142]:
# Count the reviews in each (category, rating) bucket before merging them
group_keys = ['primaryCategories', 'reviews.rating']
print("\nUnique Groups:\n", data.groupby(group_keys).size())

# Merge all reviews of a bucket into one space-separated string per bucket
grouped_reviews = (
    data.groupby(group_keys)['reviews.text']
    .apply(' '.join)
    .reset_index()
)

# Character length of each merged string, kept as a sanity check
grouped_reviews['text_length'] = grouped_reviews['reviews.text'].map(len)


Unique Groups:
 primaryCategories            reviews.rating
Animals & Pet Supplies       3                    1
                             4                    1
                             5                    4
Electronics                  1                  187
                             2                  183
                             3                  551
                             4                 3703
                             5                 9371
Electronics,Furniture        5                    2
Electronics,Media            1                    2
                             2                    2
                             3                    3
                             4                   42
                             5                  136
Health & Beauty              1                  751
                             2                  395
                             3                  534
                             4                 1385
   

In [143]:
# Display the first 10 grouped rows to verify the concatenation
# (shown as the cell's result; nothing is printed explicitly)
grouped_reviews.head(10)

Unnamed: 0,primaryCategories,reviews.rating,reviews.text,text_length
0,Animals & Pet Supplies,3,The price of the item was very good compared t...,225
1,Animals & Pet Supplies,4,I am very pleased with the dog crate. Very stu...,139
2,Animals & Pet Supplies,5,I replaced my cat's x-large litter box to this...,350
3,Electronics,1,We are unable to connect WIFI on this product....,51125
4,Electronics,2,My initial impression of this was very good. T...,49711
5,Electronics,3,If you're looking for something to keep your l...,108327
6,Electronics,4,"ASIDE FROM THE FACT THAT THE SCREEN IS SMALL, ...",668795
7,Electronics,5,Great case to keep everything in its place! My...,1390885
8,"Electronics,Furniture",5,After researching The Amazon Echo and the Goog...,342
9,"Electronics,Media",1,This is not an upgrade by any means! My three ...,621


In [145]:
# Pull out the merged review text of a single (category, rating) group
category = 'Electronics'
rating = '5'

is_selected_group = (
    (grouped_reviews['primaryCategories'] == category)
    & (grouped_reviews['reviews.rating'] == rating)
)
specific_group_text = grouped_reviews.loc[is_selected_group, 'reviews.text'].values[0]
print(f"Concatenated reviews for {category} with rating {rating}:\n")
print(specific_group_text[:6000])  # only the first 6000 characters

Concatenated reviews for Electronics with rating 5:



In [146]:
# Show the full merged text of the grouped row whose index label is 3
print(grouped_reviews.loc[3, "reviews.text"])

We are unable to connect WIFI on this product. We have tried to follow the instructions to no avail. It is a useless and expensive item sitting on our table. Since I purchased returned couple of times now is dead I have to return it again. My first problem didn't paired. This review can not be valid due to the device did not work for me. I would put it down. Just upgraded to the ECHO instead of the tap. We were so excited to get the tap since it was a larger speaker than the dot and unlike the echo has a battery charger with it to make it portable. We already have an echo which works perfectly in our home. When listening to online radio on the tap it cut in and out constantly. We returned it and exchanged gor the dot. hard to use, and does not support Guam. online instructions not clear It's very tin sounding no bass sounds terrible bought a bose sound link and a amazon dot to hook up to it the speaker voice quality is terrible compare the similar size my logitech UE BOOM.the price is 

In [147]:
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint
checkpoint_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

In [148]:
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)
print(device)

cuda


## First try: one summarization pass per group (input truncated to 1024 tokens)

In [127]:
def summarize_text(text):
    """Summarize ``text`` in a single pass with the globally loaded model.

    The input is tokenized with truncation at 1024 tokens, so anything past
    that limit is ignored. Generation uses beam search with 4 beams and
    length limits of 30 (minimum) and 150 (maximum).

    Args:
        text: The string to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    encoded = tokenizer(text, max_length=1024, return_tensors='pt', truncation=True).to(device)
    generation_options = {
        'max_length': 150,
        'min_length': 30,
        'length_penalty': 2.0,
        'num_beams': 4,
        'early_stopping': True,
    }
    generated_ids = model.generate(encoded['input_ids'], **generation_options)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [128]:
# Summarize the merged review text of every group (one model call per row)
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(summarize_text)

In [129]:
# Print each group's category and rating followed by its summary
for category_name, stars, summary_text in zip(
    grouped_reviews['primaryCategories'],
    grouped_reviews['reviews.rating'],
    grouped_reviews['summary'],
):
    print(f"Primary Category: {category_name}, Star Rating: {stars}")
    print(f"Summary: {summary_text}\n")

Primary Category: Animals & Pet Supplies, Star Rating: 3
Summary: The price of the item was very good compared to other cages however the item itself came tilted so now my dog has a tilted dog crate. The tilt to the gates are noticeable but not very drastic. For the price I'll live with it.

Primary Category: Animals & Pet Supplies, Star Rating: 4
Summary: I am very pleased with the dog crate. Very sturdy. No damage or flaws. Was better than some others that I have purchased in a retail store.

Primary Category: Animals & Pet Supplies, Star Rating: 5
Summary: I replaced my cat's x-large litter box to this smaller sized litter box... It's easy to clean and takes less kitty litter to fill it which I like Great price, good value!

Primary Category: Electronics, Star Rating: 1
Summary: We are unable to connect WIFI on this product. We have tried to follow the instructions to no avail. It is a useless and expensive item sitting on our table. Since I purchased returned couple of times now is

In [38]:
# Write the grouped reviews and their summaries to a CSV file in the current
# working directory (the row index is not written)
output_path = current_dir.joinpath('summarized_reviews.csv')
grouped_reviews.to_csv(path_or_buf=output_path, index=False)
print(f"Summarized reviews saved to {output_path}")

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\Pedro\\Documents\\Ironhack\\SixthWeek\\Project_2\\project-2-nlp-business-case-automated-customers-reviews\\solution\\transformers\\summarized_reviews.csv'

In [26]:
# Read the saved summaries back from the CSV at output_path
# (output_path must already be defined by the cell that saves the file)
data_summaries = pd.read_csv(output_path)

In [28]:
# Preview the first rows of the reloaded summaries
data_summaries.head()

Unnamed: 0,primaryCategories,reviews.rating,reviews.text,summary
0,Animals & Pet Supplies,3,The price of the item was very good compared t...,The price of the item was very good compared t...
1,Animals & Pet Supplies,4,I am very pleased with the dog crate. Very stu...,summarize: I am very pleased with the dog crat...
2,Animals & Pet Supplies,5,I replaced my cat's x-large litter box to this...,"Sturdy, lightweight, tops lifts for easy acces..."
3,Electronics,1,We are unable to connect WIFI on this product....,This review can not be valid due to the device...
4,Electronics,2,My initial impression of this was very good. T...,There are 50 pages with 8 disc pockets each (4...


In [130]:
# Show the start of each group's merged reviews next to its summary
for category_name, stars, merged_text, summary_text in zip(
    grouped_reviews['primaryCategories'],
    grouped_reviews['reviews.rating'],
    grouped_reviews['reviews.text'],
    grouped_reviews['summary'],
):
    print(f"Primary Category: {category_name}, Star Rating: {stars}")
    print(f"Original Reviews: {merged_text[:700]}...")  # only the first 700 characters
    print(f"Summary: {summary_text}\n")

Primary Category: Animals & Pet Supplies, Star Rating: 3
Original Reviews: The price of the item was very good compared to other cages however the item itself came tilted so now my dog has a tilted dog crate. The tilt to the gates are noticeable but not very drastic. For the price I'll live with it....
Summary: The price of the item was very good compared to other cages however the item itself came tilted so now my dog has a tilted dog crate. The tilt to the gates are noticeable but not very drastic. For the price I'll live with it.

Primary Category: Animals & Pet Supplies, Star Rating: 4
Original Reviews: I am very pleased with the dog crate. Very sturdy. No damage or flaws. Was better than some others that I have purchased in a retail store....
Summary: I am very pleased with the dog crate. Very sturdy. No damage or flaws. Was better than some others that I have purchased in a retail store.

Primary Category: Animals & Pet Supplies, Star Rating: 5
Original Reviews: I replaced my cat

## Second try: split long texts into chunks and combine the chunk summaries

In [149]:
# Function to summarize text, handling long texts by chunking
def summarize_text(text, max_chunk_length=1024, summary_max_length=150, summary_min_length=30):
    """Summarize text of any length by chunking it and merging the chunk summaries.

    The text is tokenized once and cut into windows that fit within
    ``max_chunk_length`` tokens once the tokenizer's special tokens are added.
    Each window is summarized separately and the partial summaries are joined
    with single spaces. If the joined text is still longer than
    ``max_chunk_length`` tokens, it is summarized again, but only while the
    previous pass actually made the text shorter, so the recursion always ends.

    Args:
        text: The string to summarize.
        max_chunk_length: Maximum number of tokens fed to the model per chunk.
        summary_max_length: ``max_length`` passed to ``model.generate``.
        summary_min_length: ``min_length`` passed to ``model.generate``.

    Returns:
        The combined summary string (empty when ``text`` yields no tokens).
    """
    # Leave room for the special tokens added when each chunk is re-encoded below.
    tokens_per_chunk = max(1, max_chunk_length - tokenizer.num_special_tokens_to_add())

    # Chunk on token boundaries. Slicing the raw string would measure characters
    # instead of tokens and could cut words in half.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = [
        tokenizer.decode(
            token_ids[start:start + tokens_per_chunk],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,  # keep the chunk text faithful to the input
        )
        for start in range(0, len(token_ids), tokens_per_chunk)
    ]

    # Summarize each chunk
    chunk_summaries = []
    for chunk in chunks:
        # truncation stays on as a safety net in case re-encoding the decoded
        # chunk yields a few more tokens than the window it came from
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        summary_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length,
            min_length=summary_min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        chunk_summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # Combine the summaries; summarize again only if that is needed and useful
    combined_summary = ' '.join(chunk_summaries)
    combined_length = len(tokenizer.encode(combined_summary))
    if max_chunk_length < combined_length < len(token_ids):
        return summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length)
    return combined_summary

In [153]:
import gc

# Force a full garbage-collection pass; the value displayed by the cell is
# gc.collect()'s return value (the number of unreachable objects it found)
gc.collect()

0

In [150]:
# Summarize the merged review text of every group with the chunking summarizer
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(summarize_text)

In [None]:
# Print each group's category and rating followed by its summary
for category_name, stars, summary_text in zip(
    grouped_reviews['primaryCategories'],
    grouped_reviews['reviews.rating'],
    grouped_reviews['summary'],
):
    print(f"Primary Category: {category_name}, Star Rating: {stars}")
    print(f"Summary: {summary_text}\n")

In [None]:
# Write the grouped reviews and their summaries to a CSV file in the current
# working directory (the row index is not written)
output_path = current_dir.joinpath('summarized_reviews.csv')
grouped_reviews.to_csv(path_or_buf=output_path, index=False)
print(f"Summarized reviews saved to {output_path}")

In [151]:
# Show the start of each group's merged reviews next to its summary
for category_name, stars, merged_text, summary_text in zip(
    grouped_reviews['primaryCategories'],
    grouped_reviews['reviews.rating'],
    grouped_reviews['reviews.text'],
    grouped_reviews['summary'],
):
    print(f"Primary Category: {category_name}, Star Rating: {stars}")
    print(f"Original Reviews: {merged_text[:700]}...")  # only the first 700 characters
    print(f"Summary: {summary_text}\n")

Primary Category: Animals & Pet Supplies, Star Rating: 3
Original Reviews: The price of the item was very good compared to other cages however the item itself came tilted so now my dog has a tilted dog crate. The tilt to the gates are noticeable but not very drastic. For the price I'll live with it....
Summary: The price of the item was very good compared to other cages however the item itself came tilted so now my dog has a tilted dog crate. The tilt to the gates are noticeable but not very drastic. For the price I'll live with it.

Primary Category: Animals & Pet Supplies, Star Rating: 4
Original Reviews: I am very pleased with the dog crate. Very sturdy. No damage or flaws. Was better than some others that I have purchased in a retail store....
Summary: I am very pleased with the dog crate. Very sturdy. No damage or flaws. Was better than some others that I have purchased in a retail store.

Primary Category: Animals & Pet Supplies, Star Rating: 5
Original Reviews: I replaced my cat

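## Many tries later: word-level chunking by token count, batched generation, and the `mabrouk/amazon-review-summarizer-bart` checkpoint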

In [None]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import os

# Debugging aid: make CUDA kernel launches synchronous so errors surface at the
# failing call. Set before any CUDA call in this cell.
# NOTE(review): presumably has no effect if this kernel already initialised CUDA
# in an earlier cell - confirm, and remove once debugging is finished.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

# Select the relevant columns, drop incomplete rows and store ratings as strings.
# Built as one non-mutating chain: dropna(inplace=True) and column assignment on
# a frame obtained by column selection can raise SettingWithCopyWarning.
review_columns = ["primaryCategories", "reviews.text", "reviews.rating"]
data = (
    pd.read_csv(data_path)[review_columns]
    .dropna(subset=review_columns)
    # String ratings compare consistently with the `rating` filter below
    .astype({"reviews.rating": str})
)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group into one space-separated string
grouped_reviews = (
    filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text']
    .apply(lambda x: ' '.join(x))
    .reset_index()
)

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Release cached GPU memory left over from earlier runs
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on maximum length
def split_into_chunks(text, max_length):
    """Split ``text`` into whitespace-joined chunks of at most ``max_length`` tokens.

    Words (whitespace-separated) are never split. Each word's size is the number
    of tokens the global ``tokenizer`` produces for it without special tokens,
    and words are packed greedily in order. A single word that is larger than
    ``max_length`` becomes a chunk of its own; no empty chunk is emitted for it.

    Args:
        text: The string to split.
        max_length: Token budget per chunk.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for word in text.split():
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        # Flush only a non-empty chunk: an oversized word arriving while the
        # chunk is empty must not produce an empty-string chunk.
        if current_chunk and current_length + word_length > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize text by chunking with batch processing
def summarize_text(text, max_chunk_length=1024, summary_max_length=150, summary_min_length=30, batch_size=8):
    """Summarize text of any length by chunking, batched generation and merging.

    The text is split with ``split_into_chunks``, the chunks are summarized in
    batches of ``batch_size``, and the partial summaries are joined with single
    spaces. If the joined text is still longer than ``max_chunk_length`` tokens,
    it is summarized again, but only while the previous pass made the text
    shorter, so the recursion always terminates.

    Args:
        text: The string to summarize.
        max_chunk_length: Maximum number of tokens fed to the model per chunk,
            including the special tokens added by the tokenizer.
        summary_max_length: ``max_length`` passed to ``model.generate``.
        summary_min_length: ``min_length`` passed to ``model.generate``.
        batch_size: Number of chunks summarized per ``model.generate`` call.

    Returns:
        The combined summary string (empty when ``text`` contains no words).
    """
    # Reserve room for the special tokens added at encoding time; otherwise a
    # full chunk exceeds max_chunk_length and its tail is silently truncated.
    chunk_budget = max(1, max_chunk_length - tokenizer.num_special_tokens_to_add())
    # Skip empty chunks so the model is never asked to summarize nothing.
    chunks = [chunk for chunk in split_into_chunks(text, chunk_budget) if chunk]

    chunk_summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        # truncation stays on as a safety net: word-by-word token counts may
        # differ slightly from the count of the joined chunk
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)

        # Generate summaries for the batch
        summary_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],  # needed because the batch is padded
            max_length=summary_max_length,
            min_length=summary_min_length,
            length_penalty=1.0,
            num_beams=6,
            no_repeat_ngram_size=3,  # Prevent repetition of phrases
            early_stopping=True
        )

        # Decode the summaries
        chunk_summaries.extend(tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids)

    # Combine the summaries and summarize them again if necessary
    combined_summary = ' '.join(chunk_summaries)
    still_too_long = len(tokenizer.encode(combined_summary)) > max_chunk_length
    # Recurse only when this pass shortened the text; the strictly decreasing
    # character length guarantees the recursion ends.
    if still_too_long and len(combined_summary) < len(text):
        return summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length, batch_size)
    return combined_summary

# Function to remove repetitive phrases
def remove_repetitive_phrases(text):
    """Drop repeated sentences from ``text``, keeping the first occurrence of each.

    Sentences are the pieces obtained by splitting on ``'. '``. Two pieces count
    as the same sentence when they are equal after removing surrounding
    whitespace and trailing periods. This matters for the last piece, which
    keeps its closing period after the split and would otherwise never match
    an earlier occurrence.

    Args:
        text: The text to de-duplicate.

    Returns:
        The remaining sentences joined with ``'. '``. If ``text`` ended with a
        period and the final sentence was removed as a duplicate, the closing
        period is restored.
    """
    unique_sentences = []
    seen_sentences = set()

    for sentence in text.split('. '):
        # Compare on a normalised key so "Great tablet" and "Great tablet." match
        key = sentence.strip().rstrip('.')
        if key in seen_sentences:
            continue
        seen_sentences.add(key)
        unique_sentences.append(sentence)

    result = '. '.join(unique_sentences)
    # Keep the text properly terminated when the dropped duplicate was the last sentence
    if text.rstrip().endswith('.') and not result.rstrip().endswith('.'):
        result += '.'
    return result

# Summarize the selected group's merged reviews, then strip repeated sentences
grouped_reviews['summary'] = grouped_reviews['reviews.text'].map(
    lambda merged_reviews: remove_repetitive_phrases(summarize_text(merged_reviews))
)

# Report each group's text and summary together with their character lengths
for category_name, stars, merged_text, summary_text in zip(
    grouped_reviews['primaryCategories'],
    grouped_reviews['reviews.rating'],
    grouped_reviews['reviews.text'],
    grouped_reviews['summary'],
):
    print(f"Primary Category: {category_name}, Star Rating: {stars}")
    print(f"Original Length: {len(merged_text)} characters")
    print(f"Summary Length: {len(summary_text)} characters")
    print(f"Original Reviews: {merged_text[:1000]}...")  # only the first 1000 characters
    print(f"Summary: {summary_text}\n")