In [None]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, LEDForConditionalGeneration
from tqdm import tqdm
from joblib import Parallel, delayed

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Group reviews by primary category and star rating and concatenate all reviews within each group
grouped_reviews = data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on sentences
def split_into_chunks(text, max_chunk_length):
    sentences = text.split('. ')
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(tokenizer.encode(sentence))
        if current_length + sentence_length <= max_chunk_length:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:
            chunks.append('. '.join(current_chunk) + '.')
            current_chunk = [sentence]
            current_length = sentence_length
    if current_chunk:
        chunks.append('. '.join(current_chunk) + '.')
    return chunks

# Function to summarize text, handling long texts by chunking
def summarize_text(text, max_chunk_length=4096, summary_max_length=50, summary_min_length=20):
    chunks = split_into_chunks(text, max_chunk_length)
    
    # Summarize each chunk
    chunk_summaries = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        summary_ids = model.generate(inputs['input_ids'], max_length=summary_max_length, min_length=summary_min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        chunk_summaries.append(summary)
    
    # Combine the summaries and summarize them if necessary
    combined_summary = ' '.join(chunk_summaries)
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length)
    else:
        final_summary = combined_summary
    
    return final_summary

# Apply summarization to each group of reviews using parallel processing
def process_row(row):
    return summarize_text(row['reviews.text'])

grouped_reviews['summary'] = Parallel(n_jobs=-1, timeout=None)(delayed(process_row)(row) for idx, row in tqdm(grouped_reviews.iterrows(), total=grouped_reviews.shape[0]))

# Display the summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Summary: {row['summary']}\n")

# Visualize the reviews and their summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews
    print(f"Summary: {row['summary']}\n")


Using device: cuda


100%|██████████| 32/32 [00:01<00:00, 16.39it/s]


In [1]:
# Save the summarized reviews to a CSV file
output_path = current_dir / 'summarized_reviews.csv'
grouped_reviews.to_csv(output_path, index=False)
print(f"Summarized reviews saved to {output_path}")

NameError: name 'current_dir' is not defined

In [1]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, LEDForConditionalGeneration
from tqdm import tqdm

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on sentences
def split_into_chunks(text, max_chunk_length):
    sentences = text.split('. ')
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(tokenizer.encode(sentence))
        if current_length + sentence_length <= max_chunk_length:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:
            chunks.append('. '.join(current_chunk) + '.')
            current_chunk = [sentence]
            current_length = sentence_length
    if current_chunk:
        chunks.append('. '.join(current_chunk) + '.')
    return chunks

# Function to summarize text, handling long texts by chunking
def summarize_text(text, max_chunk_length=4096, summary_max_length=50, summary_min_length=20):
    chunks = split_into_chunks(text, max_chunk_length)
    
    # Summarize each chunk
    chunk_summaries = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        summary_ids = model.generate(inputs['input_ids'], max_length=summary_max_length, min_length=summary_min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        chunk_summaries.append(summary)
    
    # Combine the summaries and summarize them if necessary
    combined_summary = ' '.join(chunk_summaries)
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length)
    else:
        final_summary = combined_summary
    
    return final_summary

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

# Display the summary for the selected combination
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Summary: {row['summary']}\n")

# Visualize the reviews and their summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews
    print(f"Summary: {row['summary']}\n")


Using device: cuda


Input ids are automatically padded from 3891 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3858 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3851 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3842 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3811 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3875 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3849 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3806 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3810 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3859 to 4096 to

Primary Category: Electronics, Star Rating: 5
Summary: ) Buy, etc.) I noticed a cheap laptop sleeve (as in cheap quality) was nearly 16! I was like HECK NO! So I jumped on Amazon (btw amazon is the best shoutout to all the hard workers . Nice length on cord. Quality is outstanding! Would I recommend this product Absolutely! Worked on an android phone on a recent trip when a friend forgot their charger. Works great on my Kindle Fire as well! Just what I needed!  is better than the Echo! Portable - YES! Sound Quality - OFF THE CHARTS! I blue-tooth this to my computer and listen to movies with GREAT Dolby sound quality. Great sound in a small package for the money and It does a great job of being portable and easy to use. It does a great job of being portable and easy to use. It does a great job of being portable and easy to use. It does a great job of being portable and ...but you will lose bass if it's at max volume (not recommended if you like bass) We use this speaker all the time. It

In [5]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, LEDForConditionalGeneration
from tqdm import tqdm

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on sentences
def split_into_chunks(text, max_chunk_length):
    sentences = text.split('. ')
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(tokenizer.encode(sentence))
        if current_length + sentence_length <= max_chunk_length:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:
            chunks.append('. '.join(current_chunk) + '.')
            current_chunk = [sentence]
            current_length = sentence_length
    if current_chunk:
        chunks.append('. '.join(current_chunk) + '.')
    return chunks

# Function to summarize text, handling long texts by chunking
def summarize_text(text, max_chunk_length=4096, summary_max_length=50, summary_min_length=10):
    chunks = split_into_chunks(text, max_chunk_length)
    
    # Summarize each chunk
    chunk_summaries = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        summary_ids = model.generate(inputs['input_ids'], max_length=summary_max_length, min_length=summary_min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        chunk_summaries.append(summary)
    
    # Combine the summaries and summarize them if necessary
    combined_summary = ' '.join(chunk_summaries)
    while len(tokenizer.encode(combined_summary)) > summary_max_length:
        inputs = tokenizer(combined_summary, return_tensors='pt', truncation=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        summary_ids = model.generate(inputs['input_ids'], max_length=summary_max_length, min_length=summary_min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
        combined_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return combined_summary

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

# Display the summary for the selected combination
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Summary: {row['summary']}\n")

# Visualize the reviews and their summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews
    print(f"Summary: {row['summary']}\n")


Using device: cuda
Primary Category: Electronics, Star Rating: 5
Summary: ) Buy, etc.) I noticed a cheap laptop sleeve (as in cheap quality) was nearly 16! I was like HECK NO! So I jumped on Amazon (btw amazon is the best shoutout to all the hard workers

Primary Category: Electronics, Star Rating: 5
Original Reviews: Great case to keep everything in its place! My husband love it!!!! Holds a lot of cds! After discarding and getting rid of broken cd cases, broken cds, and selecting those ones we really like, this binder turned up to be an excellent option to store our favourite cds and dvds and keep them in a small space at our living room, giving us the choice to donate or get rid of those cds towers that took a lot of room, despite looking nice. And because you can turn the pages, you can spot the cd you want to play without the hassle of taking it from a case that is falling apart. After storing them cds, all the cases and printed covers went straight to the recycling centre. We have

In [12]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, BartForConditionalGeneration

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Temporarily use CPU to check if it's a GPU issue
device = 'cpu'
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on maximum length
def split_into_chunks(text, max_length):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        if current_length + word_length <= max_length:
            current_chunk.append(word)
            current_length += word_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize text by chunking
def summarize_text(text, max_chunk_length=1024, summary_max_length=150, summary_min_length=30):
    chunks = split_into_chunks(text, max_chunk_length)
    
    # Summarize each chunk
    chunk_summaries = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        # Ensure correct tensor shapes and types
        try:
            summary_ids = model.generate(
                inputs['input_ids'], 
                max_length=summary_max_length, 
                min_length=summary_min_length, 
                length_penalty=2.0, 
                num_beams=4, 
                early_stopping=True
            )
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            chunk_summaries.append(summary)
        except RuntimeError as e:
            print(f"Error processing chunk: {e}")
            continue
    
    # Combine the summaries and summarize them if necessary
    combined_summary = ' '.join(chunk_summaries)
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length)
    else:
        final_summary = combined_summary
    
    return final_summary

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

# Display the summary for the selected combination
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Summary: {row['summary']}\n")

# Visualize the reviews and their summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews
    print(f"Summary: {row['summary']}\n")



Using device: cpu


KeyboardInterrupt: 

In [7]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, LEDForConditionalGeneration
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from tqdm import tqdm

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Function to clean and tokenize text
def clean_and_tokenize(text):
    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# Function to extract key sentences
def extract_key_sentences(text, num_sentences=5):
    # Split text into sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    
    # Clean and tokenize sentences
    cleaned_sentences = [clean_and_tokenize(sentence) for sentence in sentences]
    
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer().fit_transform(cleaned_sentences)
    vectors = vectorizer.toarray()
    
    # Calculate sentence importance
    sentence_scores = linear_kernel(vectors, vectors).sum(axis=1)
    
    # Extract top sentences
    top_sentence_indices = sentence_scores.argsort()[-num_sentences:]
    top_sentences = [sentences[i] for i in top_sentence_indices]
    
    return ' '.join(top_sentences)

# Apply extraction and summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: extract_key_sentences(x))

# Display the summary for the selected combination
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Summary: {row['summary']}\n")

# Visualize the reviews and their summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews
    print(f"Summary: {row['summary']}\n")


Primary Category: Electronics, Star Rating: 5
Summary: She loves it and it is easy for her to use I love my new Fire HD! I had one of the first generation Kindle Fire, and loved it, love this one even more!!! This 8" tablet is much better than the older 7" tablet. Bought this for an older person and it's a simple tablet to use at their age she loves it Bought this tablet for my daughter for christmas, great quality for the price! Clear picture, easy to use and easy to set up, great for surfing the web or watching movies on Fire is great. Great tablet for the grandkids, they love it great item Great tablet for the kids and adults great features like the rubber case and free features for kids I bought this tablet for my daughter and she loves it I buy this tablet for my nephew and he is happy with it. No problems with tablet, kids love it and the size is great My wife and kids love easy to use and the size is perfect I was going to get a Samsung tablet but for doing everything this is be

In [1]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, BartForConditionalGeneration

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Optionally, delete the model and re-load it
del model
torch.cuda.empty_cache()

# Re-load the model and tokenizer
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on maximum length
def split_into_chunks(text, max_length):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        if current_length + word_length <= max_length:
            current_chunk.append(word)
            current_length += word_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize text by chunking
def summarize_text(text, max_chunk_length=1024, summary_max_length=150, summary_min_length=30):
    chunks = split_into_chunks(text, max_chunk_length)
    
    # Summarize each chunk
    chunk_summaries = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        # Ensure correct tensor shapes and types
        try:
            summary_ids = model.generate(
                inputs['input_ids'], 
                max_length=summary_max_length, 
                min_length=summary_min_length, 
                length_penalty=2.0, 
                num_beams=4, 
                early_stopping=True
            )
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            chunk_summaries.append(summary)
        except RuntimeError as e:
            print(f"Error processing chunk: {e}")
            continue
    
    # Combine the summaries and summarize them if necessary
    combined_summary = ' '.join(chunk_summaries)
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length)
    else:
        final_summary = combined_summary
    
    return final_summary

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

# Display the summary for the selected combination
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Summary: {row['summary']}\n")

# Visualize the reviews and their summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews
    print(f"Summary: {row['summary']}\n")


Using device: cuda


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Primary Category: Electronics, Star Rating: 5
Summary: Amazon Tap Bluetooth and wifi wireless speaker works as advertised. Easy set up, tons of music, easy to use. Great range for asking questions.. Very enjoyable and easy for listening to music selections online. The Kindle Voyage 6 is better than the real thing. This is my third Kindle, I still have the other two; however, this has a camera and much larger screen. Bigger screen, longer battery life and faster loading. Amazon prime you get free movies, books, and games.

Primary Category: Electronics, Star Rating: 5
Original Reviews: Great case to keep everything in its place! My husband love it!!!! Holds a lot of cds! After discarding and getting rid of broken cd cases, broken cds, and selecting those ones we really like, this binder turned up to be an excellent option to store our favourite cds and dvds and keep them in a small space at our living room, giving us the choice to donate or get rid of those cds towers that took a lot of

In [2]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, BartForConditionalGeneration

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Optionally, delete the model and re-load it
del model
torch.cuda.empty_cache()

# Re-load the model and tokenizer
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on maximum length
def split_into_chunks(text, max_length):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        if current_length + word_length <= max_length:
            current_chunk.append(word)
            current_length += word_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize text by chunking with batch processing
def summarize_text(text, max_chunk_length=1024, summary_max_length=150, summary_min_length=30, batch_size=8):
    chunks = split_into_chunks(text, max_chunk_length)
    
    chunk_summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        # Generate summaries for the batch
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=summary_min_length, 
            length_penalty=2.0, 
            num_beams=4, 
            early_stopping=True
        )
        
        # Decode the summaries
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    # Combine the summaries and summarize them if necessary
    combined_summary = ' '.join(chunk_summaries)
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length, batch_size)
    else:
        final_summary = combined_summary
    
    return final_summary

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

# Display the summary for the selected combination
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Summary: {row['summary']}\n")

# Visualize the reviews and their summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews
    print(f"Summary: {row['summary']}\n")


Using device: cuda
Primary Category: Electronics, Star Rating: 5
Summary: Amazon Tap Bluetooth and wifi wireless speaker works as advertised. Easy set up, tons of music, easy to use. Great range for asking questions.. Very enjoyable and easy for listening to music selections online. The Kindle Voyage 6 is better than the real thing. This is my third Kindle, I still have the other two; however, this has a camera and much larger screen. Bigger screen, longer battery life and faster loading. Amazon prime you get free movies, books, and games.

Primary Category: Electronics, Star Rating: 5
Original Reviews: Great case to keep everything in its place! My husband love it!!!! Holds a lot of cds! After discarding and getting rid of broken cd cases, broken cds, and selecting those ones we really like, this binder turned up to be an excellent option to store our favourite cds and dvds and keep them in a small space at our living room, giving us the choice to donate or get rid of those cds towers

In [1]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on maximum length
def split_into_chunks(text, max_length):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        if current_length + word_length <= max_length:
            current_chunk.append(word)
            current_length += word_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize text by chunking with batch processing
def summarize_text(text, max_chunk_length=1024, summary_max_length=150, summary_min_length=30, batch_size=8):
    chunks = split_into_chunks(text, max_chunk_length)
    
    chunk_summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        # Generate summaries for the batch
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=summary_min_length, 
            length_penalty=2.0, 
            num_beams=4, 
            early_stopping=True
        )
        
        # Decode the summaries
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    # Combine the summaries and summarize them if necessary
    combined_summary = ' '.join(chunk_summaries)
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length, batch_size)
    else:
        final_summary = combined_summary
    
    return final_summary

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

# Display the summary for the selected combination
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Summary: {row['summary']}\n")

# Visualize the reviews and their summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews
    print(f"Summary: {row['summary']}\n")


tokenizer_config.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Using device: cuda


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Token indices sequence length is longer than the specified maximum sequence length for this model (9666 > 1024). Running this sequence through the model will result in indexing errors


Primary Category: Electronics, Star Rating: 5
Summary: Great Product, Great Service, Great Price, Great Product, and Great Service! AmazonBasic Vent and Adjustable Laptop Stand is the Best! Best E-reader on the market, but a little pricey for what you get! Amazon Fire HD 8 is the best tablet I've ever owned! Best bang for your buck at a great price! Amazon Fire HD8 is a must have for anyone who loves Amazon and Amazon Prime! Amazon Fire is the best tablet in the market! Amazon Fire HD8 is the Best tablet for the price and great product for the money! Best bang for your buck tablet I've ever bought! Amazon Fire HD HD 8 Kids Edition is the best! Amazon's Kindle Fire FIre is the Best! Great tablet for kids and adults! Great product for the price and great quality for the money! Great features and fun for all ages and ages Best tablet for the price and great quality! Great for kids and adults at a great price! I love this tablet! I would definitely recommend to a friend. Great starter tabl

In [2]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on maximum length
def split_into_chunks(text, max_length):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        if current_length + word_length <= max_length:
            current_chunk.append(word)
            current_length += word_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize text by chunking with batch processing
def summarize_text(text, max_chunk_length=1024, summary_max_length=150, summary_min_length=30, batch_size=8):
    chunks = split_into_chunks(text, max_chunk_length)
    
    chunk_summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        # Generate summaries for the batch
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=summary_min_length, 
            length_penalty=1.0,  # Adjust length penalty
            num_beams=6,  # Increase number of beams
            no_repeat_ngram_size=3,  # Prevent repetition of phrases
            early_stopping=True
        )
        
        # Decode the summaries
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    # Combine the summaries and summarize them if necessary
    combined_summary = ' '.join(chunk_summaries)
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length, batch_size)
    else:
        final_summary = combined_summary
    
    return final_summary

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

# Display the summary for the selected combination
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Summary: {row['summary']}\n")

# Visualize the reviews and their summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews
    print(f"Summary: {row['summary']}\n")


Using device: cuda


Token indices sequence length is longer than the specified maximum sequence length for this model (9483 > 1024). Running this sequence through the model will result in indexing errors


Primary Category: Electronics, Star Rating: 5
Summary: Great Product, Great Service, Great Price, Great Product, and Great Service! AmazonBasic Vent and Adjustable Laptop Stand is the Best! Best E-reader on the market! Amazon Fire HD 8 is the best tablet I've ever owned! Highly recommend this product! Best tablet for the price and features I have ever used! Great tablet for all ages, great price, great features, great battery life! Amazon Fire HD8 is the best tablet for the price! Great for travel, travel, reading, and a great tablet for a great price! Best bang for your buck tablet I've ever bought! Amazon Fire HD 8.50 is a great tablet at a great price for a great product Best tablet for the price and great kid features! Great for travel, travel, school, music, games, movies, etc. Amazon Fire 7 inch tablet is a great product and works great and no issues This Amazon Fire is better then I expected. I love my new Fire tablet Best... Great starter tablet for kids and kids love it! Great

In [1]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category)] # & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on maximum length
def split_into_chunks(text, max_length):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        if current_length + word_length <= max_length:
            current_chunk.append(word)
            current_length += word_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize text by chunking with batch processing
def summarize_text(text, max_chunk_length=1024, summary_max_length=150, summary_min_length=30, batch_size=8):
    chunks = split_into_chunks(text, max_chunk_length)
    
    chunk_summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        # Generate summaries for the batch
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=summary_min_length, 
            length_penalty=1.0,  # Adjust length penalty
            num_beams=6,  # Increase number of beams
            no_repeat_ngram_size=3,  # Prevent repetition of phrases
            early_stopping=True
        )
        
        # Decode the summaries
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    # Combine the summaries and summarize them if necessary
    combined_summary = ' '.join(chunk_summaries)
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length, batch_size)
    else:
        final_summary = combined_summary
    
    return final_summary

# Function to remove repetitive phrases
def remove_repetitive_phrases(text):
    sentences = text.split('. ')
    unique_sentences = []
    seen_sentences = set()
    
    for sentence in sentences:
        if sentence not in seen_sentences:
            unique_sentences.append(sentence)
            seen_sentences.add(sentence)
    
    return '. '.join(unique_sentences)

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: remove_repetitive_phrases(summarize_text(x)))

# Display the summary and check the lengths
for idx, row in grouped_reviews.iterrows():
    original_length = len(row['reviews.text'])
    summary_length = len(row['summary'])
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Length: {original_length} characters")
    print(f"Summary Length: {summary_length} characters")
    print(f"Summary: {row['summary']}\n")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews


Using device: cuda


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Token indices sequence length is longer than the specified maximum sequence length for this model (4506 > 1024). Running this sequence through the model will result in indexing errors


Primary Category: Electronics, Star Rating: 1
Summary: Waste of money and time and effort to get it to work properly. Unable to connect WIFI on this device. Kindle Voyage stopped working after 2 uses. Buy the Samsung Galaxy S9+ or you'll be stuck with a $200 paperweight. Not worth the money, quit after a month of regular use. JUNK. DO NOT BUY THIS ITEM. NOT WORTH IT. You get what you pay for, and it's not worth the money. Save yourself the aggravation and go with an alternative. JUNK JUNK, DO NOT BUY THIS ITEM. I would highly recommend skipping this tablet and getting one that is not tied to any one... This is not like other android tablets in a bad way. Dont have option for password ask you before buying apps once you... Waste of money and time. No support from Best Buy. No warranty. Do not buy from this company. Don’t waste your money. Worst tablet I have ever purchased from Best Buy ever. I would highly recommend skipping this tablet and getting one that is not tied to any... Worst 

Primary Category: Electronics, Star Rating: 1
Original Length: 51125 characters
Summary Length: 1513 characters
Summary: Waste of money and time and effort to get it to work properly. Unable to connect WIFI on this device. Kindle Voyage stopped working after 2 uses. Buy the Samsung Galaxy S9+ or you'll be stuck with a $200 paperweight. Not worth the money, quit after a month of regular use. JUNK. DO NOT BUY THIS ITEM. NOT WORTH IT. You get what you pay for, and it's not worth the money. Save yourself the aggravation and go with an alternative. JUNK JUNK, DO NOT BUY THIS ITEM. I would highly recommend skipping this tablet and getting one that is not tied to any one... This is not like other android tablets in a bad way. Dont have option for password ask you before buying apps once you... Waste of money and time. No support from Best Buy. No warranty. Do not buy from this company. Don’t waste your money. Worst tablet I have ever purchased from Best Buy ever. I would highly recommend skip

In [6]:
# Get the current working directory
current_dir = Path.cwd()

# Construct the full path
data_path = (current_dir / '..' / 'data' / 'data.csv').resolve()

In [9]:
# Save the summarized reviews to a CSV file
output_path = current_dir / 'summarized_reviews_test.csv'
grouped_reviews.to_csv(output_path, index=False)
print(f"Summarized reviews saved to {output_path}")

Summarized reviews saved to C:\Users\Pedro\Documents\Ironhack\SixthWeek\Project_2\project-2-nlp-business-case-automated-customers-reviews\solution\transformers\summarized_reviews_test.csv


In [12]:
for idx, row in grouped_reviews.iterrows():

    print(f"Summary: {row['summary']}\n")

Summary: Great Product, Great Service, Great Price, Great Product, and Great Service! AmazonBasic Vent and Adjustable Laptop Stand is the Best! Best E-reader on the market! Amazon Fire HD 8 is the best tablet I've ever owned! Highly recommend this product! Best tablet for the price and features I have ever used! Great tablet for all ages, great price, great features, great battery life! Amazon Fire HD8 is the best tablet for the price! Great for travel, travel, reading, and a great tablet for a great price! Best bang for your buck tablet I've ever bought! Amazon Fire HD 8.50 is a great tablet at a great price for a great product Best tablet for the price and great kid features! Great for travel, travel, school, music, games, movies, etc. Amazon Fire 7 inch tablet is a great product and works great and no issues This Amazon Fire is better then I expected. I love my new Fire tablet Best... Great starter tablet for kids and kids love it! Great price, great selection of apps, great protect

In [14]:
text_count = "Summary: Great Product, Great Service, Great Price, Great Product, and Great Service! AmazonBasic Vent and Adjustable Laptop Stand is the Best! Best E-reader on the market! Amazon Fire HD 8 is the best tablet I've ever owned! Highly recommend this product! Best tablet for the price and features I have ever used! Great tablet for all ages, great price, great features, great battery life! Amazon Fire HD8 is the best tablet for the price! Great for travel, travel, reading, and a great tablet for a great price! Best bang for your buck tablet I've ever bought! Amazon Fire HD 8.50 is a great tablet at a great price for a great product Best tablet for the price and great kid features! Great for travel, travel, school, music, games, movies, etc. Amazon Fire 7 inch tablet is a great product and works great and no issues This Amazon Fire is better then I expected. I love my new Fire tablet Best... Great starter tablet for kids and kids love it! Great price, great selection of apps, great protection, easy to set up and easy to use! Best tablet for kids and kids who like to play with it! Highly recommended to anyone who is in the market for a new tablet! Best bang for your buck tablet for the buck at a great price! Amazon Fire HD8 is the best tablet I've ever owned! Highly recommend this tablet for the new kindle! Best Tablet for the price point at a great price. Great tablet for general use and easy..."
print(len(text_count))

1414


## Trying summarize until limit

In [4]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IPython.display import display
import re

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on maximum length
def split_into_chunks(text, max_length):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        if current_length + word_length <= max_length:
            current_chunk.append(word)
            current_length += word_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize text by chunking with batch processing
def summarize_text(text, max_chunk_length=1024, summary_max_length=150, summary_min_length=30, batch_size=8):
    chunks = split_into_chunks(text, max_chunk_length)
    
    chunk_summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        # Generate summaries for the batch
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=summary_min_length, 
            length_penalty=1.0,  # Adjust length penalty
            num_beams=6,  # Increase number of beams
            no_repeat_ngram_size=3,  # Prevent repetition of phrases
            early_stopping=True
        )
        
        # Decode the summaries
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    # Combine the summaries and summarize them if necessary
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        print(f"Combined summary length before recursion: {len(combined_summary)}, Token length: {len(tokenizer.encode(combined_summary))}")
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length, batch_size)
        print(f"Recursive summary length: {len(final_summary)}, Token length: {len(tokenizer.encode(final_summary))}")
    else:
        final_summary = combined_summary
        print(f"Final summary length: {len(final_summary)}, Token length: {len(tokenizer.encode(final_summary))}")

    return final_summary

# Function to remove repetitive phrases
def remove_repetitive_phrases(text):
    sentences = re.split(r'\. |, ', text) # Split commas too, testing this...
    unique_sentences = []
    seen_sentences = set()
    
    for sentence in sentences:
        if sentence not in seen_sentences:
            unique_sentences.append(sentence)
            seen_sentences.add(sentence)
    
    return '. '.join(unique_sentences)

# Apply summarization to the selected group of reviews
def summarize_until_limit(text, max_length=1024, max_chunk_length=1024, summary_max_length=150, summary_min_length=30, batch_size=8):
    summarization_count = 0
    summarization_count += 1
    print (f"Summarization {summarization_count}")
    summary = summarize_text(text, max_chunk_length, summary_max_length, summary_min_length, batch_size)
    while len(summary) > max_length:
        summarization_count += 1
        print (f"Summarization {summarization_count}")
        summary = summarize_text(summary, max_chunk_length, summary_max_length, summary_min_length, batch_size)
    return summary

grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: remove_repetitive_phrases(summarize_until_limit(x)))

# Set pandas display options to show more characters
pd.set_option('display.max_colwidth', None)

# Display the summary and check the lengths
for idx, row in grouped_reviews.iterrows():
    original_length = len(row['reviews.text'])
    summary_length = len(row['summary'])
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Length: {original_length} characters")
    print(f"Summary Length: {summary_length} characters")
    display(row[['primaryCategories', 'reviews.rating', 'summary']])
    print("\n")


Using device: cuda
Summarization 1


  attn_output = torch.nn.functional.scaled_dot_product_attention(


NameError: name 'combined_summary' is not defined

## Trying depth for recursiveness, next step is adding dynamic summary_max_length 

In [None]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category)] # & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on maximum length
def split_into_chunks(text, max_length):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        if current_length + word_length <= max_length:
            current_chunk.append(word)
            current_length += word_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize text by chunking with batch processing
def summarize_text(text, max_chunk_length=1024, summary_max_length=150, summary_min_length=30, batch_size=8, recursion_depth=0, max_recursion_depth=5):
    chunks = split_into_chunks(text, max_chunk_length)
    
    chunk_summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        # Generate summaries for the batch
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=summary_min_length, 
            length_penalty=1.0,  # Adjust length penalty
            num_beams=6,  # Increase number of beams
            no_repeat_ngram_size=3,  # Prevent repetition of phrases
            early_stopping=True
        )
        
        # Decode the summaries
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    combined_summary = ' '.join(chunk_summaries)
    
    if len(tokenizer.encode(combined_summary)) > max_chunk_length and recursion_depth < max_recursion_depth:
        print(f"Recursing at depth {recursion_depth} with length {len(tokenizer.encode(combined_summary))}")
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length, batch_size, recursion_depth + 1, max_recursion_depth)
    else:
        final_summary = combined_summary
        print(f"Final summary length at depth {recursion_depth}: {len(tokenizer.encode(combined_summary))}")

    return final_summary

# Function to remove repetitive phrases
def remove_repetitive_phrases(text):
    sentences = text.split('. ')
    unique_sentences = []
    seen_sentences = set()
    
    for sentence in sentences:
        if sentence not in seen_sentences:
            unique_sentences.append(sentence)
            seen_sentences.add(sentence)
    
    return '. '.join(unique_sentences)

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: remove_repetitive_phrases(summarize_text(x)))

# Display the summary and check the lengths
for idx, row in grouped_reviews.iterrows():
    original_length = len(row['reviews.text'])
    summary_length = len(row['summary'])
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Length: {original_length} characters")
    print(f"Summary Length: {summary_length} characters")
    print(f"Summary: {row['summary']}\n")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews


## Trying dynamic summary max length!

In [1]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IPython.display import display

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Updated function to split text into chunks based on maximum length
def split_into_chunks(text, max_chunk_length):
    tokens = tokenizer.encode(text)
    num_chunks = (len(tokens) + max_chunk_length - 1) // max_chunk_length
    chunk_size = (len(tokens) + num_chunks - 1) // num_chunks
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

# Adjusted function to dynamically calculate summary length
def calculate_summary_length(num_chunks, target_length=1024, initial_summary_length=150):
    if num_chunks > 1:
        return max(target_length // num_chunks, 50)  # Ensure a minimum summary length
    else:
        return initial_summary_length

# Function to summarize text by chunking with batch processing
def summarize_text(text, max_chunk_length=1024, target_summary_length=1024, initial_summary_length=150, batch_size=8):
    chunks = split_into_chunks(text, max_chunk_length)
    summary_max_length = calculate_summary_length(len(chunks), target_summary_length, initial_summary_length)
    
    chunk_summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        # Generate summaries for the batch
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=10,  # Adjusted min_length
            length_penalty=1.0,  # Adjust length penalty
            num_beams=4,  # Adjust number of beams
            no_repeat_ngram_size=3,  # Prevent repetition of phrases
            early_stopping=True
        )
        
        # Decode the summaries
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    # Combine the summaries
    combined_summary = ' '.join(chunk_summaries)
    return combined_summary

# Function to remove repetitive phrases
def remove_repetitive_phrases(text):
    sentences = text.split('. ')
    unique_sentences = []
    seen_sentences = set()
    
    for sentence in sentences:
        if sentence not in seen_sentences:
            unique_sentences.append(sentence)
            seen_sentences.add(sentence)
    
    return '. '.join(unique_sentences)

# Applied summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: remove_repetitive_phrases(summarize_text(x)))

# Set pandas display options to show more characters
pd.set_option('display.max_colwidth', None)

# Display the summary and check the lengths
for idx, row in grouped_reviews.iterrows():
    original_length = len(row['reviews.text'])
    summary_length = len(row['summary'])
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Length: {original_length} characters")
    print(f"Summary Length: {summary_length} characters")
    display(row[['primaryCategories', 'reviews.rating', 'summary']])
    print("\n")

print(f"Using device: {device}")



Using device: cuda


Token indices sequence length is longer than the specified maximum sequence length for this model (309264 > 1024). Running this sequence through the model will result in indexing errors
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Primary Category: Electronics, Star Rating: 5
Original Length: 1390885 characters
Summary Length: 10356 characters


primaryCategories                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       



Using device: cuda


In [6]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IPython.display import display

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Updated function to split text into chunks based on maximum length
def split_into_chunks(text, max_chunk_length):
    tokens = tokenizer.encode(text)
    num_chunks = (len(tokens) + max_chunk_length - 1) // max_chunk_length
    chunk_size = (len(tokens) + num_chunks - 1) // num_chunks
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)] # NOTE: Try to change this into the word adder from previous iterations
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

# Adjusted function to dynamically calculate summary length
def calculate_summary_length(num_chunks, target_length=1024, initial_summary_length=150):
    print ("Number of chunks", num_chunks)
    if num_chunks > 2:
        return max(target_length // num_chunks, 150)  # Ensure a minimum summary length
    elif num_chunks == 2:
        return max(target_length // 4)
    else:
        return initial_summary_length

# Function to summarize text by chunking with batch processing
def summarize_text(text, max_chunk_length=1000, target_summary_length=1024, initial_summary_length=150, batch_size=8):
    chunks = split_into_chunks(text, max_chunk_length)
    summary_max_length = calculate_summary_length(len(chunks), target_summary_length, initial_summary_length)
    print("Summary max length: ", summary_max_length)
    chunk_summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        # Generate summaries for the batch
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=summary_max_length//2,  # Adjusted min_length
            length_penalty=2.0,  # Adjust length penalty to encourage longer summaries
            num_beams=4,  # Adjust number of beams
            no_repeat_ngram_size=3,  # Prevent repetition of phrases
            early_stopping=True
        )
        
        # Decode the summaries
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    # Combine the summaries
    combined_summary = ' '.join(chunk_summaries)
    
    # Remove repetitive phrases
    combined_summary = remove_repetitive_phrases(combined_summary)
    
    return combined_summary

# Function to remove repetitive phrases
def remove_repetitive_phrases(text):
    sentences = text.split('. ')
    unique_sentences = []
    seen_sentences = set()
    
    for sentence in sentences:
        if sentence not in seen_sentences:
            unique_sentences.append(sentence)
            seen_sentences.add(sentence)
    
    return '. '.join(unique_sentences)

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

# Set pandas display options to show more characters
pd.set_option('display.max_colwidth', None)

# Display the summary and check the lengths
for idx, row in grouped_reviews.iterrows():
    original_length = len(row['reviews.text'])
    summary_length = len(row['summary'])
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Length: {original_length} characters")
    print(f"Summary Length: {summary_length} characters")
    display(row[['primaryCategories', 'reviews.rating', 'summary']])
    print("\n")

print(f"Using device: {device}")


Using device: cuda
Using device: cuda


Token indices sequence length is longer than the specified maximum sequence length for this model (309264 > 1024). Running this sequence through the model will result in indexing errors


Primary Category: Electronics, Star Rating: 5
Original Length: 1390885 characters
Summary Length: 90158 characters


primaryCategories                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       



Using device: cuda


## Adding Recursivity, still not fixed maximum sequence length

In [10]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IPython.display import display

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Updated function to split text into chunks based on maximum length
def split_into_chunks(text, max_chunk_length):
    tokens = tokenizer.encode(text)
    num_chunks = (len(tokens) + max_chunk_length - 1) // max_chunk_length
    chunk_size = (len(tokens) + num_chunks - 1) // num_chunks
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

# Adjusted function to dynamically calculate summary length
def calculate_summary_length(num_chunks, target_length=1024, initial_summary_length=150):
    print("Number of chunks:", num_chunks)
    if num_chunks > 2:
        return max(target_length // num_chunks, 150)  # Ensure a minimum summary length
    elif num_chunks == 2:
        return target_length // 2
    else:
        return initial_summary_length

# Function to summarize text by chunking with batch processing and recursion
def summarize_text(text, max_chunk_length=1000, target_summary_length=1024, initial_summary_length=150, batch_size=8):
    chunks = split_into_chunks(text, max_chunk_length)
    summary_max_length = calculate_summary_length(len(chunks), target_summary_length, initial_summary_length)
    print("Summary max length:", summary_max_length)
    chunk_summaries = []

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=summary_max_length // 2,
            length_penalty=1.0,
            num_beams=6,
            no_repeat_ngram_size=3,
            early_stopping=True
        )
        
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    combined_summary = ' '.join(chunk_summaries)
    combined_summary = remove_repetitive_phrases(combined_summary)

    # Recursively summarize the combined summary if it still exceeds max_chunk_length
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        return summarize_text(combined_summary, max_chunk_length, target_summary_length, initial_summary_length, batch_size)
    else:
        return combined_summary

# Function to remove repetitive phrases
def remove_repetitive_phrases(text):
    # Split text based on multiple sentence delimiters
    import re
    sentences = re.split(r'(?<=[.!?]) +', text)
    unique_sentences = []
    seen_sentences = set()
    
    for sentence in sentences:
        # Clean up and remove extra spaces or dots
        sentence = sentence.strip().strip('.').strip().strip('!').strip('?')
        if sentence not in seen_sentences:
            unique_sentences.append(sentence)
            seen_sentences.add(sentence)
    
    return '. '.join(unique_sentences) + '.'

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

# Set pandas display options to show more characters
pd.set_option('display.max_colwidth', None)

# Display the summary and check the lengths
for idx, row in grouped_reviews.iterrows():
    original_length = len(row['reviews.text'])
    summary_length = len(row['summary'])
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Length: {original_length} characters")
    print(f"Summary Length: {summary_length} characters")
    display(row[['primaryCategories', 'reviews.rating', 'summary']])
    print("\n")

Using device: cuda


Token indices sequence length is longer than the specified maximum sequence length for this model (309264 > 1024). Running this sequence through the model will result in indexing errors


Number of chunks: 310
Summary max length: 150
Number of chunks: 19
Summary max length: 150
Number of chunks: 2
Summary max length: 512
Primary Category: Electronics, Star Rating: 5
Original Length: 1390885 characters
Summary Length: 2204 characters


primaryCategories                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       





## Attempt at word per word addition AND chunk calculation with dynamic max_length

In [None]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IPython.display import display

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Updated function to split text into chunks based on maximum length
def split_into_chunks(text, max_chunk_length):
    words = text.split()
    tokens = tokenizer.encode(text)
    num_chunks = (len(tokens) + max_chunk_length - 1) // max_chunk_length
    chunk_size = (len(tokens) + num_chunks - 1) // num_chunks
    
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        if current_length + word_length <= chunk_size:
            current_chunk.append(word)
            current_length += word_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

# Function to dynamically calculate summary length
def calculate_summary_length(num_chunks, target_length=1024, initial_summary_length=150):
    print("Number of chunks:", num_chunks)
    if num_chunks > 2:
        return max(target_length // num_chunks, 150)  # Ensure a minimum summary length
    elif num_chunks == 2:
        return target_length // 2
    else:
        return initial_summary_length

# Function to summarize text by chunking with batch processing and recursion
def summarize_text(text, max_chunk_length=1000, target_summary_length=1024, initial_summary_length=150, batch_size=8):
    chunks = split_into_chunks(text, max_chunk_length)
    summary_max_length = calculate_summary_length(len(chunks), target_summary_length, initial_summary_length)
    print("Summary max length:", summary_max_length)
    chunk_summaries = []

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=summary_max_length // 2,
            length_penalty=1.0,
            num_beams=6,
            no_repeat_ngram_size=3,
            early_stopping=True
        )
        
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    combined_summary = ' '.join(chunk_summaries)
    print(f"Combined summary length: {len(combined_summary)} characters")
    combined_summary = remove_repetitive_phrases(combined_summary)

    # Recursively summarize the combined summary if it still exceeds max_chunk_length
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        print("Recursively summarizing combined summary...")
        return summarize_text(combined_summary, max_chunk_length, target_summary_length, initial_summary_length, batch_size)
    else:
        return combined_summary

# Function to remove repetitive phrases
def remove_repetitive_phrases(text):
    # Split text based on multiple sentence delimiters
    import re
    sentences = re.split(r'(?<=[.!?]) +', text)
    unique_sentences = []
    seen_sentences = set()
    
    for sentence in sentences:
        # Clean up and remove extra spaces or dots
        sentence = sentence.strip().strip('.').strip().strip('!').strip('?')
        if sentence not in seen_sentences:
            unique_sentences.append(sentence)
            seen_sentences.add(sentence)
    
    return '. '.join(unique_sentences) + '.'

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

# Set pandas display options to show more characters
pd.set_option('display.max_colwidth', None)

# Display the summary and check the lengths
for idx, row in grouped_reviews.iterrows():
    original_length = len(row['reviews.text'])
    summary_length = len(row['summary'])
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Length: {original_length} characters")
    print(f"Summary Length: {summary_length} characters")
    display(row[['primaryCategories', 'reviews.rating', 'summary']])
    print("\n")


Using device: cuda


Token indices sequence length is longer than the specified maximum sequence length for this model (309264 > 1024). Running this sequence through the model will result in indexing errors


Number of chunks: 350
Summary max length: 150


## Going to sleep. I'll leave this running. It's supposed to generate all the summaries and save them in a .csv file

In [None]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IPython.display import display

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# Concatenate all reviews within the selected group
grouped_reviews = filtered_data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Combined function to split text into chunks based on maximum length while respecting word boundaries
def split_into_chunks(text, max_chunk_length):
    words = text.split()
    tokens = tokenizer.encode(text)
    num_chunks = (len(tokens) + max_chunk_length - 1) // max_chunk_length
    chunk_size = (len(tokens) + num_chunks - 1) // num_chunks
    
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        if current_length + word_length <= chunk_size:
            current_chunk.append(word)
            current_length += word_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

# Function to dynamically calculate summary length
def calculate_summary_length(num_chunks, target_length=1024, initial_summary_length=150):
    print("Number of chunks:", num_chunks)
    if num_chunks > 2:
        return max(target_length // num_chunks, 150)  # Ensure a minimum summary length
    elif num_chunks == 2:
        return target_length // 2
    else:
        return initial_summary_length

# Function to summarize text by chunking with batch processing and recursion
def summarize_text(text, max_chunk_length=1000, target_summary_length=1024, initial_summary_length=150, batch_size=8):
    chunks = split_into_chunks(text, max_chunk_length)
    summary_max_length = calculate_summary_length(len(chunks), target_summary_length, initial_summary_length)
    print("Summary max length:", summary_max_length)
    chunk_summaries = []

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=summary_max_length // 2,
            length_penalty=2.0,
            num_beams=4,
            no_repeat_ngram_size=3,
            early_stopping=True
        )
        
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    combined_summary = ' '.join(chunk_summaries)
    print(f"Combined summary length: {len(combined_summary)} characters")
    combined_summary = remove_repetitive_phrases(combined_summary)

    # Recursively summarize the combined summary if it still exceeds max_chunk_length
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        print("Recursively summarizing combined summary...")
        return summarize_text(combined_summary, max_chunk_length, target_summary_length, initial_summary_length, batch_size)
    else:
        return combined_summary

# Function to remove repetitive phrases
def remove_repetitive_phrases(text):
    # Split text based on multiple sentence delimiters
    import re
    sentences = re.split(r'(?<=[.!?]) +', text)
    unique_sentences = []
    seen_sentences = set()
    
    for sentence in sentences:
        # Clean up and remove extra spaces or dots
        sentence = sentence.strip().strip('.').strip().strip('!').strip('?')
        if sentence not in seen_sentences:
            unique_sentences.append(sentence)
            seen_sentences.add(sentence)
    
    return '. '.join(unique_sentences) + '.'

# Apply summarization to the selected group of reviews and save the results
try:
    grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

    # Save to CSV
    output_path = Path('..') / 'data' / 'summarized_reviews_madhouse_complex.csv'
    grouped_reviews.to_csv(output_path, index=False)
    print(f"Summarized reviews saved to {output_path}")

    # Display the summary and check the lengths
    for idx, row in grouped_reviews.iterrows():
        original_length = len(row['reviews.text'])
        summary_length = len(row['summary'])
        print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
        print(f"Original Length: {original_length} characters")
        print(f"Summary Length: {summary_length} characters")
        display(row[['primaryCategories', 'reviews.rating', 'reviews.text', 'summary']])
        print("\n")
except Exception as e:
    print(f"An error occurred while processing or saving the data: {e}")



### Ill leave this running after the other one, too. It's the original one that worked. 

In [None]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)
"""
# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]
"""
# Concatenate all reviews
grouped_reviews = data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mabrouk/amazon-review-summarizer-bart")
model = AutoModelForSeq2SeqLM.from_pretrained("mabrouk/amazon-review-summarizer-bart")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Debugging: Enable CUDA launch blocking
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Clear the GPU cache
torch.cuda.empty_cache()

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on maximum length
def split_into_chunks(text, max_length):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(tokenizer.encode(word, add_special_tokens=False))
        if current_length + word_length <= max_length:
            current_chunk.append(word)
            current_length += word_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize text by chunking with batch processing
def summarize_text(text, max_chunk_length=1024, summary_max_length=150, summary_min_length=30, batch_size=8):
    chunks = split_into_chunks(text, max_chunk_length)
    
    chunk_summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        
        # Generate summaries for the batch
        summary_ids = model.generate(
            inputs['input_ids'], 
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length, 
            min_length=summary_min_length, 
            length_penalty=1.0,  # Adjust length penalty
            num_beams=6,  # Increase number of beams
            no_repeat_ngram_size=3,  # Prevent repetition of phrases
            early_stopping=True
        )
        
        # Decode the summaries
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        chunk_summaries.extend(summaries)
    
    # Combine the summaries and summarize them if necessary
    combined_summary = ' '.join(chunk_summaries)
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length, batch_size)
    else:
        final_summary = combined_summary
    
    return final_summary

# Function to remove repetitive phrases
def remove_repetitive_phrases(text):
    sentences = text.split('. ')
    unique_sentences = []
    seen_sentences = set()
    
    for sentence in sentences:
        if sentence not in seen_sentences:
            unique_sentences.append(sentence)
            seen_sentences.add(sentence)
    
    return '. '.join(unique_sentences)

# Apply summarization to the selected group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: remove_repetitive_phrases(summarize_text(x)))

# Visualize the reviews and their summaries and check the lengths
for idx, row in grouped_reviews.iterrows():
    original_length = len(row['reviews.text'])
    summary_length = len(row['summary'])
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Length: {original_length} characters")
    print(f"Summary Length: {summary_length} characters")
    print(f"Original Reviews: {row['reviews.text'][:1000]}...")  # Print only the first 1000 characters of the original reviews
    print(f"Summary: {row['summary']}\n")