In [2]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoTokenizer, LEDForConditionalGeneration

# Load the dataset
current_dir = Path.cwd()
data_path = (current_dir / '..' / 'data' / 'data.csv').resolve()

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns
data = data[["primaryCategories", "reviews.text", "reviews.rating"]]

# Drop rows with missing values in the specified columns (if any)
data.dropna(subset=['primaryCategories', 'reviews.text', 'reviews.rating'], inplace=True)

# Convert ratings to string to ensure consistent grouping
data['reviews.rating'] = data['reviews.rating'].astype(str)

# Group reviews by primary category and star rating and concatenate all reviews within each group
grouped_reviews = data.groupby(['primaryCategories', 'reviews.rating'])['reviews.text'].apply(lambda x: ' '.join(x)).reset_index()

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
print(device)

# Function to summarize text, handling long texts by chunking
def summarize_text(text, max_chunk_length=4096, summary_max_length=150, summary_min_length=30):
    # Split the text into chunks that fit within the model's maximum input length
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_chunk_length)
    chunks = []
    chunk_size = max_chunk_length
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i + chunk_size]
        if len(chunk) > 0:
            chunks.append(chunk)
    
    # Summarize each chunk
    chunk_summaries = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_chunk_length)
        inputs = inputs.to(device)
        summary_ids = model.generate(inputs['input_ids'], max_length=summary_max_length, min_length=summary_min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        chunk_summaries.append(summary)
    
    # Combine the summaries and summarize them if necessary
    combined_summary = ' '.join(chunk_summaries)
    if len(tokenizer.encode(combined_summary)) > max_chunk_length:
        final_summary = summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length)
    else:
        final_summary = combined_summary
    
    return final_summary

# Apply summarization to each group of reviews
grouped_reviews['summary'] = grouped_reviews['reviews.text'].apply(lambda x: summarize_text(x))

# Display the summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Summary: {row['summary']}\n")

# Visualize the reviews and their summaries
for idx, row in grouped_reviews.iterrows():
    print(f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}")
    print(f"Original Reviews: {row['reviews.text'][:700]}...")  # Print only the first 700 characters of the original reviews
    print(f"Summary: {row['summary']}\n")


FileNotFoundError: The file at C:\Users\Pedro\Documents\Ironhack\SixthWeek\data\data.csv does not exist.

In [1]:
# Save the summarized reviews to a CSV file
output_path = current_dir / 'summarized_reviews.csv'
grouped_reviews.to_csv(output_path, index=False)
print(f"Summarized reviews saved to {output_path}")

NameError: name 'current_dir' is not defined