In [1]:
import pandas as pd
import openai
import os
import json
import time
from dotenv import load_dotenv


In [2]:
# Populate environment variables via python-dotenv
load_dotenv()

# Hand the API key from the environment to the OpenAI client
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Read the review dataset from disk
df = pd.read_csv("rt_reviews_cleaned_with_names.csv")

# Collect each movie's reviews into one list, one row per title
grouped_reviews = (
    df.groupby('movie_title')['cleaned_review']
    .apply(list)
    .reset_index()
)

In [3]:
# Split a list of reviews into batches bounded by a word budget
def split_reviews_into_batches(reviews, max_tokens=3000):
    """Group reviews into consecutive batches that stay within a size budget.

    Size is measured as the number of whitespace-separated words in each
    review (``len(review.split())``), which only approximates model tokens.

    Args:
        reviews: Iterable of review strings, kept in their original order.
        max_tokens: Maximum total word count allowed per batch.

    Returns:
        A list of non-empty lists of reviews. A single review that is larger
        than ``max_tokens`` on its own is placed in a batch by itself.
    """
    batches = []
    current_batch = []
    current_length = 0

    for review in reviews:
        review_length = len(review.split())
        # Flush only when something is already collected; otherwise an
        # oversized first review would push an empty batch into the result.
        if current_batch and current_length + review_length > max_tokens:
            batches.append(current_batch)
            current_batch = []
            current_length = 0
        current_batch.append(review)
        current_length += review_length

    # Keep whatever is left in the final, partially filled batch
    if current_batch:
        batches.append(current_batch)

    return batches
# Build one combined summary from per-batch model summaries
def summarize_reviews(reviews, batch_size=20):
    """Summarize one movie's reviews with the OpenAI chat API, batch by batch.

    The reviews are partitioned with ``split_reviews_into_batches``; each
    batch is sent as a single prompt and the replies are joined with spaces.

    Args:
        reviews: Review strings for one movie.
        batch_size: Accepted but not read anywhere in this function.

    Returns:
        The per-batch summaries joined by single spaces (an empty string
        when there are no batches).
    """
    instructions = (
        "Summarize the following movie reviews in no more than 200 words. "
        "First, state what most viewers are saying. "
        "Second, mention what some viewers are saying. "
        "Finally, provide the overall sentiment of the reviews:\n\n"
    )
    batch_summaries = []

    for chunk in split_reviews_into_batches(reviews):
        # One prompt per batch: the fixed instructions followed by the reviews
        prompt = instructions + ' '.join(chunk)

        completion = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=400,
        )
        batch_summaries.append(completion.choices[0].message['content'].strip())

        # Pause between requests to avoid hitting the rate limit
        time.sleep(1)

    return ' '.join(batch_summaries)


In [4]:
# Summaries keyed by movie title
movie_summaries = {}

# Walk the title and review-list columns in lockstep, one movie per iteration
for movie_title, reviews in zip(grouped_reviews['movie_title'], grouped_reviews['cleaned_review']):
    summary = summarize_reviews(reviews)
    movie_summaries[movie_title] = summary
    print(f"Summarized reviews for {movie_title}")

Summarized reviews for 10 Cloverfield Lane


KeyboardInterrupt: 

In [None]:
# Persist the collected summaries to disk as JSON
with open('movie_summaries.json', 'w') as out_file:
    out_file.write(json.dumps(movie_summaries))

print("Summarization complete. Summaries saved to movie_summaries.json")