In [1]:
import pandas as pd
import openai
import os
import json
import time
from dotenv import load_dotenv


In [2]:
# Load environment variables
load_dotenv()

# Initialize OpenAI API
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load your dataset
df = pd.read_csv("rt_reviews_cleaned_with_names.csv")

# Group reviews by movie
grouped_reviews = df.groupby('movie_title')['cleaned_review'].apply(list).reset_index()

In [3]:
# Get reviews for a specific movie
def split_reviews_into_batches(reviews, max_tokens=3000):
    batches = []
    current_batch = []
    current_length = 0
    
    for review in reviews:
        review_length = len(review.split())
        if current_length + review_length > max_tokens:
            batches.append(current_batch)
            current_batch = []
            current_length = 0
        current_batch.append(review)
        current_length += review_length
    
    if current_batch:
        batches.append(current_batch)
    
    return batches
# Summarize reviews for a specific movie
def summarize_reviews(reviews, batch_size=20):
    summaries = []
    review_batches = split_reviews_into_batches(reviews)

    for batch_reviews in review_batches:
        reviews_text = ' '.join(batch_reviews)

        # Use OpenAI API to summarize the reviews
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": (
                "Summarize the following movie reviews in no more than 200 words. "
                "First, state what most viewers are saying. "
                "Second, mention what some viewers are saying. "
                "Finally, provide the overall sentiment of the reviews:\n\n"
                f"{reviews_text}"
            )}
        ]

        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=messages,
            max_tokens=400
        )
        
        summary = response.choices[0].message['content'].strip()
        summaries.append(summary)

        # Add a delay to avoid hitting the rate limit
        time.sleep(5)

    # Combine summaries of batches
    combined_summary = ' '.join(summaries)
    return combined_summary


In [4]:
# Create a dictionary to store summaries
movie_summaries = {}

# Iterate over each movie and summarize the reviews
for index, row in grouped_reviews.iterrows():
    movie_title = row['movie_title']
    reviews = row['cleaned_review']
    
    summary = summarize_reviews(reviews)
    movie_summaries[movie_title] = summary
    print(f"Summarized reviews for {movie_title}")

Summarized reviews for 10 Cloverfield Lane
Summarized reviews for 12 Years a Slave
Summarized reviews for 127 Hours
Summarized reviews for 13 Hours: The Secret Soldiers Of Benghazi
Summarized reviews for 1917
Summarized reviews for 2012
Summarized reviews for 20th Century Women
Summarized reviews for 21 Jump Street
Summarized reviews for 22 Jump Street
Summarized reviews for 28 Days Later
Summarized reviews for 300
Summarized reviews for 3:10 to Yuma
Summarized reviews for 500 Days of Summer
Summarized reviews for A Fantastic Woman (Una mujer fantástica)
Summarized reviews for A Ghost Story
Summarized reviews for A Good Day To Die Hard
Summarized reviews for A History of Violence
Summarized reviews for A Monster Calls
Summarized reviews for A Most Violent Year
Summarized reviews for A Nightmare on Elm Street
Summarized reviews for A Prairie Home Companion
Summarized reviews for A Quiet Place
Summarized reviews for A Scanner Darkly
Summarized reviews for A Serious Man
Summarized reviews

RateLimitError: Rate limit reached for gpt-4o in organization org-0gOo8Vma2agBJCZchIzlNXuR on tokens per min (TPM): Limit 30000, Used 24440, Requested 5732. Please try again in 344ms. Visit https://platform.openai.com/account/rate-limits to learn more.

In [5]:
# Save the summaries to a JSON file
with open('movie_summaries.json', 'w') as f:
    json.dump(movie_summaries, f)

print("Summarization complete. Summaries saved to movie_summaries.json")

Summarization complete. Summaries saved to movie_summaries.json
