In [None]:
import pandas as pd
import openai
import os
import json
import time
from dotenv import load_dotenv


In [None]:
# Load environment variables (python-dotenv; expected to provide OPENAI_API_KEY)
load_dotenv()

# Initialize OpenAI API
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load your dataset
df = pd.read_csv("rt_reviews_cleaned_with_names.csv")

# Group reviews by movie: one row per title holding the list of its review texts.
# Rows with a missing review are dropped first: empty CSV cells are read back as
# NaN (a float), and a NaN inside the list would make a later ' '.join(...) over
# the reviews raise TypeError.
grouped_reviews = (
    df.dropna(subset=['cleaned_review'])
    .groupby('movie_title')['cleaned_review']
    .apply(list)
    .reset_index()
)


In [None]:
def count_tokens(text):
    """Approximate the token count of ``text``.

    The estimate is simply the number of whitespace-separated words; no real
    tokenizer is involved.
    """
    words = text.split()
    return len(words)

def summarize_reviews(reviews, model="gpt-4o", max_tokens=400):
    """Summarize one movie's reviews with a single chat-completion request.

    Args:
        reviews: Iterable of review texts for one movie. Entries that are not
            strings (e.g. the float NaN pandas uses for empty CSV cells, or
            None) are skipped instead of crashing the join.
        model: Chat model name passed to the API.
        max_tokens: Upper bound on the length of the generated summary.

    Returns:
        A ``(summary, reviews_length)`` tuple: the stripped summary text and
        the approximate size of the joined reviews as reported by
        ``count_tokens``. Returns ``("", 0)`` without calling the API when
        there is no review text to summarize.

    Raises:
        Any API error other than a rate-limit error propagates to the caller.
        Rate-limit errors are retried indefinitely.

    Note:
        Uses the pre-1.0 ``openai`` interface (``openai.ChatCompletion`` and
        ``openai.error``).
    """
    reviews_text = ' '.join(r for r in reviews if isinstance(r, str))

    # Nothing to summarize: skip the request rather than pay for a summary the
    # model could only make up.
    if not reviews_text.strip():
        return "", 0

    reviews_length = count_tokens(reviews_text)

    # Use OpenAI API to summarize the reviews
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": (
            "Summarize the following movie reviews in no more than 200 words. "
            "First, state what most viewers are saying. "
            "Second, mention what some viewers are saying. "
            # Trailing space keeps this sentence from fusing with the next one.
            "Third, provide the overall sentiment of the reviews. Be concise and avoid repetition. "
            "Finally, provide a rating for each movie on a scale of 0-100:\n\n"
            f"{reviews_text}"
        )}
    ]

    while True:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                max_tokens=max_tokens
            )
            break  # Break out of the loop if the request is successful
        except openai.error.RateLimitError as e:
            # Take the wait time from the Retry-After response header; fall back
            # to 60 seconds when the header is absent or not a plain number.
            headers = getattr(e, "headers", None) or {}
            try:
                retry_after = float(headers.get("Retry-After", 60))
            except (TypeError, ValueError):
                retry_after = 60.0
            retry_after = max(retry_after, 0.0)  # time.sleep rejects negative values
            print(f"Rate limit exceeded. Retrying after {retry_after:g} seconds.")
            time.sleep(retry_after)

    # The message content may be None; treat that as an empty summary.
    content = response.choices[0].message['content']
    summary = (content or "").strip()
    return summary, reviews_length

def manage_rate_limit(current_tokens, max_tokens_per_minute=25000):
    """Pause once the running token count reaches the per-minute budget.

    Args:
        current_tokens: Tokens counted so far by the caller.
        max_tokens_per_minute: Budget that triggers the pause when reached.

    Returns:
        ``current_tokens`` unchanged while it is below the budget; otherwise 0,
        after printing a notice and sleeping for 60 seconds.
    """
    # Guard clause: still below the budget, nothing to do.
    if not current_tokens >= max_tokens_per_minute:
        return current_tokens

    sleep_time = 60  # one full minute, so the caller can restart its count
    print(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.")
    time.sleep(sleep_time)
    return 0

In [None]:
# Maps each movie title to its generated summary
movie_summaries = {}

# Running estimate of tokens spent since the last rate-limit pause
current_tokens = 0

# Walk the grouped frame one movie at a time and summarize its reviews
for index, row in grouped_reviews.iterrows():
    movie_title, reviews = row['movie_title'], row['cleaned_review']

    summary, reviews_length = summarize_reviews(reviews)
    movie_summaries.update({movie_title: summary})
    print(f"Summarized reviews for {movie_title}")

    # Charge this request to the budget (review size plus the 400-token summary
    # allowance) and let manage_rate_limit pause and reset the count if needed.
    current_tokens = manage_rate_limit(current_tokens + reviews_length + 400)

In [None]:
# Save the summaries to a JSON file
output_path = 'movie_summaries_final_2.json'
with open(output_path, 'w') as f:
    json.dump(movie_summaries, f)

# Report the same path that was written so the message cannot drift from the file name
print(f"Summarization complete. Summaries saved to {output_path}")