In [1]:
import pandas as pd
import openai
import os
import json
import time
from dotenv import load_dotenv


In [2]:
# Load environment variables
load_dotenv()

# Initialize OpenAI API
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load your dataset
df = pd.read_csv("rt_reviews_with_names_converted_newreviewscore.csv")

# Convert the movie_title and cleaned_review to string
df['movie_title'] = df['movie_title'].astype(str)
df['review_content'] = df['review_content'].astype(str)

# Drop movies with less than x reviews
df = df.groupby('movie_title').filter(lambda x: len(x) >= 370)

# print how many movies are left
print(len(df['movie_title'].unique()))

# Group reviews by movie
grouped_reviews = df.groupby('movie_title')['review_content'].apply(list).reset_index()


214


In [3]:
def count_tokens(text):
    # Simple token count approximation
    return len(text.split())

def summarize_reviews(reviews):
    reviews_text = ' '.join(reviews)
    reviews_length = count_tokens(reviews_text)

    # Use OpenAI API to summarize the reviews
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": (
            "Summarize the following movie reviews in no more than 200 words. "
            "First, state what most viewers are saying. "
            "Second, mention what some viewers are saying. "
            "Third, provide the overall sentiment of the reviews. Be concise and avoid repetition."
            "Finally, provide a rating for each movie on a scale of 0-100:\n\n"
            f"{reviews_text}"
        )}
    ]

    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=messages,
                max_tokens=400
            )
            break  # Break out of the loop if the request is successful
        except openai.error.RateLimitError as e:
            # Extract the retry-after time from the error message and sleep for that duration
            retry_after = e.headers.get("Retry-After", 60)
            print(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
            time.sleep(float(retry_after))

    summary = response.choices[0].message['content'].strip()
    return summary, reviews_length

def manage_rate_limit(current_tokens, max_tokens_per_minute=25000):
    if current_tokens >= max_tokens_per_minute:
        sleep_time = 60  # Sleep for 60 seconds to reset the minute window
        print(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.")
        time.sleep(sleep_time)
        current_tokens = 0  # Reset token counter after sleeping
    return current_tokens

In [4]:
# Create a dictionary to store summaries
movie_summaries = {}

# Token counter
current_tokens = 0

# Iterate over each movie and summarize the reviews
for index, row in grouped_reviews.iterrows():
    movie_title = row['movie_title']
    reviews = row['review_content']
    
    summary, reviews_length = summarize_reviews(reviews)
    movie_summaries[movie_title] = summary
    print(f"Summarized reviews for {movie_title}")

    # Update the token counter and manage rate limits
    current_tokens += reviews_length + 400  # Add the length of the reviews and the max_tokens used for the summary
    current_tokens = manage_rate_limit(current_tokens)

Summarized reviews for 1917
Summarized reviews for A Prairie Home Companion
Rate limit exceeded. Retrying after 8 seconds.
Summarized reviews for A Quiet Place
Rate limit exceeded. Sleeping for 60 seconds.
Summarized reviews for A Scanner Darkly
Summarized reviews for A Star Is Born
Rate limit exceeded. Retrying after 12 seconds.
Summarized reviews for Ad Astra
Rate limit exceeded. Sleeping for 60 seconds.
Summarized reviews for Aladdin
Summarized reviews for Alien: Covenant
Rate limit exceeded. Retrying after 8 seconds.
Summarized reviews for Ant-Man and the Wasp
Rate limit exceeded. Sleeping for 60 seconds.
Summarized reviews for Arrival
Summarized reviews for Avengers: Endgame
Rate limit exceeded. Retrying after 19 seconds.
Summarized reviews for Avengers: Infinity War
Rate limit exceeded. Sleeping for 60 seconds.
Summarized reviews for Baby Driver
Summarized reviews for Batman v Superman: Dawn of Justice
Rate limit exceeded. Retrying after 13 seconds.
Summarized reviews for Beauty 

APIError: The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID req_3b6eaaae5da1585a5f02e17cfeea1517 in your email.) {
  "error": {
    "message": "The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID req_3b6eaaae5da1585a5f02e17cfeea1517 in your email.)",
    "type": "server_error",
    "param": null,
    "code": null
  }
} 500 {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID req_3b6eaaae5da1585a5f02e17cfeea1517 in your email.)', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Wed, 22 May 2024 02:27:45 GMT', 'Content-Type': 'application/json', 'Content-Length': '369', 'Connection': 'keep-alive', 'openai-processing-ms': '3847', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-limit-requests': '500', 'x-ratelimit-limit-tokens': '30000', 'x-ratelimit-remaining-requests': '499', 'x-ratelimit-remaining-tokens': '4863', 'x-ratelimit-reset-requests': '120ms', 'x-ratelimit-reset-tokens': '50.273s', 'x-request-id': 'req_3b6eaaae5da1585a5f02e17cfeea1517', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '88795e559b89530a-SLC', 'alt-svc': 'h3=":443"; ma=86400'}

In [5]:
# Save the summaries to a JSON file
with open('movie_summaries_final_2.json', 'w') as f:
    json.dump(movie_summaries, f)

print("Summarization complete. Summaries saved to movie_summaries.json")

Summarization complete. Summaries saved to movie_summaries.json
