In [1]:
# install and import these model
! pip install openai
! pip install ratelimit
! pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-macosx_11_0_arm64.whl (906 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.8/906.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [3]:
import re
import openai
import time
import os
import json
import pandas as pd
import tiktoken
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

print("Modules imported successfully.")

Modules imported successfully.


In [4]:
#call this model with your key
from openai import OpenAI
client=OpenAI(
    api_key="***************************"
)

In [5]:
#import the dataset
import nltk
rows = []
for fileid in nltk.corpus.movie_reviews.fileids():
    rows.append({"text": nltk.corpus.movie_reviews.raw(fileid)})
df = pd.DataFrame(rows)
df = df.head(50) #note we subset the original data to use less token
print(len(df))
print(df.head())

50
                                                text
0  plot : two teen couples go to a church party ,...
1  the happy bastard's quick movie review \ndamn ...
2  it is movies like these that make a jaded movi...
3   " quest for camelot " is warner bros . ' firs...
4  synopsis : a mentally unstable man undergoing ...


In [56]:
#Implementing a Simple Memoization Decorator in Python
from functools import wraps
import sys

def memoize(func):
  cache={}

  @wraps(func)
  def wrapper(*args, **kwargs):
    key = str(args) + str(kwargs)

    if key not in cache:
      cache[key] = func(*args, **kwargs)

    return cache[key]

  return wrapper

In [70]:
# Define the movie review topics
topics = {
     "Action_Adventure": ["action", "adventure"],
    "Drama": ["drama"],
    "Comedy": ["comedy", "funny"],
    "Horror_Suspense": ["horror", "suspense"],
    "Animated_Animation": ["animated", "animation"],
    "SciFi_Alien": ["sci fi", "alien"],
    "Romance_Romantic": ["romance", "romantic"],
    "Fantasy": ["fantasy"]
}

In [90]:
# Simplified prompt function for movie reviews
@memoize
def make_prompt(response):
    prompt = f"""
    You are an assistant analyzing movie reviews. Assign percentage weights (probability distribution) to the following movie genres based on their presence in the review. Ensure the probabilities sum to 1.

    ### Movie Genres:

    1. **Action_Adventure**: Includes action-packed scenes, thrill, and adventure.
       - Keywords: {', '.join(topics['Action_Adventure'])}

    2. **Drama**: Emotional storytelling and intense character development.
       - Keywords: {', '.join(topics['Drama'])}

    3. **Comedy**: Humorous content intended to make the audience laugh.
       - Keywords: {', '.join(topics['Comedy'])}

    4. **Horror_Suspense**: Frightening elements and suspenseful moments.
       - Keywords: {', '.join(topics['Horror_Suspense'])}

    5. **Animated_Animation**: Features animated content, often targeted towards a younger audience.
       - Keywords: {', '.join(topics['Animated_Animation'])}

    6. **SciFi_Alien**: Science fiction themes, often involving futuristic elements or alien encounters.
       - Keywords: {', '.join(topics['SciFi_Alien'])}

    7. **Romance_Romantic**: Focuses on love stories and romantic relationships.
       - Keywords: {', '.join(topics['Romance_Romantic'])}

    8. **Fantasy**: Elements of magic, mythical creatures, and otherworldly settings.
       - Keywords: {', '.join(topics['Fantasy'])}

    ### Task:

    Analyze the following movie review and assign percentage weights to each genre based on its presence. Ensure the total equals 1.

    **Review:**
    "{response}"

    ### Output Format (include only this):
    
    Example: "This movie is a thrilling adventure with action-packed scenes and a touch of romance."

    - Action_Adventure: 0.70
    - Drama: 0.00
    - Comedy: 0.00
    - Horror_Suspense: 0.00
    - Animated_Animation: 0.00
    - SciFi_Alien: 0.00
    - Romance_Romantic: 0.30
    - Fantasy: 0.00
    """
    return prompt

In [91]:
instance ="This movie was an exhilarating ride from start to finish. The action scenes were intense, and the adventure kept me on the edge of my seat. There were also some light-hearted moments that added a bit of comedy, which balanced the suspenseful plot. The romantic subplot, though not central, added a nice touch to the overall story. The special effects were fantastic, especially in the scenes involving the alien invasion. It’s a must-watch for anyone who loves action-packed science fiction with a hint of romance and comedy."
instance

'This movie was an exhilarating ride from start to finish. The action scenes were intense, and the adventure kept me on the edge of my seat. There were also some light-hearted moments that added a bit of comedy, which balanced the suspenseful plot. The romantic subplot, though not central, added a nice touch to the overall story. The special effects were fantastic, especially in the scenes involving the alien invasion. It’s a must-watch for anyone who loves action-packed science fiction with a hint of romance and comedy.'

In [92]:
make_prompt(response=instance)

'\n    You are an assistant analyzing movie reviews. Assign percentage weights (probability distribution) to the following movie genres based on their presence in the review. Ensure the probabilities sum to 1.\n\n    ### Movie Genres:\n\n    1. **Action_Adventure**: Includes action-packed scenes, thrill, and adventure.\n       - Keywords: action, adventure\n\n    2. **Drama**: Emotional storytelling and intense character development.\n       - Keywords: drama\n\n    3. **Comedy**: Humorous content intended to make the audience laugh.\n       - Keywords: comedy, funny\n\n    4. **Horror_Suspense**: Frightening elements and suspenseful moments.\n       - Keywords: horror, suspense\n\n    5. **Animated_Animation**: Features animated content, often targeted towards a younger audience.\n       - Keywords: animated, animation\n\n    6. **SciFi_Alien**: Science fiction themes, often involving futuristic elements or alien encounters.\n       - Keywords: sci fi, alien\n\n    7. **Romance_Romant

In [93]:
prompt = make_prompt(response=instance)

In [94]:
prompt_template="You are a helpful, respectful and honest assistant, expert in text analysis of movie reviews. Always answer precisely"


In [95]:
completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": prompt_template},
        {"role": "user", "content": prompt}],
    temperature=0,
    max_tokens=200  
)

print(completion.choices[0].message)

ChatCompletionMessage(content='- Action_Adventure: 0.40\n- Drama: 0.00\n- Comedy: 0.20\n- Horror_Suspense: 0.00\n- Animated_Animation: 0.00\n- SciFi_Alien: 0.30\n- Romance_Romantic: 0.10\n- Fantasy: 0.00', role='assistant', function_call=None, tool_calls=None, refusal=None)


In [77]:
print(completion.choices[0].message.content)

- Action_Adventure: 0.40
- Drama: 0.00
- Comedy: 0.20
- Horror_Suspense: 0.00
- Animated_Animation: 0.00
- SciFi_Alien: 0.30
- Romance_Romantic: 0.10
- Fantasy: 0.00


In [101]:
# Initialize tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")

# Define Functions for API Call
def get_weights(response):
    prompt = make_prompt(response)
    try:
        api_response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": prompt_template},
                {"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=200
        )
        return api_response.choices[0].message.content.strip(), len(tokenizer.encode(prompt)) + 200
    except openai.error.RateLimitError as e:
        print(f"Rate limit error: {e}. Waiting and retrying...")
        time.sleep(60)  # Wait for 60 seconds before retrying
        return get_weights(response)
    except Exception as e:
        print(f"Error: {e}")
        return None, 0

# Function to get the probabilities
def parse_weights(response_text):
    pattern = r"(\w+): (\d\.\d+)"
    matches = re.findall(pattern, response_text)
    weights = {topic: float(weight) for topic, weight in matches if topic in topics}
    
    # Normalize weights to sum to 1
    total_weight = sum(weights.values())
    if total_weight != 0:
        weights = {topic: weight / total_weight for topic, weight in weights.items()}
    else:
        # If total weight is 0, assign equal weight to each topic
        weights = {topic: 1.0 / len(topics) for topic in topics}

    # Ensure all topics are included
    for topic in topics:
        if topic not in weights:
            weights[topic] = 0.0

    return weights


In [113]:
#a system for processing text data in batches while managing rate limits and saving checkpoints
#not needed for this data subset but useful when working with big datasets

# Function to save checkpoint
def save_checkpoint(checkpoint_file, last_processed_index):
    with open(checkpoint_file, 'w') as f:
        json.dump({'last_processed_index': last_processed_index}, f)

# Function to load checkpoint
def load_checkpoint(checkpoint_file):
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, 'r') as f:
            checkpoint = json.load(f)
            return checkpoint.get('last_processed_index', -1)
    return -1

# Function to save intermediate results
def save_intermediate_results(results, file_path):
    results_df = pd.DataFrame(results)
    results_df.to_csv(file_path, index=False)

# Function to load intermediate results
def load_intermediate_results(file_path):
    if os.path.exists(file_path):
        return pd.read_csv(file_path).to_dict('records')
    return []

# Function to process the responses in batches with checkpoint and rate limit handling
def process_responses(data, batch_size=100, rate_limit=219, daily_limit=1905000, checkpoint_file='checkpoint.json', results_file='intermediate_results.csv'):
    while True:
        # Load checkpoint
        last_processed_index = load_checkpoint(checkpoint_file)
        start_index = last_processed_index + 1
        print(f"Starting processing from index: {start_index}")  # Debugging statement

        # Load intermediate results
        results = load_intermediate_results(results_file)
        total_tokens_used = 0
        batch_count = 0

        # Iterate over the data in batches
        for start_index in range(start_index, len(data), batch_size):
            end_index = min(start_index + batch_size, len(data))
            batch_data = data.iloc[start_index:end_index]

            batch_results = []
            print(f"Processing batch from {start_index} to {end_index}")  # Debugging statement
            for index, row in batch_data.iterrows():
                response = row['text']
                weights_text, tokens_used = get_weights(response)
                total_tokens_used += tokens_used
                if weights_text:
                    weights = parse_weights(weights_text)
                    batch_results.append(weights)
                else:
                    batch_results.append({topic: None for topic, _ in weights.items()})

                batch_count += 1
                if batch_count >= rate_limit:
                    print("Reached rate limit. Waiting for 1 minute.")
                    time.sleep(60)
                    batch_count = 0

                time.sleep(1)  # Small delay to avoid hitting rate limits

                if total_tokens_used >= daily_limit:
                    print("Reached daily token limit. Waiting for 1 hour.")
                    save_checkpoint(checkpoint_file, index)  
                    results.extend(batch_results)
                    save_intermediate_results(results, results_file)
                    print(f"Checkpoint saved at index: {index}")  # Debugging statement
                    time.sleep(3600)  # Wait for one hour
                    break  # Breaks out of the inner loop to reload checkpoint and continue
            else:
                # This else is executed if the inner loop does not encounter a break
                results.extend(batch_results)
                save_checkpoint(checkpoint_file, end_index - 1)
                save_intermediate_results(results, results_file)
                print(f"Checkpoint saved at index: {end_index - 1}")  # Debugging statement
                continue  # Continue with the next iteration of the outer loop
            
            break  # Break the outer loop if inner loop breaks due to daily limit

        else:
            # This else is executed if the outer loop does not encounter a break
            break  # Break the while loop if all data is processed

    return pd.DataFrame(results)


In [1]:
def merge_results(data, results):
    results_df = pd.DataFrame(results)
    # Adjust the length of results_df to match data length
    results_df.index = data.index[:len(results_df) 
    # Combine the original data with the filtered results
    return pd.concat([data.iloc[:len(results_df)], results_df], axis=1)


In [121]:
# Process the dataset
results_data = process_responses(df, batch_size=50, checkpoint_file='checkpoint_q1.json', results_file='intermediate_results_q1.csv')
results_data = results_data.iloc[:, :-2]

# Combine the results with the original responses
final_data = merge_results(df, results_data)

# Save to a CSV file with error handling
output_file = './gptprobs_movie_review.xlsx'
try:
    final_data.to_excel(output_file, index=False)
    print(f"Results successfully saved to {output_file}")
except PermissionError:
    print(f"Permission denied: Unable to save file to {output_file}. Please close the file if it is open and try again.")
except Exception as e:
    print(f"An error occurred while saving the file: {e}")
print(final_data.shape[0])
final_data.head(10)
  

Starting processing from index: 2000
An error occurred while saving the file: it was once said that in order to truly enjoy some of todays movies or novels , you must suspend disbelief .  
however , there is a distinct separation between opening your mind to ridiculous situations and believing a bunch of flat lies . 
it came as no surprise to me to learn that huntingburg , indiana ( where the film is set ) , does not have a mcdonald's , a sears store , a statue of a man on a horse , nor even a dam close by . 
even if you can ignore these somewhat white lies ( even though the dam is crucial to hard rains plot ) ; there is still a bunch of things that just dont gel . 
firstly , morgan freeman , possibly the greatest actor alive ( after such films as driving miss daisy , the shawshank redemption and se7en ) , was cast as a shifty goon intent on stealing enough money for a healthy retirement . 
he _should_ have played the town sheriff , an experienced and somewhat intriguing policem

Unnamed: 0,text,Action_Adventure,Drama,Comedy,Horror_Suspense,Animated_Animation,SciFi_Alien,Romance_Romantic,Fantasy
0,"plot : two teen couples go to a church party ,...",0.2,0.3,0.0,0.1,0.0,0.0,0.0,0.4
1,the happy bastard's quick movie review \ndamn ...,0.25,0.1,0.0,0.3,0.0,0.2,0.0,0.15
2,it is movies like these that make a jaded movi...,0.4,0.3,0.1,0.0,0.0,0.0,0.0,0.2
3,""" quest for camelot "" is warner bros . ' firs...",0.2,0.1,0.15,0.0,0.5,0.0,0.05,0.0
4,synopsis : a mentally unstable man undergoing ...,0.0,0.2,0.0,0.5,0.0,0.0,0.3,0.0
5,capsule : in 2176 on the planet mars police ta...,0.4,0.1,0.0,0.3,0.0,0.2,0.0,0.0
6,"so ask yourself what "" 8mm "" ( "" eight millime...",0.1,0.4,0.0,0.3,0.0,0.0,0.2,0.0
7,that's exactly how long the movie felt to me ....,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,call it a road trip for the walking wounded . ...,0.0,0.8,0.0,0.0,0.0,0.0,0.2,0.0
9,plot : a young french boy sees his parents kil...,0.2,0.15,0.05,0.0,0.0,0.0,0.1,0.0


In [120]:
#print the final output
results_data

Unnamed: 0,Action_Adventure,Drama,Comedy,Horror_Suspense,Animated_Animation,SciFi_Alien,Romance_Romantic,Fantasy
0,0.20,0.300,0.00,0.10,0.0,0.0,0.000,0.40
1,0.25,0.100,0.00,0.30,0.0,0.2,0.000,0.15
2,0.40,0.300,0.10,0.00,0.0,0.0,0.000,0.20
3,0.20,0.100,0.15,0.00,0.5,0.0,0.050,0.00
4,0.00,0.200,0.00,0.50,0.0,0.0,0.300,0.00
...,...,...,...,...,...,...,...,...
1995,0.00,0.200,0.50,0.00,0.0,0.1,0.100,0.10
1996,0.00,0.875,0.00,0.00,0.0,0.0,0.125,0.00
1997,0.20,0.500,0.00,0.10,0.0,0.0,0.000,0.20
1998,0.60,0.300,0.05,0.05,0.0,0.0,0.000,0.00
