# Load Dependencies

In [None]:
# Base libraries
import os
import pandas as pd
import numpy as np
import regex as re
import time
from dotenv import load_dotenv

# Classification libraries
from openai import OpenAI, RateLimitError, APIError, Timeout, APIConnectionError

# Retry libraries
from tenacity import retry, stop_after_attempt, wait_random_exponential, retry_if_exception_type
from collections import deque

# Classifier configuration

## GPT configuration

In [None]:
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Fail fast when the key is missing *or* empty: an empty string (e.g. a blank
# `OPENAI4_API_KEY=` line) is not a usable key and would otherwise be handed
# to the client as-is.
api_key = os.getenv("OPENAI4_API_KEY")
if not api_key:
    raise ValueError("OPENAI4_API_KEY not found in environment variables")

# Shared client used by every classification request below.
client = OpenAI(api_key=api_key)

# Classification Function
# The retry decorator must wrap a function that actually *raises*: catching the
# error inside the decorated function hides it from tenacity and disables the
# retry. `reraise=True` makes the last underlying exception surface (instead of
# tenacity's RetryError) once all attempts are used up.
# NOTE(review): in openai>=1.0 `Timeout` looks like a request-timeout config
# type rather than an exception class (timeouts surface as APITimeoutError,
# which is covered by APIConnectionError/APIError) -- confirm and drop if so.
@retry(
    retry=retry_if_exception_type((RateLimitError, APIError, Timeout, APIConnectionError)),
    wait=wait_random_exponential(min=5, max=60),
    stop=stop_after_attempt(6),
    reraise=True,
)
def _request_classification(context, message, model):
    """Send one chat-completion request, retrying on the listed API errors.

    Returns:
        tuple: (text of the first choice, total tokens reported by the API).

    Raises:
        Exception: the last error raised by the client once the retry policy
        gives up, or immediately for errors not covered by the policy.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": context},
            {"role": "user", "content": message},
        ],
        max_tokens=10,
        temperature=0.0,
        top_p=0.9,
    )
    return response.choices[0].message.content, response.usage.total_tokens


def openAI_classificator(context, message, model="gpt-4.1-2025-04-14"):
    """Classify ``message`` with the chat model, using ``context`` as system prompt.

    The request is retried with random exponential backoff (up to 6 attempts)
    on the API errors configured on ``_request_classification``. This function
    itself never raises: failures are reported through the return value.

    Args:
        context: Text sent as the "system" message.
        message: Text sent as the "user" message.
        model: Model identifier passed to the API.

    Returns:
        tuple: ``(answer, total_tokens)`` on success. On failure,
        ``(np.nan, requested_tokens)`` where ``requested_tokens`` is the number
        parsed from a "Requested <n>." fragment of the error text, or 0 when
        the error text has no such fragment.
    """
    try:
        return _request_classification(context, message, model)
    except Exception as e:
        # Best-effort fallback kept from the original behaviour: log the error
        # and hand the caller a NaN answer instead of propagating.
        print(e)
        match = re.search(r"Requested (\d+)\.", str(e))
        requested_tokens = int(match.group(1)) if match else 0
        return np.nan, requested_tokens

## Token limit config

In [None]:
# Tokens-per-minute budget enforced before each request.
# Rate limit tier 2 -> 450k tokens/minute in model gpt-4.1-2025-04-14
MAX_TOKENS_PER_MINUTE = 450_000

# History of requests as (timestamp, tokens) pairs, kept in a deque so that
# entries can be dropped cheaply from the left end.
token_window = deque()

def clean_token_window():
    """Remove entries older than 60 seconds from the left of ``token_window``.

    Entries are (timestamp, tokens) pairs; scanning stops at the first entry
    that is still within the last 60 seconds.
    """
    now = time.time()
    while token_window:
        oldest_timestamp = token_window[0][0]
        age = now - oldest_timestamp
        if age > 60:
            token_window.popleft()
        else:
            break

def wait_if_needed(new_tokens):
    """Block until ``new_tokens`` fits in the per-minute token budget.

    Sums the tokens recorded in ``token_window`` for the last 60 seconds and,
    while adding ``new_tokens`` would exceed ``MAX_TOKENS_PER_MINUTE``, sleeps
    until the oldest recorded request leaves the window, then re-checks.

    If the window is empty there is nothing to wait for, so the function
    returns immediately even when ``new_tokens`` alone exceeds the budget.

    Args:
        new_tokens: Estimated number of tokens of the request about to be sent.
    """
    clean_token_window()
    current_tokens = sum(tokens for _, tokens in token_window)

    # Re-check after every wait: expiring a single entry may not free enough
    # budget. The `token_window` guard avoids indexing an empty deque.
    while token_window and current_tokens + new_tokens > MAX_TOKENS_PER_MINUTE:
        excess = (current_tokens + new_tokens) - MAX_TOKENS_PER_MINUTE
        print(f"[WAIT] amount of tokens exceded ({excess}). Waiting...")

        # Time left until the oldest entry is 60 seconds old. Clamped at zero
        # because time.sleep() rejects negative values.
        time_to_wait = 60 - (time.time() - token_window[0][0])
        time.sleep(max(0.0, time_to_wait))

        # The entry we just waited for has aged out; drop it explicitly so the
        # loop always makes progress, then purge any others that expired too.
        token_window.popleft()
        clean_token_window()
        current_tokens = sum(tokens for _, tokens in token_window)

# Context and subject definition

In [None]:
# Placeholder: background text that is interpolated into the prompt template.
context = "Your context goes here"

In [None]:
# Placeholder: statement or question that each summary is classified against.
subject = "Your subject goes here"

# Classification

## Load file

In [None]:
# Path of the CSV to classify; the classification cell writes results back to
# this same path.
fileV = "your_file.csv"
# Read both label columns as strings rather than letting pandas infer a dtype.
dfV = pd.read_csv(fileV, dtype={"classification": str, "polarity":str})
# Print a summary of the loaded frame.
dfV.info()

## Prompt definition

In [None]:
def video_prompt(context, subject):
    """Build the (Spanish) classification prompt.

    The prompt asks for two labels for a summary, both relative to
    ``subject``: a 1-5 Likert score and a 1-3 polarity score, answered as
    ``"<likert>,<polarity>"`` with no extra text.

    Args:
        context: Background text inserted after "Contexto:".
        subject: Statement or question inserted in double quotes.

    Returns:
        str: The formatted prompt.
    """
    prompt = f""" 
    Contexto: {context}
    
    Instrucción: Clasifica el siguiente resumen en dos escalas, la primera es la de Likert y la segunda es la de polaridad. Para la escala Likert
    ten en cuenta las siguientes opciones: 1:'Completamente en desacuerdo', 2:'En desacuerdo', 3:'Ni de acuerdo ni en desacuerdo', 4:'De acuerdo', 5:'Completamente de acuerdo'.
    Para la escala de polaridad ten en cuenta las siguientes opciones: 1:'Negativo', 2:'Neutro', 3:'Positivo'. En ambos casos el resumen se debe
    clasificar en relación con la siguiente afirmación o pregunta: \"{subject}\".

    Solo responde con una de las etiquetas mencionadas (solo el número) sin ningún texto adicional. Por ejemplo "1,2" o "3,1" o "5,3", donde el primer numero sea
    la escala Likert y el segundo la escala de polaridad.
    """
    return prompt

## Classification

In [None]:
# Not read anywhere in this cell; kept so the name stays defined.
tokens = 0

# The system prompt depends only on `context` and `subject`, so build it once
# instead of on every row.
prompt = video_prompt(context, subject)

# Token budget reserved for each request before it is sent (tune this to the
# real prompt + summary size).
estimated_tokens = 1500

for i, row in dfV.iterrows():
    # Rows that already hold a value (a label, or 'error') are skipped, so the
    # cell can be re-run to resume an interrupted job.
    if pd.notnull(dfV.at[i, 'classification']):
        continue

    if i % 10 == 0:
        print(f"Processing row {i}/{len(dfV)}")

    if i % 20 == 0:
        # Checkpoint: save the dataframe every 20 rows
        dfV.to_csv(fileV, index=False)

    resumen = row['summary_Sumy']

    try:
        # Wait first if this request would exceed the tokens-per-minute budget
        wait_if_needed(estimated_tokens)

        response = openAI_classificator(prompt, resumen)
        used_tokens = response[1]

        # Update the token window before parsing, so the tokens of a request
        # are accounted for even when its answer turns out to be unusable.
        token_window.append((time.time(), used_tokens))

        # openAI_classificator returns NaN as the answer when the request
        # failed; report that explicitly instead of crashing on `.split`.
        if not isinstance(response[0], str):
            raise ValueError("no answer returned by the model")

        # Expected answer format: "<likert>,<polarity>". Unpacking raises
        # ValueError for any other number of fields; strip() tolerates
        # answers such as "4, 3".
        classification, polarity = (part.strip() for part in response[0].split(","))

        # Save the labels
        dfV.at[i, 'classification'] = classification
        dfV.at[i, 'polarity'] = polarity

    except Exception as e:
        # Per-row boundary: one bad row must not abort the whole run.
        print(f"Error en la fila {i}: {e}")
        dfV.at[i, 'classification'] = 'error'
        dfV.at[i, 'polarity'] = 'error'

# Save the dataframe at the end. This has to happen after the loop: the index
# never equals len(dfV) inside it, so an in-loop "last row" check would never
# fire and rows processed after the last checkpoint would be lost.
dfV.to_csv(fileV, index=False)

dfV.info()