In [None]:
# Loading the libraries

import pandas as pd
import openai
import numpy as np
import time 
import pickle
import time

In [None]:
# Loading the data

# Document-level corpus; its 'Text' column is dropped so that the only 'Text'
# column after the merge below is the one coming from the per-page corpus.
df = pd.read_pickle(r"corpus_ccd_sumilla_2009_2024.pkl").drop(columns=['Text'])

# Per-page corpus: remove the '.pdf' substring from the IDs so they can be matched
# against the document-level IDs, then keep only pages whose ID appears in both frames.
df_perpage = (
    pd.read_pickle(r"corpus_ccd_per_page.pkl")
    .assign(ID=lambda pages: pages['ID'].str.replace('.pdf', '', regex=False))
    .merge(df, on='ID', how='inner')
)

In [None]:
# Processing the text for the embeddings

# Build the text to embed for every page: summary + page text, normalised.
#
# * Missing values are replaced with '' before concatenating. Without this, a NaN on
#   either side turns the whole concatenation into NaN, and the page would end up
#   being embedded as the literal string "nan" instead of the text that does exist.
# * Asterisks are removed BEFORE whitespace is collapsed, so that removing them
#   cannot leave double spaces ("a * b") or leading/trailing spaces ("* a") behind.
# * '\xa0' (non-breaking space) is listed explicitly next to '\s' so it is
#   normalised regardless of how the regex engine defines '\s'.
df_perpage['Text_for_embeddings'] = (
    (df_perpage['Resumen_GPT'].fillna('') + ' ' + df_perpage['Text'].fillna(''))
    .str.lower()
    .str.replace('*', '', regex=False)
    .str.replace(r'[\s\xa0]+', ' ', regex=True)
    .str.strip()
)

# Sanity check: rows where both the summary and the page text were empty/missing
# produce an empty string here.
# NOTE(review): the OpenAI embeddings endpoint documents that inputs cannot be empty
# strings - decide how such pages should be handled before computing embeddings.
n_empty = int((df_perpage['Text_for_embeddings'] == '').sum())
if n_empty:
    print(f"Warning: {n_empty} page(s) have no text to embed (summary and page text are both empty).")

summaries = df_perpage['Text_for_embeddings'].tolist()  # Texts to embed, one per page (same row order as `metadata`)
metadata = df_perpage[['Número de Resolución', 'Fecha de Resolución', 'Enlace']]   # Columns stored alongside the embeddings

In [None]:
# Define the function that generates embeddings for a list of input texts using the OpenAI API.

def get_embeddings(texts, model="text-embedding-3-small", max_tokens=8192,
                   max_inputs_per_request=2048, rate_limit_wait_time=60,
                   timeout_wait_time=20):
    '''
    Generates embeddings for a list of input texts using the OpenAI API, batching the
    requests and retrying on rate limits and connection errors.

    Parameters:
    -----------
    texts : list of str
        A list of textual data (documents, sentences, etc.) to generate embeddings for.
    model : str, optional
        Name of the embedding model passed to the API (default `text-embedding-3-small`).
    max_tokens : int, optional
        Budget per request. It is measured in whitespace-separated WORDS, which is only
        a rough proxy for model tokens (default 8192).
    max_inputs_per_request : int, optional
        Maximum number of texts sent in a single request (default 2048).
    rate_limit_wait_time : int or float, optional
        Seconds to wait before retrying after a `RateLimitError` (default 60).
    timeout_wait_time : int or float, optional
        Seconds to wait before retrying after an `APIConnectionError` (default 20).

    Returns:
    --------
    np.array
        A NumPy array with one embedding per input text, in the same order as `texts`
        (the batches are processed in order and their results concatenated).
        An empty `texts` list yields an empty array without calling the API.

    Functionality:
    --------------
    1. Groups consecutive texts into batches whose total word count does not exceed
       `max_tokens` and whose size does not exceed `max_inputs_per_request`.
       A single text that is longer than `max_tokens` words is sent alone in its own
       batch; it is never split or truncated here.
    2. Sends each batch to the OpenAI API to get embeddings, using `model`.
    3. Combines the embeddings from all batches into a single NumPy array and returns it.

    Error Handling:
    ---------------
    - RateLimitError: Prints a message, waits `rate_limit_wait_time` seconds and retries.
    - APIConnectionError: Prints a message, waits `timeout_wait_time` seconds and retries.
    - Both retries are unbounded: the same batch is retried until it succeeds.
    - Any other error is re-raised after printing the error message.

    Notes:
    ------
    NOTE(review): this function uses the `openai.Embedding` / `openai.error` interface of
    the pre-1.0 `openai` Python package - confirm the installed version matches.
    NOTE(review): because words are not tokens, a text can still exceed the model's
    per-input token limit even when it is under `max_tokens` words - verify on real data.
    '''

    def split_texts(texts, max_tokens):
        # Greedily pack consecutive texts into batches, preserving their order.
        chunks = []
        current_chunk = []
        current_length = 0
        for text in texts:
            text_length = len(text.split())  # Word count (proxy for tokens)
            batch_is_full = (
                current_length + text_length > max_tokens
                or len(current_chunk) >= max_inputs_per_request
            )
            # Only flush a non-empty batch: when an oversized text arrives while the
            # current batch is still empty, flushing would queue an empty request.
            if current_chunk and batch_is_full:
                chunks.append(current_chunk)
                current_chunk = []
                current_length = 0
            current_chunk.append(text)
            current_length += text_length
        if current_chunk:
            chunks.append(current_chunk)
        return chunks

    def get_embeddings_for_chunk(chunk):
        # Retry the same batch until the API answers or a non-retryable error occurs.
        while True:
            try:
                response = openai.Embedding.create(
                    model=model,
                    input=chunk
                )
                return [data['embedding'] for data in response['data']]
            except openai.error.RateLimitError:
                print(f"Rate limit exceeded. Waiting for {rate_limit_wait_time} seconds...")
                time.sleep(rate_limit_wait_time)
            except openai.error.APIConnectionError:
                print(f"Connection timed out. Waiting for {timeout_wait_time} seconds...")
                time.sleep(timeout_wait_time)
            except Exception as e:
                print(f"An error occurred: {e}")
                raise

    # Split texts into batches and collect the embeddings batch by batch
    embeddings = []
    for chunk in split_texts(texts, max_tokens):
        embeddings.extend(get_embeddings_for_chunk(chunk))

    return np.array(embeddings)

In [None]:
# Destination pickle for the embeddings bundle
embeddings_path = 'corpus_ccd_embeddings_full.pkl'

print("Computing embeddings...")
summary_embeddings = get_embeddings(summaries)

# Store the metadata, the embedded texts, and the embeddings together in one
# pickled dictionary so they can be reloaded as a unit.
with open(embeddings_path, 'wb') as file:
    pickle.dump(
        dict(
            metadata=metadata,
            summaries=summaries,
            summary_embeddings=summary_embeddings,
        ),
        file,
    )
    print("Embeddings, summaries, and metadata saved.")