In [7]:
import io
import os

import pandas as pd
import tiktoken
from bs4 import BeautifulSoup
from openai import OpenAI

In [8]:
def extract_text(filename):
    """
    Read an HTML file and return its text content.

    Parameters:
    - filename (str): Path of the UTF-8 encoded file to read.

    Returns:
    - The text returned by BeautifulSoup's ``get_text()`` for the parsed document.
    """
    # The context manager closes the handle even if reading/decoding fails.
    with open(filename, mode="r", encoding="utf-8") as f:
        html_doc = f.read()

    soup = BeautifulSoup(html_doc, 'html.parser')
    return soup.get_text()

In [9]:
# Load the API key from a local file and expose it through the environment,
# which is where OpenAI() looks for it when no key is passed explicitly.
# The key is plain text, so it is read directly (not through the HTML parser)
# and stripped: a trailing newline in the file would otherwise become part of
# the credential and produce an invalid Authorization header.
with open("apikey.txt", mode="r", encoding="utf-8") as key_file:
    os.environ["OPENAI_API_KEY"] = key_file.read().strip()
client = OpenAI()

In [10]:
def chunking(text, max_token_count, count_tokens=None):
    """
    Split text into chunks of whole words that fit a token budget.

    Words (whitespace-separated) are packed greedily into the current chunk.
    When adding the next word would push the running total above
    max_token_count, the current chunk is closed and that word starts the
    next chunk. Token counts are summed per word; the joining spaces are
    not counted separately.

    Parameters:
    - text (str): Text to split.
    - max_token_count (int): Maximum summed per-word token count of a chunk.
    - count_tokens (callable, optional): Function returning the token count
      of a single word. Defaults to the length of the word's tiktoken
      "cl100k_base" encoding.

    Returns:
    - List of chunk strings, words joined by single spaces. A single word
      that alone exceeds max_token_count is kept as its own (oversized)
      chunk rather than being dropped.
    """
    if count_tokens is None:
        enc = tiktoken.get_encoding("cl100k_base")

        def count_tokens(word):
            return len(enc.encode(word))

    chunks = []
    current_words = []
    token_count = 0

    for word in text.split():
        # Count once per word and reuse the result for both test and total.
        word_tokens = count_tokens(word)

        # Only flush a non-empty chunk, so an oversized first word does not
        # produce an empty string in the output.
        if current_words and token_count + word_tokens > max_token_count:
            chunks.append(" ".join(current_words))
            current_words = []
            token_count = 0

        # The word is always kept: it either extends the current chunk or
        # becomes the first word of the chunk that was just started.
        current_words.append(word)
        token_count += word_tokens

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks

In [11]:
def extract_and_chunk_text(directory, max_token_count):
    """
    Extract and chunk the text of every .html file under a directory tree.

    Parameters:
    - directory (str): Root directory; it is walked recursively.
    - max_token_count (int): Token budget passed to chunking() for each file.

    Returns:
    - Dictionary with file paths as keys and lists of text chunks as values.
    """
    results = {}

    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith('.html'):
                continue
            file_path = os.path.join(dirpath, filename)
            text = extract_text(file_path)
            # Use this function's own parameter for the token budget.
            results[file_path] = chunking(text, max_token_count)

    return results

In [12]:
def calculate_embeddings_of_chunk_batch(batch_chunk, EMBEDDING_MODEL, batch_index=None):
    """
    Request embeddings for one batch of text chunks.

    Parameters:
    - batch_chunk (list): Text chunks to embed in a single request.
    - EMBEDDING_MODEL (str): Name of the embedding model to use.
    - batch_index (int, optional): Batch number, used only in the progress
      message.

    Returns:
    - List of embeddings, in the same order as batch_chunk. An empty batch
      returns an empty list without contacting the API.
    """
    # Nothing to embed: skip the request instead of sending empty input.
    if not batch_chunk:
        return []

    if batch_index is None:
        print("Calculating embedding for batch of", len(batch_chunk), "chunk(s)")
    else:
        print("Calculating embedding for batch #", batch_index)

    # The module-level client already holds the API key; embeddings.create()
    # takes no api_key argument.
    response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=batch_chunk,
    )

    # The response is an object, not a dict. Sort on the reported index so
    # the output order always matches the input order.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]

In [13]:
def calculate_embeddings_for_dict(chunks_dict, batch_size=2048):
    """
    Calculate embeddings for a dictionary where each key is a file path and the corresponding value is a list of text chunks.

    Parameters:
    - chunks_dict (dict): Dictionary with file paths as keys and lists of text chunks as values.
    - batch_size (int, optional): Maximum number of chunks sent per request.
      The default of 2048 follows the documented cap on the number of inputs
      in one embeddings request.

    Returns:
    - Dictionary with file paths as keys and lists of embeddings as values.
      A file with no chunks maps to an empty list.
    """

    EMBEDDING_MODEL = "text-embedding-ada-002"
    results = {}

    for path, chunks in chunks_dict.items():
        embeddings = []
        # Step through the chunk list itself, so the number of requests
        # depends on how many chunks there are, not on the batch size.
        for start in range(0, len(chunks), batch_size):
            batch = chunks[start:start + batch_size]
            embeddings.extend(calculate_embeddings_of_chunk_batch(batch, EMBEDDING_MODEL))

        results[path] = embeddings

    return results

In [14]:
def get_embeddings_for_chunks(chunks, batch_size=2048):
    """
    Calculate embeddings for a list of text chunks.

    Parameters:
    - chunks (list): List of text chunks.
    - batch_size (int, optional): Maximum number of chunks sent per request.
      The default of 2048 follows the documented cap on the number of inputs
      in one embeddings request.

    Returns:
    - List of embeddings for each chunk, in the same order as chunks.
    """

    EMBEDDING_MODEL = "text-embedding-ada-002"
    results = []

    # Batch over the list of chunks; each batch is a sub-list of whole
    # chunks, never a slice of characters out of a single chunk.
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        results.extend(calculate_embeddings_of_chunk_batch(batch, EMBEDDING_MODEL))

    return results

In [15]:
def process_directory_and_get_embeddings(directory, max_tokens=4096):
    """
    Chunk and embed every .html file under a directory tree.

    Parameters:
    - directory (str): Root directory; it is walked recursively.
    - max_tokens (int): Token budget passed to chunking() for each file.

    Returns:
    - pandas DataFrame with one row per chunk and the columns 'file_path',
      'text' and 'embeddings'. If no chunks are found, an empty DataFrame
      with those columns is returned and no embeddings are requested.
    """
    columns = ['file_path', 'text', 'embeddings']
    chunk_paths = []
    all_chunks = []

    for dirpath, _dirnames, filenames in os.walk(directory, topdown=True):
        for filename in filenames:
            if not filename.endswith(".html"):
                continue
            # os.walk yields bare file names; join with the directory so
            # files in sub-directories can actually be opened.
            file_path = os.path.join(dirpath, filename)
            chunks = chunking(extract_text(file_path), max_tokens)
            all_chunks.extend(chunks)
            # Repeat the path once per chunk so every column has the same
            # length and each row records which file its text came from.
            chunk_paths.extend([file_path] * len(chunks))

    if not all_chunks:
        return pd.DataFrame(columns=columns)

    embeddings = get_embeddings_for_chunks(all_chunks)

    return pd.DataFrame({'file_path': chunk_paths, 'text': all_chunks, 'embeddings': embeddings})