In [4]:
from bs4 import BeautifulSoup
import io
import tiktoken
import os
import openai

In [6]:
def extract_text(filename):
    """
    Read a UTF-8 encoded file and return its text content with HTML markup removed.

    Parameters:
    - filename (str): Path of the file to read.

    Returns:
    - str: The text produced by BeautifulSoup's ``get_text()`` for the file's content.
    """
    # The context manager guarantees the handle is closed even if read() raises
    # (e.g. on a decoding error).
    with io.open(filename, mode="r", encoding="utf-8") as f:
        html_doc = f.read()

    soup = BeautifulSoup(html_doc, 'html.parser')
    return soup.get_text()

In [None]:
def chunking(text, max_token_count, encoder=None):
    """
    Split text into whitespace-delimited chunks whose summed per-word token counts
    do not exceed max_token_count.

    Every word of the input ends up in exactly one chunk. A single word whose own
    token count exceeds max_token_count cannot be split here and is emitted as a
    chunk of its own (which is then over the limit).

    Parameters:
    - text (str): Text to split. Words are separated on any whitespace.
    - max_token_count (int): Token budget per chunk.
    - encoder (optional): Object with an ``encode(str)`` method returning a sized
      sequence of tokens. Defaults to tiktoken's "cl100k_base" encoding.

    Returns:
    - list of str: The chunks, each a single-space-joined run of consecutive words.
    """
    if encoder is None:
        encoder = tiktoken.get_encoding("cl100k_base")

    chunks = []
    current_words = []
    token_count = 0

    for word in text.split():
        # Encode once per word and reuse the count.
        # NOTE(review): counts are per isolated word; the token count of the joined
        # chunk (with spaces) may differ slightly -- treat the limit as approximate.
        word_tokens = len(encoder.encode(word))

        # Flush the running chunk when this word would overflow it. The
        # `current_words` guard avoids emitting an empty chunk when the very
        # first word of a chunk is already over budget.
        if current_words and token_count + word_tokens > max_token_count:
            chunks.append(" ".join(current_words))
            current_words = []
            token_count = 0

        # The word always goes into a chunk; it is never dropped.
        current_words.append(word)
        token_count += word_tokens

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks

In [None]:
def extract_and_chunk_text(directory, max_token_count):
    """
    Walk a directory tree, extract the text of every '.html' file and split it into chunks.

    Parameters:
    - directory (str): Root directory to walk (subdirectories are included).
    - max_token_count (int): Token budget per chunk, forwarded to chunking().

    Returns:
    - Dictionary with file paths as keys and lists of text chunks as values.
    """
    results = {}

    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            # Only files with a lowercase '.html' suffix are processed.
            if not filename.endswith('.html'):
                continue
            file_path = os.path.join(dirpath, filename)
            text = extract_text(file_path)
            # Use the function's own parameter; the previous name `max_tokens`
            # was undefined and raised NameError on the first HTML file.
            results[file_path] = chunking(text, max_token_count)

    return results

In [None]:
def calculate_embeddings_for_dict(chunks_dict):
    """
    Calculate embeddings for a dictionary where each key is a file path and the corresponding value is a list of text chunks.

    The API key is read from the file "apikey.txt" in the current working directory
    and is also exported as the OPENAI_API_KEY environment variable.

    Parameters:
    - chunks_dict (dict): Dictionary with file paths as keys and lists of text chunks as values.

    Returns:
    - Dictionary with file paths as keys and lists of embeddings as values.
      The embeddings are in the same order as the input chunks; a path with no
      chunks maps to an empty list and triggers no API call.
    """

    EMBEDDING_MODEL = "text-embedding-ada-002"
    # NOTE(review): the embeddings endpoint documents a cap of 2048 inputs per
    # request; the previous value of 20000 exceeded it. Confirm against the
    # current API reference.
    BATCH_SIZE = 2048
    results = {}

    # Strip surrounding whitespace so a trailing newline in the key file does not
    # end up in the credential.
    api_key = extract_text("apikey.txt").strip()
    os.environ["OPENAI_API_KEY"] = api_key
    # The key belongs on the client, not on embeddings.create().
    # NOTE(review): this uses the openai>=1.0 client interface (openai.OpenAI,
    # attribute access on responses) -- confirm the installed version.
    client = openai.OpenAI(api_key=api_key)

    for path, chunks in chunks_dict.items():
        embeddings = []
        # Step through the chunk list in BATCH_SIZE slices; this makes exactly
        # ceil(len(chunks) / BATCH_SIZE) requests and never sends an empty batch.
        for batch_number, start in enumerate(range(0, len(chunks), BATCH_SIZE)):
            batch = chunks[start:start + BATCH_SIZE]
            print("Calculating embedding for batch #", batch_number)
            response = client.embeddings.create(
                model=EMBEDDING_MODEL,
                input=batch,
            )

            # Order by the reported index instead of asserting it, so the result
            # lines up with the input chunks even under `python -O`.
            ordered = sorted(response.data, key=lambda item: item.index)
            embeddings.extend(item.embedding for item in ordered)

        results[path] = embeddings

    return results