In [1]:
from bs4 import BeautifulSoup
import io
import tiktoken
import os
import pandas as pd
import numpy as np
from openai import OpenAI

# Shared OpenAI client used by the embedding helpers in this notebook.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment (e.g. OPENAI_API_KEY) — confirm for your deployment.
client = OpenAI()

In [2]:
def extract_text(filename):
    """
    Read an HTML file and return its text content.

    Parameters:
    - filename (str): Path of the HTML file; it is decoded as UTF-8.

    Returns:
    - str: Result of BeautifulSoup's ``get_text()`` on the parsed document.
    """
    # The context manager closes the handle even when read() raises
    # (e.g. a UnicodeDecodeError); the manual close() was skipped in that case.
    with open(filename, mode="r", encoding="utf-8") as f:
        html_doc = f.read()

    soup = BeautifulSoup(html_doc, 'html.parser')
    return soup.get_text()

In [3]:
def chunking(text, max_token_count, enc=None):
    """
    Split text into whitespace-joined chunks of at most ``max_token_count`` tokens.

    Parameters:
    - text (str): Text to split; it is tokenised into words with ``str.split()``.
    - max_token_count (int): Token budget per chunk.
    - enc: Optional encoder exposing ``encode(str)``. When omitted, tiktoken's
      ``cl100k_base`` encoding is used.

    Returns:
    - List of chunk strings, in original word order. No word is dropped: a
      single word whose own token count exceeds the budget becomes a chunk by
      itself (and is therefore over budget).

    Note: the budget is tracked as the sum of per-word token counts; the token
    count of the joined chunk string is not re-measured.
    """
    if enc is None:
        enc = tiktoken.get_encoding("cl100k_base")

    chunks = []
    current_words = []
    token_count = 0

    for word in text.split():
        word_tokens = len(enc.encode(word))  # encode each word only once

        # Close the current chunk before it would overflow. The guard on
        # current_words prevents emitting an empty chunk when the very first
        # word is already larger than the budget.
        if current_words and token_count + word_tokens > max_token_count:
            chunks.append(" ".join(current_words))
            current_words = []
            token_count = 0

        # The word that triggered the flush starts the next chunk instead of
        # being discarded.
        current_words.append(word)
        token_count += word_tokens

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks

In [4]:
def extract_and_chunk_text(directory, max_token_count):
    """
    Walk ``directory`` recursively and chunk the text of every ``.html`` file.

    Parameters:
    - directory (str): Root directory passed to ``os.walk``.
    - max_token_count (int): Forwarded to ``chunking`` for each file.

    Returns:
    - Dictionary mapping each HTML file path to the list returned by
      ``chunking`` for that file's extracted text.
    """
    chunks_by_path = {}

    for root, _subdirs, names in os.walk(directory):
        for name in names:
            # Only HTML files are processed; everything else is skipped.
            if not name.endswith('.html'):
                continue
            full_path = os.path.join(root, name)
            chunks_by_path[full_path] = chunking(extract_text(full_path), max_token_count)

    return chunks_by_path

In [5]:
def calculate_embeddings_of_chunk_batch(batch_chunk,EMBEDDING_MODEL):
    """
    Request embeddings for one batch of text chunks.

    Parameters:
    - batch_chunk (list): Text chunks to embed in a single request.
    - EMBEDDING_MODEL (str): Name of the embedding model to use.

    Returns:
    - List of embeddings, ordered to match ``batch_chunk``. An empty batch
      returns an empty list without contacting the API.
    """
    # Nothing to embed: skip the request instead of sending an empty input.
    if not batch_chunk:
        return []

    response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=batch_chunk
    )

    # The client returns an object with attributes (as get_embedding in this
    # notebook already relies on), so use .data / .index / .embedding rather
    # than subscripting. Sorting by index guarantees input order without
    # relying on an assert, which is stripped under `python -O`.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]

In [6]:
def calculate_embeddings_for_dict(chunks_dict, batch_size=2048, embedding_model="text-embedding-ada-002"):
    """
    Calculate embeddings for a dictionary where each key is a file path and the corresponding value is a list of text chunks.

    Parameters:
    - chunks_dict (dict): Dictionary with file paths as keys and lists of text chunks as values.
    - batch_size (int): Maximum number of chunks sent per embedding request.
    - embedding_model (str): Name of the embedding model to use.

    Returns:
    - Dictionary with file paths as keys and lists of embeddings as values.
      A file with no chunks maps to an empty list.

    Raises:
    - ValueError: If ``batch_size`` is smaller than 1.
    """
    # NOTE(review): the default is 2048 because the OpenAI embeddings reference
    # caps the `input` array at 2048 items; per-request token limits may call
    # for a smaller value — confirm against the current API documentation.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    results = {}

    for path, chunks in chunks_dict.items():
        embeddings = []
        # Step through the chunks actually present. The previous
        # `range(BATCH_SIZE)` loop ran BATCH_SIZE iterations per file and
        # sent an empty batch on every iteration past the end of the list.
        for start in range(0, len(chunks), batch_size):
            batch = chunks[start:start + batch_size]
            print("Calculating embedding for batch #", start // batch_size)
            embeddings.extend(calculate_embeddings_of_chunk_batch(batch, embedding_model))

        results[path] = embeddings

    return results

In [7]:
def get_embeddings_for_chunks(chunks, batch_size=2048, embedding_model="text-embedding-ada-002"):
    """
    Calculate embeddings for the chunk lists of several documents.

    Parameters:
    - chunks (list): One entry per document, each entry being a list of text
      chunks (this is what process_directory_and_get_embeddings passes in).
      A bare string entry is treated as a document with a single chunk.
    - batch_size (int): Maximum number of chunks sent per embedding request.
    - embedding_model (str): Name of the embedding model to use.

    Returns:
    - List with one entry per document, each entry being the list of
      embeddings for that document's chunks (empty for an empty document).

    Raises:
    - ValueError: If ``batch_size`` is smaller than 1.
    """
    # NOTE(review): the default is 2048 because the OpenAI embeddings reference
    # caps the `input` array at 2048 items; per-request token limits may call
    # for a smaller value — confirm against the current API documentation.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    results = []

    for document_chunks in chunks:
        # Slicing a str would batch by characters; wrap it so it is embedded
        # as one chunk.
        if isinstance(document_chunks, str):
            document_chunks = [document_chunks]

        embeddings = []
        # Batch over this document's own chunk count. The previous code used
        # len(chunks) — the number of documents — so a document with more
        # chunks than that range covered was only partially embedded.
        for start in range(0, len(document_chunks), batch_size):
            batch = document_chunks[start:start + batch_size]
            print("Calculating embedding for batch #", start // batch_size)
            embeddings.extend(calculate_embeddings_of_chunk_batch(batch, embedding_model))
        results.append(embeddings)

    return results

In [8]:
def process_directory_and_get_embeddings(directory, max_tokens=4096):
    """
    Chunk every HTML file under a directory and embed the chunks.

    Parameters:
    - directory (str): Root directory handed to extract_and_chunk_text.
    - max_tokens (int): Per-chunk token budget handed to extract_and_chunk_text.

    Returns:
    - pandas DataFrame with one row per file and the columns ``file_path``,
      ``text`` (that file's list of chunks) and ``embeddings`` (the value
      get_embeddings_for_chunks returned for that file).
    """
    file_to_chunks = extract_and_chunk_text(directory, max_tokens)

    # Keys and values of the same dict iterate in matching order, so the two
    # lists stay aligned row by row.
    file_paths = list(file_to_chunks.keys())
    all_chunks = list(file_to_chunks.values())

    # The former debug prints dumped every path and every chunk of the corpus
    # to the cell output; they were removed to keep the notebook readable.
    embeddings = get_embeddings_for_chunks(all_chunks)

    return pd.DataFrame({'file_path':file_paths, 'text':all_chunks, 'embeddings': embeddings})

In [None]:
def cosine_similarity(a, b):
    """Return the dot product of ``a`` and ``b`` divided by the product of their norms."""
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

def get_embedding(text, model="text-embedding-ada-002"): 
    """
    Embed a single string.

    The text is sent as a one-element input list and the embedding of the
    first (only) returned item is handed back.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [None]:
def search_similar_strings(df, user_query, limit=100, embedding_column=None):
    """
    Rank the rows of ``df`` by cosine similarity to ``user_query``.

    Parameters:
    - df (DataFrame): Must have a ``text`` column and a column of embeddings.
      A ``similarities`` column is added to (or overwritten on) ``df`` itself.
    - user_query (str): Query text; embedded with "text-embedding-ada-002".
    - limit (int): Maximum number of rows returned.
    - embedding_column (str): Column holding the embeddings. When omitted,
      ``ada_v2`` is used if present, otherwise ``embeddings`` (the column name
      produced by process_directory_and_get_embeddings).

    Returns:
    - Tuple ``(text, similarities)`` of Series for the top ``limit`` rows,
      sorted by descending similarity.

    Raises:
    - KeyError: If no embedding column can be found.

    NOTE(review): each cell of the embedding column is compared directly
    against the query vector, so this presumably expects one embedding vector
    per row — confirm the frame is per-chunk rather than per-file.
    """
    if embedding_column is None:
        # Previously this was hard-wired to `df.ada_v2`, which does not exist
        # on frames built by this notebook's own pipeline.
        for candidate in ("ada_v2", "embeddings"):
            if candidate in df.columns:
                embedding_column = candidate
                break
        else:
            raise KeyError(
                "no embedding column found; expected 'ada_v2' or 'embeddings', "
                "or pass embedding_column explicitly"
            )

    embedding = get_embedding(
        user_query,
        model="text-embedding-ada-002"
    )
    df["similarities"] = df[embedding_column].apply(lambda x: cosine_similarity(x, embedding))

    res = (
        df.sort_values("similarities", ascending=False)
        .head(limit)
    )
    return res["text"], res["similarities"]

In [None]:
# Default chat model; its tokenizer is the default used for token counting
GPT_MODEL = "gpt-3.5-turbo"

def num_tokens(text, model=GPT_MODEL):
    """Return the number of tokens ``text`` encodes to under ``model``'s tiktoken encoding."""
    return len(tiktoken.encoding_for_model(model).encode(text))

In [None]:
def query_message(query, df, model, token_budget):
    """
    Build the prompt for a question, filled with the most similar documentation.

    Parameters:
    - query (str): The user's question.
    - df (DataFrame): Frame searched by search_similar_strings.
    - model (str): Model name whose tokenizer is used (via num_tokens) to
      measure the prompt.
    - token_budget (int): Maximum number of tokens the returned message may
      reach through added documentation.

    Returns:
    - str: Intro, question, then a "Documentation:" section holding as many of
      the top-ranked texts (best first, one per line) as fit in the budget.
      The intro and question are always included, even if they alone exceed
      the budget.
    """
    # token_budget is a token count, not a row count, so it is no longer
    # forwarded as the search `limit`; the search default applies instead.
    strings, _relatednesses = search_similar_strings(df, query)

    intro = "Using the provided documentation on LangChain, answer the following question (if the answer cannot be found in the documents, write 'I could not find an answer.').\n"
    question = f"Question: {query}\n"
    message = intro + question + "Documentation:"

    # Append the actual texts. Formatting the Series object directly embedded
    # its repr (index labels, truncated values, dtype footer) into the prompt.
    for string in strings:
        next_section = f"\n{string}"
        if num_tokens(message + next_section, model=model) > token_budget:
            break
        message += next_section

    return message