# Chunks

In [1]:
import csv
import datetime
import math
import os
import sys

import openai
import tiktoken
from dotenv import load_dotenv
from openai.embeddings_utils import cosine_similarity

In [2]:
# Timestamp of this run, shown so readers know how fresh the outputs are
now = datetime.datetime.today()
print('Today is:', now.strftime('%d-%b-%Y %H:%M:%S'))

Today is: 21-Sep-2023 08:30:52


In [3]:
# Load the connection settings from the local env file into the environment
load_dotenv("azure.env")

# Azure Open AI
openai.api_type = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")

# os.getenv returns None for an unset variable; name the missing ones here so
# a configuration problem is visible before the first API call.
missing_settings = [
    name
    for name in ("OPENAI_API_KEY", "OPENAI_API_BASE", "OPENAI_API_VERSION")
    if not os.getenv(name)
]
if missing_settings:
    print("Warning. Missing settings:", ", ".join(missing_settings))

print("Open AI version:", openai.__version__)

Open AI version: 0.28.0


In [4]:
sys.version

'3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]'

In [5]:
def split_text(text, chunk_size):
    """
    Split text into consecutive fixed-size pieces.

    The split is purely character based: every piece holds ``chunk_size``
    characters except possibly the last one, which holds the remainder.

    Args:
        text: String (or any sliceable sequence) to split.
        chunk_size: Maximum number of characters per piece; must be > 0.

    Returns:
        List of pieces in their original order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not strictly positive.
    """
    # A negative step makes range() empty, which would silently drop all of
    # the text, and a zero step fails with a message unrelated to chunking.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]

In [6]:
def get_embeddings(text_chunks, deployment_id="text-embedding-ada-002"):
    """
    Create embeddings for a list of text chunks.

    Newlines in every chunk are replaced by spaces and all chunks are sent
    in a single ``openai.Embedding.create`` request.

    Args:
        text_chunks: Iterable of strings to embed.
        deployment_id: Name of the embedding deployment to call. Defaults to
            "text-embedding-ada-002".

    Returns:
        List with the "embedding" value of every item in the response's
        "data" list. An empty list is returned when there is nothing to
        embed, when the response has no "data", or when any error occurs
        (the error is printed, not raised).
    """
    try:
        prepared_chunks = [chunk.replace("\n", " ") for chunk in text_chunks]

        # Nothing to embed: skip the request instead of sending an empty input.
        if not prepared_chunks:
            return []

        response = openai.Embedding.create(
            input=prepared_chunks,
            deployment_id=deployment_id,
        )

        if response and "data" in response:
            embeddings = [data["embedding"] for data in response["data"]]
            return embeddings

    except Exception as e:
        # Best effort by design: report the problem and fall through to [].
        print(f"Error creating embeddings: {e}")

    return []

In [7]:
def export_embeddings_to_csv(embeddings, csv_filename):
    """
    Write embedding vectors to a csv file, one vector per row.

    The file at ``csv_filename`` is opened in write mode, so existing
    content is replaced. A confirmation line is printed afterwards.
    """
    with open(csv_filename, "w", newline="") as out_file:
        row_writer = csv.writer(out_file)
        for vector in embeddings:
            row_writer.writerow(vector)
    print("Embeddings have been saved in:", csv_filename)

In [8]:
def import_embeddings_from_csv(csv_filename):
    """
    Read embeddings from a csv file.

    Every non-empty row is parsed as one vector of floats.

    Args:
        csv_filename: Path of the csv file to read.

    Returns:
        List of vectors (lists of floats) in file order, or None when the
        file does not exist (a message is printed in that case).

    Raises:
        ValueError: If a field cannot be converted to float.
    """
    if not os.path.exists(csv_filename):
        print("Error. File does not exist.")
        return None

    with open(csv_filename, "r", newline="") as csv_file:
        # csv.reader yields [] for a blank line; skip those so a stray empty
        # line does not turn into a zero-length embedding.
        return [
            [float(value) for value in row]
            for row in csv.reader(csv_file)
            if row
        ]

In [9]:
def export_chunks_to_csv(chunks, csv_filename):
    """
    Save text chunks to a UTF-8 csv file.

    The file starts with a single "chunk" header row, followed by one
    single-column row per chunk. A confirmation line is printed once the
    rows are written.
    """
    with open(csv_filename, "w", encoding="utf-8", newline="") as out_file:
        chunk_writer = csv.writer(out_file)
        chunk_writer.writerow(["chunk"])
        for piece in chunks:
            chunk_writer.writerow([piece])
        print("Chunks have been saved in:", csv_filename)

In [10]:
def import_chunks_from_csv(csv_filename):
    """
    Read chunks from a UTF-8 csv file.

    The first row is treated as a header and skipped; the first column of
    every following non-empty row is returned as one chunk.

    Args:
        csv_filename: Path of the csv file to read.

    Returns:
        List of chunk strings in file order, or None when the file does not
        exist (a message is printed in that case).
    """
    if not os.path.exists(csv_filename):
        print("Error. File does not exist.")
        return None

    with open(csv_filename, "r", encoding="utf-8", newline="") as csv_file:
        reader = csv.reader(csv_file)
        # The None default keeps an empty file from raising StopIteration.
        next(reader, None)  # Skip header row
        # Blank lines come back as [] and have no first column to read.
        return [row[0] for row in reader if row]

## Document

In [11]:
DOC_FILE = "document.txt"

In [12]:
!ls $DOC_FILE -lh

-rwxrwxrwx 1 root root 78K Sep 13 10:45 document.txt


In [13]:
# Decode explicitly as UTF-8 (the same encoding the chunk csv helpers use)
# instead of relying on the platform's default encoding.
with open(DOC_FILE, "r", encoding="utf-8") as f:
    mytext = f.read()

In [14]:
len(mytext)

78458

In [15]:
# Look the tokenizer up by model name; the earlier get_encoding() result was
# overwritten by this assignment before it was ever used.
encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
print(f"{len(encoding.encode(mytext))} tokens")

16085 tokens


## Generating chunks

In [16]:
text_chunks = split_text(mytext, 20000)
len(text_chunks)

4

In [17]:
myembeddings = get_embeddings(text_chunks)

In [18]:
len(myembeddings)

4

In [19]:
len(myembeddings[0])

1536

In [20]:
myembeddings[0][:10]

[-0.00039046857273206115,
 0.018460722640156746,
 -0.0016234100330621004,
 -0.027231184765696526,
 -0.01879754848778248,
 -0.0011473177000880241,
 -0.002461429685354233,
 -0.012546813115477562,
 -0.007805322762578726,
 -0.01094040647149086]

In [21]:
# Token count of every chunk, numbered from 1 in the printed label
for i, chunk in enumerate(text_chunks):
    print(f"Chunk {i+1} = {len(encoding.encode(chunk))} tokens")

Chunk 1 = 4033 tokens
Chunk 2 = 3964 tokens
Chunk 3 = 4091 tokens
Chunk 4 = 3999 tokens


## Saving embeddings

In [22]:
embeddings_file = "embeddings.csv"

In [23]:
export_embeddings_to_csv(myembeddings, embeddings_file)

Embeddings have been saved in: embeddings.csv


In [24]:
!ls $embeddings_file -lh

-rwxrwxrwx 1 root root 129K Sep 21 08:30 embeddings.csv


## Saving chunks

In [25]:
chunks_file = "chunks.csv"

In [26]:
export_chunks_to_csv(text_chunks, chunks_file)

Chunks have been saved in: chunks.csv


In [27]:
!ls $chunks_file -lh

-rwxrwxrwx 1 root root 77K Sep 21 08:30 chunks.csv


## Importing chunks from a file

In [28]:
mychunks = import_chunks_from_csv(chunks_file)

In [29]:
len(mychunks)

4

## Importing embeddings from a file

In [30]:
myemb = import_embeddings_from_csv(embeddings_file)