In [24]:
# Imports

import os
import nltk
import json
import numpy as np
import requests
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from dotenv import load_dotenv
from gensim.models import Word2Vec
from openai import OpenAI
from psycopg2 import connect
from cloudflare import Cloudflare
from io import BytesIO

In [2]:
QUERY_TEXT = "Who was Quentin Beck?"

In [3]:
# Download NLTK data files

nltk.download('punkt', download_dir='./nltk')
nltk.download('stopwords', download_dir='./nltk')

[nltk_data] Downloading package punkt to ./nltk...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ./nltk...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Declare variables

sample_file_path = "samples"
index_name = "rag-test"

In [5]:
# Read files and create tokens

stop_words = stopwords.words('english')

doc_tokens = list()
doc_content = list()
doc_files = list()
for sample_file in os.listdir("./" + sample_file_path):
    with open(f"./{sample_file_path}/{sample_file}", 'r', encoding='utf-8') as f:
        content = f.read()
        doc_content.append(content)
        tokens = word_tokenize(content)
        doc_tokens.append([word.lower() for word in tokens if word not in stop_words])
        doc_files.append(sample_file)

In [6]:
# Train model

model = Word2Vec(sentences=doc_tokens, min_count=1, window=5, workers=4, vector_size=384)
print(model)
model.train(doc_tokens, total_examples=len(doc_tokens), epochs=10)

Word2Vec<vocab=681, vector_size=384, alpha=0.025>


(11172, 15060)

`Word2Vec`

**sentences:** The list of sentences split into words in lowercase.

**min_count:** Which words to consider in accordance to the number of times they appear in the sentences. For example, if set to 1, that means all the words that occur once or more in all of the sentences will be used to create the embeddings. If set to 2, then all the words that occur twice or more will be created embeddings for.

**window:** The maximum distance between the current and predicted word within a sentence. That is, how many words to the left and right of a given word are considered when training the model.

**workers:** How many CPU cores will be used.

**vector_size:** Dimension of the vectors. Set as 384 because that is the Chroma DB default.

In [41]:
# Create document embeddings

doc_embeddings = list()
for doc_token in doc_tokens:
    valid_tokens = [token for token in doc_token if token in model.wv]
    if not valid_tokens:
        vector = np.zeros(model.vector_size)
    else:
        vector = np.mean([model.wv[token] for token in valid_tokens], axis=0)
    doc_embeddings.append([float(value) for value in vector])

In [8]:
# Load environment variables

load_dotenv()

OPENAI_ORGANIZATION_ID = os.getenv('OPENAI_ORGANIZATION_ID')
OPENAI_PROJECT_ID = os.getenv('OPENAI_PROJECT_ID')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

CLOUDFLARE_TOKEN = os.getenv('CLOUDFLARE_TOKEN')
CLOUDFLARE_ACCOUNT_IDENTIFIER = os.getenv('CLOUDFLARE_ACCOUNT_IDENTIFIER')

DB_HOST = os.getenv('DB_HOST')
DB_NAME = os.getenv('DB_NAME')
DB_USERNAME = os.getenv('DB_USERNAME')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_PORT = os.getenv('DB_PORT')

In [9]:
# Instantiate Postgres client

pg_config = {
    'host': DB_HOST,
    'database': DB_NAME,
    'user': DB_USERNAME,
    'password': DB_PASSWORD,
    'port': DB_PORT
}

pg_client = connect(**pg_config)
cursor = pg_client.cursor()

In [10]:
# Create database table

create_table_query = """
CREATE TABLE IF NOT EXISTS embedding_docs (
            id SERIAL PRIMARY KEY, 
            document VARCHAR(200),
            content text
            );
            """
cursor.execute(create_table_query)
pg_client.commit()

In [11]:
# Populate table

doc_ids = list()
for i in range(len(doc_content)):
    temp = (doc_files[i], doc_content[i])
    cursor.execute("INSERT into embedding_docs(document, content) VALUES (%s, %s) RETURNING id", temp)
    obj_id = cursor.fetchone()[0]
    doc_ids.append(obj_id)

print("Insertion Complete")
pg_client.commit()
cursor.close()

Insertion Complete


In [17]:
# Create Vectorize index

url = f"https://api.cloudflare.com/client/v4/accounts/{CLOUDFLARE_ACCOUNT_IDENTIFIER}/vectorize/indexes"

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {CLOUDFLARE_TOKEN}"
}

payload = {
    "config": {
        "dimensions": 384,
        "metric": "cosine"
    },
    "description": "RAG test vectorize index",
    "name": index_name
}

response = requests.post(url=url, headers=headers, json=payload)

# Delete Vectorize index
# url = f"https://api.cloudflare.com/client/v4/accounts/{CLOUDFLARE_ACCOUNT_IDENTIFIER}/vectorize/indexes/{index_name}"
# response = requests.delete(url=url, headers=headers)

print(response.status_code)
print(response.text)

201
{
  "result": {
    "created_on": "2024-06-25T05:41:35.711043Z",
    "modified_on": "2024-06-25T05:41:35.711043Z",
    "name": "rag-test",
    "description": "RAG test vectorize index",
    "config": {
      "dimensions": 384,
      "metric": "cosine"
    }
  },
  "success": true,
  "errors": [],
  "messages": []
}



In [42]:
# Vector upsert

vectors = list()
for idx, doc_embedding in enumerate(doc_embeddings):
    vectors.append({
        "id": str(doc_ids[idx]),
        "values": doc_embedding
    })

# url = f"https://api.cloudflare.com/client/v4/accounts/{CLOUDFLARE_ACCOUNT_IDENTIFIER}/vectorize/indexes/{index_name}/upsert"

# filename = 'vectors.ndjson'
# with open(filename, 'w') as file:
#     for vector in vectors:
#         json.dump(vector, file)
#         file.write('\n')

ndjson_str = "\n".join(json.dumps(vector) for vector in vectors)

# headers = {
#     "Content-Type": "application/x-ndjson",
#     "Authorization": f"Bearer {CLOUDFLARE_TOKEN}"
# }

# response = requests.post(url=url, data=filename, headers=headers)

# print(response.status_code)
# print(response.text)

# ndjson_file = BytesIO()
# for vector in vectors:
#     ndjson_file.write(json.dumps(vector).encode('utf-8'))
#     ndjson_file.write(b'\n')

# ndjson_file.seek(0)

cf_client = Cloudflare(api_token=CLOUDFLARE_TOKEN)
result = cf_client.vectorize.indexes.upsert(
    index_name=index_name,
    account_id=CLOUDFLARE_ACCOUNT_IDENTIFIER,
    body=ndjson_str,
    extra_headers={"Content-Type": "application/x-ndjson"}
)
print(result.model_dump())

BadRequestError: Error code: 400 - {'result': None, 'success': False, 'errors': [{'code': 1006, 'message': 'vectorize.invalid_body_payload'}], 'messages': []}

In [None]:
# Embed user query

query_tokens = word_tokenize(QUERY_TEXT)
query_tokens = [word.lower() for word in query_tokens if word not in stop_words]

query_model = Word2Vec(sentences=query_tokens, min_count=1, window=5, workers=4, vector_size=384)
print(query_model)
query_model.train(query_tokens, total_examples=len(query_tokens), epochs=10)

valid_tokens = [token for token in query_tokens if token in query_model.wv]
if not valid_tokens:
    query_embeddings = np.zeros(query_model.vector_size)
else:
    query_embeddings = np.mean([query_model.wv[token] for token in valid_tokens], axis=0)

query_embeddings = list(map(float, query_embeddings))

In [None]:
# Query vectors

url = f"https://api.cloudflare.com/client/v4/accounts/{CLOUDFLARE_ACCOUNT_IDENTIFIER}/vectorize/indexes/{index_name}/query"

payload = {"vector": query_embeddings}

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {CLOUDFLARE_TOKEN}"
}

response = requests.post(url=url, headers=headers, json=payload)

print(response.status_code)
print(response.text)

In [None]:
# Instantiate OpenAI client

openai = OpenAI(
    api_key=OPENAI_API_KEY,
    organization=OPENAI_ORGANIZATION_ID,
    project=OPENAI_PROJECT_ID
)

In [None]:
# Making OpenAI request

tools = list()
for resultant_content in resultant_contents:
    temp = {
      "type": "function",
      "function": {
        "name": "information",
        "description": "the chat information",
        "parameters": {
          "type": "object",
          "properties": {
            "text": {
              "type": "string",
              "description": resultant_content.get('content'),
            },
          },
          "required": ["text"],
        },
      }
    }
    tools.append(temp)

messages = [
    {
        "role": "system",
        "content": "You are an assistant."
    },
    {
        "role": "user",
        "content": QUERY_TEXT
    }
]

response = openai.chat.completions.create(
    messages=messages,
    model="gpt-3.5-turbo",
    tools=tools,
    tool_choice={"type": "function", "function": {"name": "information"}}
)

In [None]:
# Retrieving OpenAI response

result = response.choices[0].message.tool_calls[0].function.arguments
result_json = json.loads(result)
print(result_json.get('text'))
