In [None]:
# Imports

import os
import nltk
import json
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from dotenv import load_dotenv
from gensim.models import Word2Vec
from openai import OpenAI
from psycopg2 import connect
from psycopg2.extras import RealDictCursor
from pgvector.psycopg2 import register_vector

In [None]:
# User question: tokenized and embedded for the similarity search further down,
# and sent unchanged as the user message of the chat completion request.
QUERY_TEXT = "Who was Quentin Beck?"

In [None]:
# Download NLTK data files

NLTK_DATA_DIR = './nltk'

# NLTK only searches the directories listed in nltk.data.path, and a custom
# download_dir is not added to that list automatically. Register it so that
# word_tokenize() and stopwords.words() can find the files downloaded below.
_nltk_data_dir_abs = os.path.abspath(NLTK_DATA_DIR)
if _nltk_data_dir_abs not in nltk.data.path:
    nltk.data.path.append(_nltk_data_dir_abs)

nltk.download('punkt', download_dir=NLTK_DATA_DIR)
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' rather than 'punkt'.
nltk.download('punkt_tab', download_dir=NLTK_DATA_DIR)
nltk.download('stopwords', download_dir=NLTK_DATA_DIR)

In [None]:
# Declare variables

# Directory, relative to the working directory, whose files are read as the
# documents to tokenize, embed and store.
sample_file_path = "samples"

In [None]:
# Read files and create tokens

stop_words = stopwords.words('english')
# Set copy for constant-time membership tests; stop_words itself stays a list.
stop_word_lookup = set(stop_words)

sample_dir = f"./{sample_file_path}"

doc_tokens = list()
doc_content = list()
doc_files = list()
# os.listdir() returns entries in arbitrary order; sort so that document order
# (and therefore the inserted rows) is the same on every run.
for sample_file in sorted(os.listdir(sample_dir)):
    sample_path = os.path.join(sample_dir, sample_file)
    if not os.path.isfile(sample_path):
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints); open() would fail on them.
        continue
    with open(sample_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # The NLTK stop-word list is lowercase, so lowercase BEFORE filtering;
    # otherwise capitalised stop words such as "The" are kept.
    lowered_tokens = [word.lower() for word in word_tokenize(content)]
    doc_tokens.append([word for word in lowered_tokens if word not in stop_word_lookup])
    doc_content.append(content)
    doc_files.append(sample_file)

In [None]:
# Train model

# Hyper-parameters are explained in the markdown cell below.
word2vec_params = dict(min_count=1, window=5, workers=4, vector_size=384)

# Supplying `sentences` makes the constructor build the vocabulary and run an
# initial training pass with gensim's default epoch count.
model = Word2Vec(sentences=doc_tokens, **word2vec_params)
print(model)

# Continue training on the same corpus for 10 additional epochs.
model.train(doc_tokens, total_examples=len(doc_tokens), epochs=10)

`Word2Vec`

**sentences:** The training corpus: a list of sentences, each given as a list of lowercase word tokens. Here every document is passed as one sentence.

**min_count:** The minimum number of times a word must appear across all sentences to be included in the vocabulary. For example, if set to 1, every word that occurs at least once gets an embedding. If set to 2, only words that occur twice or more get embeddings.

**window:** The maximum distance between the current and predicted word within a sentence. That is, how many words to the left and right of a given word are considered when training the model.

**workers:** The number of worker threads used for training; more threads train faster on multi-core machines.

**vector_size:** Dimension of the word vectors. Set to 384 because that is the Chroma DB default; it must also match the `vector(384)` column of the `embeddings` table created below.

In [None]:
# Create document embeddings

def _average_word_vectors(tokens):
    """Return the mean word vector of `tokens` as a list of Python floats.

    Tokens that are not in the model vocabulary are ignored. When no token
    is known, a zero vector of length model.vector_size is returned.
    """
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if known_vectors:
        averaged = np.mean(known_vectors, axis=0)
    else:
        averaged = np.zeros(model.vector_size)
    return [float(component) for component in averaged]


# One embedding per document, in the same order as doc_tokens.
doc_embeddings = [_average_word_vectors(doc_token) for doc_token in doc_tokens]

In [None]:
# Load environment variables

# Make the entries of a local .env file available through the process environment.
load_dotenv()

# OpenAI credentials (each is None when the variable is not set).
OPENAI_ORGANIZATION_ID, OPENAI_PROJECT_ID, OPENAI_API_KEY = (
    os.environ.get(key)
    for key in ('OPENAI_ORGANIZATION_ID', 'OPENAI_PROJECT_ID', 'OPENAI_API_KEY')
)

# Postgres connection settings (each is None when the variable is not set).
DB_HOST, DB_NAME, DB_USERNAME, DB_PASSWORD, DB_PORT = (
    os.environ.get(key)
    for key in ('DB_HOST', 'DB_NAME', 'DB_USERNAME', 'DB_PASSWORD', 'DB_PORT')
)

In [None]:
# Instantiate Postgres client

pg_config = {
    'host': DB_HOST,
    'database': DB_NAME,
    'user': DB_USERNAME,
    'password': DB_PASSWORD,
    'port': DB_PORT
}

pg_client = connect(**pg_config)
cursor = pg_client.cursor()

# register_vector() looks up the `vector` type in the connected database and
# fails when the pgvector extension has not been enabled, so enable it first.
# IF NOT EXISTS makes this a no-op on databases that already have it.
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
pg_client.commit()

# Teach psycopg2 how to send and receive pgvector values on this connection.
register_vector(pg_client)

In [None]:
# Create database table

# One row per document: the file name, the full text, and a 384-dimensional
# embedding. IF NOT EXISTS lets this cell be re-run against an existing table.
create_table_query = """
CREATE TABLE IF NOT EXISTS embeddings (
            id SERIAL PRIMARY KEY, 
            document VARCHAR(200),
            content text,
            embedding vector(384)
            );
            """
cursor.execute(create_table_query)
# Commit so the table definition is persisted before rows are inserted.
pg_client.commit()

In [None]:
# Populate table

# Replace any row left behind by a previous run before inserting, so that
# re-executing the notebook does not store the same document several times
# (duplicates would otherwise fill the top results of the similarity search).
for file_name, file_text, file_embedding in zip(doc_files, doc_content, doc_embeddings):
    cursor.execute("DELETE FROM embeddings WHERE document = %s", (file_name,))
    cursor.execute(
        "INSERT into embeddings(document, content, embedding) VALUES (%s, %s, %s)",
        (file_name, file_text, file_embedding),
    )

# Commit before reporting success so the message is only shown once the rows are stored.
pg_client.commit()
cursor.close()
print("Insertion Complete")

In [None]:
# Embed user query

# The NLTK stop-word list is lowercase, so lowercase each token before filtering.
query_tokens = [word.lower() for word in word_tokenize(QUERY_TEXT)]
query_tokens = [word for word in query_tokens if word not in stop_words]

# Embed the query with the model trained on the documents. A separate model
# trained on the query alone has its own vocabulary and a randomly initialised
# vector space, so distances to the stored document embeddings would be
# meaningless. (A flat list of tokens passed as `sentences` is additionally
# interpreted by gensim as sentences made of single characters.)
valid_tokens = [token for token in query_tokens if token in model.wv]
if not valid_tokens:
    # Cosine distance to an all-zero vector is undefined, so the ranking
    # returned by the database carries no information in this case.
    print("Warning: no query token is in the document vocabulary; retrieval results will be arbitrary.")
    query_embeddings = np.zeros(model.vector_size)
else:
    query_embeddings = np.mean([model.wv[token] for token in valid_tokens], axis=0)

# Plain Python floats so the driver can serialise the vector as a query parameter.
query_embeddings = list(map(float, query_embeddings))

In [None]:
# Retrieve content from DB

# Order the stored documents by pgvector's <=> distance to the query embedding
# and keep the two closest ones.
nearest_documents_sql = "SELECT content FROM embeddings ORDER BY embedding <=> %s::vector LIMIT 2"

# RealDictCursor returns each row as a dict keyed by column name.
cursor = pg_client.cursor(cursor_factory=RealDictCursor)
try:
    cursor.execute(nearest_documents_sql, (query_embeddings,))
    resultant_contents = cursor.fetchall()
finally:
    cursor.close()

# The database is not used after this point.
pg_client.close()

print(resultant_contents)

In [None]:
# Instantiate OpenAI client

# Credentials come from the environment variables loaded earlier.
openai_credentials = {
    'api_key': OPENAI_API_KEY,
    'organization': OPENAI_ORGANIZATION_ID,
    'project': OPENAI_PROJECT_ID,
}

openai = OpenAI(**openai_credentials)

In [None]:
# Making OpenAI request

# Join the retrieved documents into a single context block for the prompt.
retrieved_context = "\n\n".join(
    row.get('content') or "" for row in resultant_contents
).strip()
if not retrieved_context:
    retrieved_context = "No context documents were retrieved."

# Exactly one tool named "information" is registered. The forced tool_choice
# below refers to the tool by name, so the name must be unambiguous; the
# retrieved documents are therefore supplied through the system message
# rather than as one same-named tool per document. The model returns its
# answer in the "text" argument of the tool call.
tools = [
    {
      "type": "function",
      "function": {
        "name": "information",
        "description": "Return the answer to the user's question.",
        "parameters": {
          "type": "object",
          "properties": {
            "text": {
              "type": "string",
              "description": "The answer to the user's question, based on the provided context.",
            },
          },
          "required": ["text"],
        },
      }
    }
]

messages = [
    {
        "role": "system",
        "content": (
            "You are an assistant. Answer the user's question using the context below.\n\n"
            "Context:\n" + retrieved_context
        )
    },
    {
        "role": "user",
        "content": QUERY_TEXT
    }
]

response = openai.chat.completions.create(
    messages=messages,
    model="gpt-3.5-turbo",
    tools=tools,
    # Force a call to the "information" tool so the reply is always structured JSON.
    tool_choice={"type": "function", "function": {"name": "information"}}
)

In [None]:
# Retrieving OpenAI response

# The request forces a call to the "information" tool, so the answer is the
# JSON-encoded argument string of the first tool call of the first choice.
first_tool_call = response.choices[0].message.tool_calls[0]
result = first_tool_call.function.arguments

# Decode the arguments and show the "text" field (None when it is absent).
result_json = json.loads(result)
print(result_json.get('text'))
