In [69]:
# Imports

import os
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from dotenv import load_dotenv
from gensim.models import Word2Vec
from chromadb import Client
from openai import OpenAI

In [70]:
QUERY_TEXT = "What was Adrian Toomes' objective?"

In [71]:
# Download NLTK data files

nltk.download('punkt', download_dir='./nltk')
nltk.download('stopwords', download_dir='./nltk')

[nltk_data] Downloading package punkt to ./nltk...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ./nltk...
[nltk_data]   Package stopwords is already up-to-date!


True

In [72]:
# Declare variables

sample_file_path = "samples"
model_name = "sample.model"

In [73]:
# Read files and create tokens

stop_words = stopwords.words('english')

doc_tokens = list()
doc_content = list()
for sample_file in os.listdir("./" + sample_file_path):
    with open(f"./{sample_file_path}/{sample_file}", 'r', encoding='utf-8') as f:
        content = f.read()
        doc_content.append(content)
        tokens = word_tokenize(content)
        doc_tokens.append([word.lower() for word in tokens if word not in stop_words])

In [74]:
# Train model

model = Word2Vec(sentences=doc_tokens, min_count=1, window=5, workers=4, vector_size=384)
print(model)
model.train(doc_tokens, total_examples=len(doc_tokens), epochs=10)

Word2Vec<vocab=681, vector_size=384, alpha=0.025>


(11123, 15060)

`Word2Vec`

**sentences:** The list of sentences split into words in lowercase.

**min_count:** Which words to consider in accordance to the number of times they appear in the sentences. For example, if set to 1, that means all the words that occur once or more in all of the sentences will be used to create the embeddings. If set to 2, then all the words that occur twice or more will be created embeddings for.

**window:** The maximum distance between the current and predicted word within a sentence. That is, how many words to the left and right of a given word are considered when training the model.

**workers:** How many CPU cores will be used.

**vector_size:** Dimension of the vectors. Set as 384 because that is the Chroma DB default.

In [75]:
# Create document embeddings

doc_embeddings = list()
for doc_token in doc_tokens:
    valid_tokens = [token for token in doc_token if token in model.wv]
    if not valid_tokens:
        vector = np.zeros(model.vector_size)
    else:
        vector = np.mean([model.wv[token] for token in valid_tokens], axis=0)
    doc_embeddings.append([float(value) for value in vector])

In [76]:
# Instantiate Chroma DB client

chroma_client = Client()
print(chroma_client.database)
# chroma_client.delete_collection("document_embeddings")

default_database


In [77]:
# Create and populate collection

collection = chroma_client.get_or_create_collection(name="document_embeddings")
collection.upsert(
    ids=["doc1", "doc2", "doc3"],
    embeddings=doc_embeddings,
    documents=doc_content
)

In [78]:
# Query the collection

query_result = collection.query(
    query_texts=[QUERY_TEXT],
    n_results=2
)
print(query_result)

{'ids': [['doc3', 'doc2']], 'distances': [[1.0010535717010498, 1.0010854005813599]], 'metadatas': [[None, None]], 'embeddings': None, 'documents': [['After Quentin Beck frames Peter Parker for his murder and reveals that Peter is Spider-Man, the Department of Damage Control interrogates Peter; his girlfriend, Michelle "MJ" Jones-Watson; his best friend, Ned Leeds; and his aunt, May Parker. Lawyer Matt Murdock gets Peter\'s charges dropped, but the group grapples with negative publicity. After Peter\'s, MJ\'s, and Ned\'s MIT applications are rejected, Peter goes to the New York Sanctum to ask Dr. Stephen Strange for help. Strange starts casting a spell that would make everyone forget Peter is Spider-Man, but it is corrupted when Peter repeatedly requests alterations to let his loved ones retain their memories. Strange contains the corrupted spell.\nAt Strange\'s suggestion, Peter tries to convince an MIT administrator to reconsider MJ\'s and Ned\'s applications. He is attacked by Otto O

In [79]:
# Load environment variables

load_dotenv()

OPENAI_ORGANIZATION_ID = os.getenv('OPENAI_ORGANIZATION_ID')
OPENAI_PROJECT_ID = os.getenv('OPENAI_PROJECT_ID')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [80]:
# Instantiate OpenAI client

openai = OpenAI(
    api_key=OPENAI_API_KEY,
    organization=OPENAI_ORGANIZATION_ID,
    project=OPENAI_PROJECT_ID
)

In [81]:
# Making and retrieving OpenAI response

tools = list()
for document in query_result["documents"]:
    temp = {
      "type": "function",
      "function": {
        "name": "information",
        "description": "the chat information",
        "parameters": {
          "type": "object",
          "properties": {
            "text": {
              "type": "string",
              "description": document,
            },
          },
          "required": ["text"],
        },
      }
    }
    tools.append(temp)

messages = [
    {
        "role": "system",
        "content": "You are an assistant."
    },
    {
        "role": "user",
        "content": QUERY_TEXT
    }
]

response = openai.chat.completions.create(
    messages=messages,
    model="gpt-3.5-turbo",
    tools=tools,
    tool_choice="required"
)

print(response.json())

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}