In [49]:
# Imports

import os
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from dotenv import load_dotenv
from gensim.models import Word2Vec
from chromadb import PersistentClient

In [50]:
# Download NLTK data files

nltk.download('punkt', download_dir='./nltk')
nltk.download('stopwords', download_dir='./nltk')

[nltk_data] Downloading package punkt to ./nltk...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ./nltk...
[nltk_data]   Package stopwords is already up-to-date!


True

In [51]:
# Declare variables

sample_file_path = "samples"
model_name = "sample.model"

In [52]:
# Read files and create tokens

stop_words = stopwords.words('english')

doc_tokens = list()
doc_content = list()
for sample_file in os.listdir("./" + sample_file_path):
    with open(f"./{sample_file_path}/{sample_file}", 'r', encoding='utf-8') as f:
        content = f.read()
        doc_content.append(content)
        tokens = word_tokenize(content)
        doc_tokens.append([word.lower() for word in tokens if word not in stop_words])

In [53]:
# Train model

model = Word2Vec(sentences=doc_tokens, min_count=1, window=5, workers=4)
print(model)
model.train(doc_tokens, total_examples=len(doc_tokens), epochs=10)

Word2Vec<vocab=681, vector_size=100, alpha=0.025>


(11123, 15060)

`Word2Vec`

**sentences:** The list of sentences split into words in lowercase.

**min_count:** Which words to consider in accordance to the number of times they appear in the sentences. For example, if set to 1, that means all the words that occur once or more in all of the sentences will be used to create the embeddings. If set to 2, then all the words that occur twice or more will be created embeddings for.

**window:** The maximum distance between the current and predicted word within a sentence. That is, how many words to the left and right of a given word are considered when training the model.

**workers:** How many CPU cores will be used.

In [54]:
# Create document embeddings

doc_embeddings = list()
for doc_token in doc_tokens:
    valid_tokens = [token for token in doc_token if token in model.wv]
    if not valid_tokens:
        vector = np.zeros(model.vector_size)
    else:
        vector = np.mean([model.wv[token] for token in valid_tokens], axis=0)
    doc_embeddings.append([float(value) for value in vector])

In [55]:
# Insantiate Chroma DB client

chroma_client = PersistentClient()

In [56]:
# Create and populate collection

collection = chroma_client.get_or_create_collection(name="document_embeddings")

for idx, embedding in enumerate(doc_embeddings):
    collection.upsert(
        ids=["doc1", "doc2", "doc3"],
        embeddings=doc_embeddings,
        documents=doc_content
    )

In [57]:
# Load environment variables

load_dotenv()

OPENAI_ORGANIZATION_ID = os.getenv('OPENAI_ORGANIZATION_ID')
OPENAI_PROJECT_ID = os.getenv('OPENAI_PROJECT_ID')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')