In [1]:
import os
from dotenv import load_dotenv
import pymongo
from tqdm import tqdm

# Load environment variables from a .env file; the following cells read them
# with os.getenv. The call is left as a bare expression, so its return value
# is what the cell displays.
load_dotenv()

True

In [2]:
# Retrieve the settings this notebook needs from the environment.
MONGODB_CONN_URI = os.getenv('MONGODB_CONN_URI')
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')

# Collect the names of variables that are unset or empty, so the message
# states exactly what is missing instead of a generic "one of them".
missing_env_vars = [
    name
    for name, value in (
        ('MONGODB_CONN_URI', MONGODB_CONN_URI),
        ('GITHUB_TOKEN', GITHUB_TOKEN),
    )
    if not value
]

# Report the outcome. The messages name the GitHub token, which is the
# variable actually checked here (not an OpenAI API key).
if missing_env_vars:
    print(f"Error: Missing environment variable(s): {', '.join(missing_env_vars)}")
else:
    print("Loaded MongoDB connection URI and GitHub token")

Loaded MongoDB connection url and OpenAI API Key


In [3]:
# Create a MongoDB client instance and confirm the server is reachable.
# MongoClient connects lazily, so constructing it does not by itself detect
# an unreachable server; the 'ping' command forces a round trip.
try:
    client = pymongo.MongoClient(MONGODB_CONN_URI)
    client.admin.command('ping')
except pymongo.errors.PyMongoError as e:
    # PyMongoError is the base class of PyMongo's exceptions, covering an
    # invalid URI, configuration problems and server-selection timeouts.
    # (pymongo.errors has no `ConnectionError`; naming it here would raise
    # AttributeError as soon as a real failure occurred.)
    print(f"Error connecting to MongoDB: {e}")
    # Re-raise so that later cells do not run against a missing or unusable
    # `client`.
    raise

In [5]:
# Names of the MongoDB database, the collection inside it, and the vector
# index, in that order.
DB_NAME, COLLECTION_NAME, INDEX_NAME = (
    "arb_knowledgebase",
    "docs",
    "idx_embedding",
)

In [7]:
from langchain_openai import OpenAIEmbeddings
from llama_index.core import Settings

# Embedding client: the "text-embedding-3-large" model requested from the
# endpoint given as base_url, with GITHUB_TOKEN supplied as the API key.
embedding_options = {
    "api_key": GITHUB_TOKEN,
    "base_url": "https://models.inference.ai.azure.com",
    "model": "text-embedding-3-large",
}
embed_model = OpenAIEmbeddings(**embedding_options)

# Register this embedding model on the shared Settings object.
Settings.embed_model = embed_model

In [8]:
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.core import StorageContext

# Vector store over the MongoDB collection DB_NAME / COLLECTION_NAME, reached
# through the existing `client` and using the index named INDEX_NAME.
vector_store = MongoDBAtlasVectorSearch(
    vector_index_name=INDEX_NAME,
    collection_name=COLLECTION_NAME,
    db_name=DB_NAME,
    mongodb_client=client,
)

# Storage context that uses the vector store above; all other components
# keep their defaults.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [None]:
from llama_index.core import SimpleDirectoryReader

# Directory that holds the input files
data_dir = 'data'

# Read only the files directly inside data_dir (no descent into
# subdirectories) and keep what the reader returns.
reader = SimpleDirectoryReader(input_dir=data_dir, recursive=False)
docs = reader.load_data()

# Report how many items were loaded
loaded_count = len(docs)
print(f"Loaded {loaded_count} chunks from '{data_dir}'")

Loaded 1 chunks from 'data'


In [12]:
from llama_index.core import VectorStoreIndex

# Build a VectorStoreIndex from the loaded documents, using the storage
# context (and therefore the vector store) and the embedding model that were
# set up in earlier cells.
index = VectorStoreIndex.from_documents(
    docs,
    embed_model=embed_model,
    storage_context=storage_context,
)