In [None]:
from util import load_config, set_embedding
from dotenv import dotenv_values
from ingestion import IngestionEngine
import os

# Read secrets from the .env file one directory up and the ingestion settings
# from the TOML config next to it (both paths are relative).
env_values = dotenv_values(dotenv_path="../.env")
config_file = "../ingestion_config.toml"
config = load_config(config_file=config_file)

# Export the credentials that later cells read back with os.getenv.
# A value from .env takes precedence; if .env has no usable value, an
# already-exported variable is kept. Failing here with the variable name is
# clearer than the bare KeyError (key absent) or TypeError (key declared
# without a value, which dotenv_values reports as None) raised otherwise.
for secret_name in ("PINECONE_API_KEY", "GITHUB_TOKEN"):
    secret_value = env_values.get(secret_name)
    if secret_value is None:
        secret_value = os.environ.get(secret_name)
    if secret_value is None:
        raise KeyError(
            f"{secret_name} is missing from ../.env and from the process environment"
        )
    os.environ[secret_name] = secret_value

In [None]:
from pinecone import Pinecone
from llama_index.core import Settings

# Pinecone client built from the API key held in the process environment.
pinecone_client = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Hand the embedding model name from the config to set_embedding.
set_embedding(embed_model_name=config["embed_model"])

# Engine shared by the indexing cells; an empty extractor list is passed.
ingestion_engine = IngestionEngine(
    pinecone_client=pinecone_client,
    dimension=config["dimension"],
    splitting=config["splitting"],
    extractors=[],
)

In [None]:
# Warn when the Pinecone client already reports at least one index name.
existing_index_names = pinecone_client.list_indexes().names()
if existing_index_names:
    print("Vector Database is not empty.")

In [None]:
print("Index data into 'tech-docs'.")

# Build documents from every URL listed under "urls" in the config.
docs = ingestion_engine.docs_from_urls(urls=config["urls"])

# Record the destination index name on each document's metadata.
target_index = "tech-docs"
for doc in docs:
    doc.metadata["index_name"] = target_index

# NOTE(review): delete_index=True presumably drops any existing index of this
# name before indexing; confirm against IngestionEngine.index_documents.
ingestion_engine.index_documents(
    index_name=target_index,
    documents=docs,
    delete_index=True,
)

In [None]:
print("Index data into 'so-posts'.")

# Build documents from the directory configured under "data_dir".
docs = ingestion_engine.docs_from_dir(data_dir=config["data_dir"])

# Record the destination index name on each document's metadata.
target_index = "so-posts"
for doc in docs:
    doc.metadata["index_name"] = target_index

# NOTE(review): delete_index=True presumably drops any existing index of this
# name before indexing; confirm against IngestionEngine.index_documents.
ingestion_engine.index_documents(
    index_name=target_index,
    documents=docs,
    delete_index=True,
)

In [None]:
# Report only whether the token is present. Printing the token itself would
# store the credential in the notebook's saved cell output.
print(f"GITHUB_TOKEN set: {bool(os.getenv('GITHUB_TOKEN'))}")

print("Index data into 'github'.")

# Collect the documents of every project listed under "github" in the config.
docs = []
for project_name in config["github"]:
    docs.extend(ingestion_engine.docs_from_github(project_name=project_name))

# Record the destination index name on each document's metadata.
for d in docs:
    d.metadata["index_name"] = "github"

# NOTE(review): delete_index=True presumably drops any existing index of this
# name before indexing; confirm against IngestionEngine.index_documents.
ingestion_engine.index_documents(
    index_name="github",
    documents=docs,
    delete_index=True
)