<a href="https://colab.research.google.com/github/rawkintrevo/sme-seeks/blob/main/notebooks/Building_qMahout_Index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install -q llama-index==0.9.26 GitPython nbformat nbconvert "pinecone-client[grpc]"
# llama-index is pinned only because the current release is broken

# Clone Qiskit API Docs and Delete Prior Versions

In [2]:
# Repositories to index. Each entry is unpacked by the collection loop as
# (clone URL, display name, docs path, delete-subdirectories path):
#   * display name - also used as the local clone directory and as the title prefix
#   * docs path    - directory inside the clone that is walked for .md/.mdx/.ipynb files
#   * delete path  - directory (relative to the docs path) whose subdirectories are
#                    removed before walking, or None to keep everything
git_targets = [
    # ("https://github.com/<org>/<repo>.git", "Title", "path/to/docs", "path/under/docs/to/prune" or None)
    ("https://github.com/Qiskit/documentation.git", "Qiskit Documentation", "docs", "api/qiskit"),
    ("https://github.com/qiskit-community/qiskit-textbook.git", "Qiskit Textbook", "content", None),
    ("https://github.com/qiskit-community/qiskit-community-tutorials.git", "Qiskit Community Tutorials", "", None),
    ("https://github.com/qiskit-community/qiskit-machine-learning.git", "Qiskit Machine Learning", "docs", None),
]

# Name of the Pinecone index that is deleted, recreated and repopulated further down.
INDEX_NAME = "qmahout"

In [3]:

import os
import shutil

def delete_subdirectories(directory):
    """Remove every immediate subdirectory of ``directory``, keeping its files.

    Real subdirectories are deleted recursively. A symbolic link that points
    at a directory is unlinked (its target is left untouched) because
    ``shutil.rmtree`` refuses to operate on symlinks and would raise
    ``OSError``. Regular files and broken symlinks are left in place.

    Args:
        directory: Path of the directory whose subdirectories are removed.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        NotADirectoryError: If ``directory`` is not a directory.
    """
    # Snapshot the listing first so entries are not removed while the
    # directory iterator is still open.
    with os.scandir(directory) as scan:
        entries = list(scan)

    for entry in entries:
        if entry.is_symlink():
            # Only drop links that resolve to a directory; never follow them.
            if os.path.isdir(entry.path):
                os.unlink(entry.path)
        elif entry.is_dir():
            shutil.rmtree(entry.path)

In [4]:
from git import Repo
import os
from datetime import datetime

from llama_index.readers.schema.base import Document

from datetime import datetime

import nbformat
from nbconvert import MarkdownExporter

# Collect one llama-index Document per Markdown/MDX file and per notebook
# found under each target's docs path.
documents = []

# Loop invariants: a single exporter and a single date stamp serve every document.
markdown_exporter = MarkdownExporter()
indexed_on = datetime.now().strftime("%Y-%m-%d")

for url, name, path, delete_subdirectories_path in git_targets:
    repo_dir = f"./{name}"
    if os.path.isdir(repo_dir):
        # Re-running the cell: reuse the existing checkout instead of letting
        # `git clone` fail on a non-empty target directory.
        print(f"Reusing existing clone of {url} in {name}")
        repo = Repo(repo_dir)
    else:
        print(f"Cloning {url} to {name}")
        # Only the working tree is read, so a shallow clone is enough.
        repo = Repo.clone_from(url, repo_dir, depth=1)

    # Build source links against the branch that was actually checked out;
    # not every repository uses "main" as its default branch.
    branch = repo.active_branch.name
    base_url = url[:-len(".git")] if url.endswith(".git") else url

    docs_dir = os.path.join(repo_dir, path)
    if delete_subdirectories_path is not None:
        delete_subdirectories(os.path.join(docs_dir, delete_subdirectories_path))
        print(f"Subdirectories in '{delete_subdirectories_path}' have been deleted.")

    for root, dirs, files in os.walk(docs_dir):
        # Never descend into git metadata (reachable when `path` is "").
        dirs[:] = [d for d in dirs if d != ".git"]
        for file_name in files:
            file_path = os.path.join(root, file_name)
            # Check the full path: the bare file name would be resolved
            # against the current working directory and never match.
            if os.path.islink(file_path):
                continue
            # Path inside the repository, used for titles and source links.
            rel_path = os.path.relpath(file_path, repo_dir).replace(os.sep, "/")

            if file_name.endswith((".md", ".mdx")):
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = file.read()
                # Use the first "title:" line (front matter) when present and
                # non-empty; otherwise fall back to the repository-relative path.
                prefix = "title:"
                doc_title = next(
                    (line[len(prefix):].strip()
                     for line in data.splitlines()
                     if line.startswith(prefix)),
                    "",
                )
                title = f"{name} - {doc_title or rel_path}"
            elif file_name.endswith(".ipynb"):
                with open(file_path, 'r', encoding='utf-8') as notebook_file:
                    notebook_content = nbformat.read(notebook_file, as_version=4)
                # Convert the notebook to Markdown so it is indexed as text.
                data, _ = markdown_exporter.from_notebook_node(notebook_content)
                title = f"{name} - {rel_path}"
            else:
                # Some other unsupported file type - just skip it.
                continue

            documents.append(Document(text=data,
                                      metadata={"date": indexed_on,
                                                "src": f"{base_url}/blob/{branch}/{rel_path}",
                                                "title": title}))
    print(f"{len(documents)} documents found so far.")

Cloning https://github.com/Qiskit/documentation.git to Qiskit Documentation
Subdirectories in 'api/qiskit' have been deleted.


  validate(nb)


994 documents found so far.
Cloning https://github.com/qiskit-community/qiskit-textbook.git to Qiskit Textbook


  validate(nb)


1093 documents found so far.
Cloning https://github.com/qiskit-community/qiskit-community-tutorials.git to Qiskit Community Tutorials


  validate(nb)


1397 documents found so far.
Cloning https://github.com/qiskit-community/qiskit-machine-learning.git to Qiskit Machine Learning
1409 documents found so far.


In [5]:
# create and reload with new docs
from llama_index.storage.storage_context import StorageContext
from llama_index import VectorStoreIndex
from llama_index.node_parser import SimpleNodeParser
from llama_index.vector_stores import PineconeVectorStore
import pinecone
import openai

from time import sleep

from google.colab import userdata

# Credentials come from Colab's secret store, never from the notebook itself.
# The OpenAI key is required for making embeddings - workarounds exist, they
# aren't great - pay the $3 and have it done right.
openai.api_key = userdata.get('open_ai_key')
pinecone.init(api_key=userdata.get("pinecone_api_key_quantum"), environment="gcp-starter")

# Must match the embedding model's output size.
# NOTE(review): 1536 presumably corresponds to the OpenAI embedding model
# llama-index uses by default here - confirm if the embed model changes.
EMBEDDING_DIMENSION = 1536
# Target chunk size used when splitting documents into nodes.
CHUNK_SIZE = 1030

# Start from a clean index. Only delete when it exists, so the very first run
# (no index yet) does not fail.
if INDEX_NAME in pinecone.list_indexes():
    print(f"Deleting index {INDEX_NAME}...")
    pinecone.delete_index(INDEX_NAME)
    print('take a little nappy-poo')
    sleep(5)  # small grace period before re-creating an index with the same name
print(f"Creating index {INDEX_NAME}...")
pinecone.create_index(
    INDEX_NAME, dimension=EMBEDDING_DIMENSION, metric="euclidean", pod_type="p1"
)

pinecone_index = pinecone.Index(INDEX_NAME)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Parse documents into nodes.
# The chunk size is configured on the parser itself; passing it as a keyword
# to get_nodes_from_documents() appears to be silently ignored in
# llama-index 0.9.x (NOTE(review): confirm against the pinned version).
print(f"Parsing {len(documents)} documents into nodes...")
parser = SimpleNodeParser.from_defaults(chunk_size=CHUNK_SIZE)
new_nodes = parser.get_nodes_from_documents(documents)

# Add nodes to the freshly created index (embeds and upserts them).
print(f"Adding {len(new_nodes)} new nodes to the existing index {INDEX_NAME}...")
index.insert_nodes(new_nodes)

  from tqdm.autonotebook import tqdm


Deleting index qmahout...
take a little nappy-poo
Creating index qmahout...
Parsing 1409 documents into nodes...
Adding 5119 new nodes to the existing index qmahout...


Upserted vectors:   0%|          | 0/2048 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/2048 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/1023 [00:00<?, ?it/s]