In [18]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import dotenv_values

# Registry of the corpora handled by this notebook, keyed by document type.
# Every entry names the Pinecone index it is stored in ("index_name").
# Entries backed by local files additionally carry "data_dir", a path relative
# to this notebook; the documentation entry has no "data_dir".
document_types = dict(
    DOCUMENTATIONS=dict(
        index_name="tech-docs",
    ),
    BLOG_POSTS=dict(
        index_name="blog-posts",
        data_dir="../../data/blog_posts",
    ),
    STACK_OVERFLOW_POSTS=dict(
        index_name="so-posts",
        data_dir="../../data/so_posts",
    ),
)

In [19]:
import logging
import sys


# Route INFO-and-above log records to stdout so they show up in the cell output.
# One basicConfig call installs exactly one stdout handler. The previous extra
# addHandler(StreamHandler(sys.stdout)) made every record print twice and
# stacked one more handler each time the cell was re-run.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger, so re-running this cell is idempotent.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)


# Official documentation pages to fetch, grouped by technology.
# The dictionary key is the technology label; the value is the list of page URLs.
tech_docs = {}

# Apache Maven: POM reference, model/settings/toolchains descriptors, getting started.
tech_docs["maven"] = [
    "https://maven.apache.org/pom.html",
    "https://maven.apache.org/ref/3.9.7/maven-model/maven.html",
    "https://maven.apache.org/ref/3.9.7/maven-settings/settings.html",
    "https://maven.apache.org/ref/3.9.7/maven-core/toolchains.html",
    "https://maven.apache.org/guides/getting-started/index.html",
]

# Spring Boot: application properties, external configuration and profiles.
# NOTE(review): the second URL is pinned to the 1.0.1.RELEASE docs, unlike the
# unversioned pages around it; confirm that this is intentional.
tech_docs["spring-boot"] = [
    "https://docs.spring.io/spring-boot/appendix/application-properties/index.html",
    "https://docs.spring.io/spring-boot/docs/1.0.1.RELEASE/reference/html/howto-properties-and-configuration.html",
    "https://docs.spring.io/spring-boot/reference/features/external-config.html",
    "https://docs.spring.io/spring-boot/reference/features/profiles.html",
]

# Docker Compose: overview pages followed by the compose-file reference chapters.
tech_docs["docker-compose"] = [
    "https://docs.docker.com/compose/compose-file/",
    "https://docs.docker.com/compose/compose-application-model/",
    "https://docs.docker.com/compose/intro/features-uses/",
    "https://docs.docker.com/compose/gettingstarted/",
    "https://docs.docker.com/compose/compose-file/04-version-and-name/",
    "https://docs.docker.com/compose/compose-file/05-services/",
    "https://docs.docker.com/compose/compose-file/06-networks/",
    "https://docs.docker.com/compose/compose-file/07-volumes/",
    "https://docs.docker.com/compose/compose-file/08-configs/",
    "https://docs.docker.com/compose/compose-file/09-secrets/",
    "https://docs.docker.com/compose/compose-file/10-fragments/",
    "https://docs.docker.com/compose/compose-file/11-extension/",
    "https://docs.docker.com/compose/compose-file/12-interpolation/",
    "https://docs.docker.com/compose/compose-file/13-merge/",
    "https://docs.docker.com/compose/compose-file/14-include/",
    "https://docs.docker.com/compose/compose-file/15-profiles/",
    "https://docs.docker.com/compose/compose-file/build/",
    "https://docs.docker.com/compose/compose-file/deploy/",
    "https://docs.docker.com/compose/compose-file/develop/",
]

# Docker: Dockerfile reference plus packaging and multi-stage build guides.
tech_docs["docker"] = [
    "https://docs.docker.com/reference/dockerfile/",
    "https://docs.docker.com/build/building/packaging/",
    "https://docs.docker.com/build/building/multi-stage/",
]





In [20]:
import re

def clean_up_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    Steps, in order:

    1. Re-join words that were hyphenated across a line break.
    2. Replace every remaining line break with a space, so that words on
       adjacent lines stay separate words.
    3. Delete decorative em-dash runs, literal backslash-u escape sequences
       and two private-use glyphs (U+F075, U+F0B7).
    4. Remove whitespace around in-word hyphens and collapse every run of
       whitespace into a single space.

    Leading and trailing whitespace is collapsed but not stripped.

    :param content: Text input.

    :return: Cleaned version of original text input.
    """

    # Fix hyphenated words broken by newline
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # Any other newline separates two words. Deleting it outright would glue
    # them together ("hello\nworld" -> "helloworld"), so turn it into a space;
    # the whitespace normalisation below collapses any doubled spaces.
    content = content.replace("\n", " ")

    # Remove specific unwanted patterns and characters
    unwanted_patterns = [
        "  —", "——————————", "—————————", "—————",
        r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
    ]
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Fix improperly spaced hyphenated words and normalize whitespace
    content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
    content = re.sub(r'\s+', ' ', content)

    return content

In [21]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import TitleExtractor, SummaryExtractor
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.readers.web import SimpleWebPageReader
from llama_index.core.schema import MetadataMode
import glob
import os


# Credentials are read from the project-level .env file, never hard-coded here.
config = dotenv_values("../../.env")
pc = Pinecone(api_key=config["PINECONE_API_KEY"])
# Names of the indexes that already exist, fetched once before the loop.
indexes = pc.list_indexes().names()

embed_model = OpenAIEmbedding(api_key=config["OPENAI_KEY"])
llm = OpenAI(model="gpt-4o", api_key=config["OPENAI_KEY"])
Settings.embed_model = embed_model

# The chunking + embedding steps are the same for every document type, so they
# are built once instead of on every loop iteration.
transformations = [
    SentenceSplitter(chunk_size=1024, chunk_overlap=20),
    embed_model,
]

# add data
for document_type, values in document_types.items():
    index_name = values["index_name"]

    # Create the index on first use only.
    if index_name not in indexes:
        pc.create_index(
            name=index_name,
            # NOTE(review): must equal the embedding model's vector size;
            # 1536 presumably matches the default OpenAIEmbedding model -
            # confirm if the model is ever changed.
            dimension=1536,
            metric="dotproduct",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )

    print("Indexing", document_type)
    pinecone_index = pc.Index(index_name)

    # Initialize VectorStore
    vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

    cleaned_documents = []

    # Use SimpleWebPageReader for Tech-Docs
    if index_name == "tech-docs":
        for key, urls in tech_docs.items():
            documents = SimpleWebPageReader(html_to_text=True).load_data(urls)
            for d in documents:
                # Web pages are tagged with their technology and stored as-is;
                # clean_up_text is applied only to the file-based corpora.
                d.metadata["technology"] = key
                d.metadata["file_name"] = d.doc_id
                cleaned_documents.append(d)
    else:
        # Each sub-directory of data_dir holds the files for one technology
        # (or technology pair) and its name becomes the metadata label.
        # sorted() makes the processing order deterministic, since glob does
        # not guarantee any order.
        for tech_dir in sorted(glob.glob(os.path.join(values["data_dir"], "*"))):
            # glob also returns plain files; SimpleDirectoryReader needs a directory.
            if not os.path.isdir(tech_dir):
                continue

            # basename(normpath(...)) is separator-agnostic and ignores a
            # trailing slash, unlike splitting on "/".
            technology_name = os.path.basename(os.path.normpath(tech_dir))
            print(technology_name)

            documents = SimpleDirectoryReader(tech_dir).load_data()

            # clean up documents and add technology name to metadata
            for d in documents:
                d.text = clean_up_text(d.text)
                d.metadata["technology"] = technology_name
                cleaned_documents.append(d)

    # define the ingestion pipeline
    # NOTE(review): re-running this cell appears to upsert the same content
    # again; confirm whether node ids are stable across runs before relying on
    # repeated execution.
    pipeline = IngestionPipeline(
        transformations=transformations,
        vector_store=vector_store
    )

    # run the ingestion pipeline
    pipeline.run(documents=cleaned_documents)

Indexing DOCUMENTATIONS
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings

Upserted vectors: 100%|██████████| 470/470 [00:11<00:00, 40.91it/s]


Indexing BLOG_POSTS
docker-compose_docker
maven_docker
maven_docker-compose
spring-boot_docker
spring-boot_docker-compose
spring-boot_maven
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Upserted vectors: 100%|██████████| 117/117 [00:03<00:00, 32.99it/s]


Indexing STACK_OVERFLOW_POSTS
docker-compose_maven
docker-compose_spring-boot
docker_docker-compose
docker_maven
docker_spring-boot
spring-boot_maven
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTT

Upserted vectors: 100%|██████████| 873/873 [00:20<00:00, 41.75it/s]
