In [1]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import dotenv_values

# Registry of the document sources handled by this notebook.
# Each entry maps a document-type label to:
#   index_name - name of the Pinecone index that stores its vectors
#   data_dir   - directory (relative to this notebook) holding the raw files
document_types = dict(
    DOCUMENTATIONS=dict(
        index_name="technology-docs",
        data_dir="../../data/tech_docs",
    ),
    BLOG_POSTS=dict(
        index_name="blog-posts",
        data_dir="../../data/blog_posts",
    ),
    STACK_OVERFLOW_POSTS=dict(
        index_name="so-posts",
        data_dir="../../data/so_posts",
    ),
    WEB_SEARCH=dict(
        index_name="web-search",
        data_dir="../../data/web-search",
    ),
)

  from tqdm.autonotebook import tqdm


In [2]:
# Read key/value settings from the project-level .env file.
config = dotenv_values("../../.env")

# Pinecone client authenticated with the API key from that file.
pinecone_api_key = config["PINECONE_API_KEY"]
pc = Pinecone(api_key=pinecone_api_key)

# Names of the indexes that already exist in the Pinecone project; the
# index-creation cell consults this list to decide what still has to be made.
existing_indexes = pc.list_indexes()
indexes = existing_indexes.names()
print(indexes)

['blog-posts', 'technology-docs', 'so-posts']


In [3]:
# Create a serverless Pinecone index for every document type that does not
# have one yet (according to the `indexes` snapshot taken earlier).
for document_type, values in document_types.items():
    if values["index_name"] in indexes:
        # Index already exists - nothing to create.
        continue

    print(f"Create index for {document_type}")
    index_name = values["index_name"]
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        # NOTE(review): must equal the embedding vector size; 1536 presumably
        # matches the OpenAI embedding model used for ingestion - confirm.
        dimension=1536,
        metric="cosine",
        spec=serverless_spec,
    )

In [4]:
import re

def clean_up_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    Steps, in order:
      1. Re-join words that were hyphenated across a line break.
      2. Delete decorative em-dash runs, literal backslash-u escape text
         (a backslash, ``u`` and four hex digits) and the two private-use
         code points U+F075 and U+F0B7.
      3. Remove whitespace around a hyphen that sits between two word
         characters.
      4. Collapse every run of whitespace (newlines included) into a single
         space.

    :param content: Text input.

    :return: Cleaned version of original text input.
    """

    # Fix hyphenated words broken by newline (Windows "\r\n" endings included)
    content = re.sub(r'(\w+)-\r?\n(\w+)', r'\1\2', content)

    # Remove specific unwanted patterns and characters.
    # Newlines are intentionally NOT deleted here: removing them outright
    # glued the last word of a line to the first word of the next one
    # ("hello\nworld" -> "helloworld"). They are turned into a single space
    # by the whitespace normalization at the end instead.
    unwanted_patterns = [
        "  —", "——————————", "—————————", "—————",
        r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
    ]
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Fix improperly spaced hyphenated words and normalize whitespace
    content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
    content = re.sub(r'\s+', ' ', content)

    return content

In [5]:
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import SimpleDirectoryReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pathlib import Path
import glob
import uuid
import os


# Embedding model used twice below: by the semantic splitter and as the final
# transformation of the ingestion pipeline.
embed_model = OpenAIEmbedding(api_key=config["OPENAI_KEY"])

# Only these document types are ingested by this cell; every other entry of
# `document_types` is skipped. Extend the set to (re)index more sources.
DOCUMENT_TYPES_TO_INDEX = {"STACK_OVERFLOW_POSTS"}

# add data
for document_type, values in document_types.items():
    if document_type not in DOCUMENT_TYPES_TO_INDEX:
        continue

    print("Indexing", document_type)

    # Validate the data directory first so a missing directory is detected
    # before any Pinecone objects are created.
    data_dir = values["data_dir"]
    if not os.path.exists(data_dir):
        print(f"Skipping {document_type}: data directory {data_dir!r} not found")
        continue

    pinecone_index = pc.Index(values["index_name"])
    # Uncomment to wipe the index before re-ingesting.
    # NOTE(review): confirm the keyword against the installed pinecone client.
    #pinecone_index.delete(delete_all=True)

    # Initialize VectorStore
    vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

    cleaned_documents = []

    # One sub-directory per technology. Without recursive=True a "**" pattern
    # behaves like "*", so "*" lists the same direct children; sorting makes
    # the processing order deterministic.
    for technology_dir in sorted(glob.glob(os.path.join(data_dir, "*"))):
        # Only directories can be handed to SimpleDirectoryReader; skip any
        # stray file sitting next to the technology folders.
        if not os.path.isdir(technology_dir):
            continue

        # Path(...).name is separator-agnostic, unlike splitting on "/".
        technology_name = Path(technology_dir).name
        print(technology_name)

        documents = SimpleDirectoryReader(technology_dir).load_data()

        # clean up documents and add technology name to metadata
        for document in documents:
            document.text = clean_up_text(document.text)
            document.metadata["technology"] = technology_name
            cleaned_documents.append(document)

    # Nothing collected -> no point in building and running a pipeline.
    if not cleaned_documents:
        print(f"No documents found for {document_type}; nothing to ingest")
        continue

    # define the ingestion pipeline: split semantically, embed, then write
    # the resulting nodes to this document type's vector store
    pipeline = IngestionPipeline(
        transformations=[
            SemanticSplitterNodeParser(
                buffer_size=1,
                breakpoint_percentile_threshold=95,
                embed_model=embed_model,
            ),
            embed_model,
        ],
        vector_store=vector_store,
    )

    # run the ingestion pipeline
    pipeline.run(documents=cleaned_documents)

    

Indexing STACK_OVERFLOW_POSTS
docker-compose_maven
docker-compose_spring-boot
docker_docker-compose
docker_maven
docker_spring-boot
spring-boot_maven


Upserted vectors: 100%|██████████| 1130/1130 [00:25<00:00, 44.00it/s]
