In [18]:
import nest_asyncio
nest_asyncio.apply()

In [139]:
from llama_index.readers.github import GithubRepositoryReader, GithubClient

github_token = "github_pat_..."

client = github_client = GithubClient(github_token=github_token, verbose=False)

reader = GithubRepositoryReader(
    github_client=github_client,
    owner="streamlit",
    repo="docs",
    use_parser=False,
    verbose=True,
    filter_directories=(
        ["content"],
        GithubRepositoryReader.FilterType.INCLUDE,
    ),
    filter_file_extensions=(
        [".md"],
        GithubRepositoryReader.FilterType.INCLUDE,
    )
)

In [19]:

documents = reader.load_data(branch="main")

current path: 
tree data: GitTreeResponseModel(sha='7e321edfb0977623e91faca009c6289988e08b56', url='https://api.github.com/repos/streamlit/docs/git/trees/7e321edfb0977623e91faca009c6289988e08b56', tree=[GitTreeResponseModel.GitTreeObject(path='.devcontainer', mode='040000', type='tree', sha='315fff0634bd9d935113f8e509abf6115b8aaa44', url='https://api.github.com/repos/streamlit/docs/git/trees/315fff0634bd9d935113f8e509abf6115b8aaa44', size=None), GitTreeResponseModel.GitTreeObject(path='.github', mode='040000', type='tree', sha='17a7444b275235e6d12f6badbad3bf6ad2590c5b', url='https://api.github.com/repos/streamlit/docs/git/trees/17a7444b275235e6d12f6badbad3bf6ad2590c5b', size=None), GitTreeResponseModel.GitTreeObject(path='.gitignore', mode='100644', type='blob', sha='2d7f31fab6c96ac8bd941545ff731254c80feda6', url='https://api.github.com/repos/streamlit/docs/git/blobs/2d7f31fab6c96ac8bd941545ff731254c80feda6', size=1728), GitTreeResponseModel.GitTreeObject(path='.husky', mode='040000', 

In [40]:
import re

def clean_up_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    :param content: Text input.
    
    :return: Cleaned version of original text input.
    """

    # Fix hyphenated words broken by newline
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    unwanted_patterns = ['---\nvisible: false','---', '#','slug:']
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Remove all slugs starting with a \ and stopping at the first space
    content = re.sub(r'\\slug: [^\s]*', '', content)

    # normalize whitespace
    content = re.sub(r'\s+', ' ', content)
    return content

In [42]:
cleaned_documents = []

for d in documents:
    cleaned_text = clean_up_text(d.text)
    d.text = cleaned_text
    cleaned_documents.append(d)

In [93]:
from pinecone import Pinecone, ServerlessSpec
pinecone_api_key = "..."
pc = Pinecone(api_key=pinecone_api_key)


index_name = "streamlit-docs"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 

pinecone_index = pc.Index(index_name)

In [94]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding("Snowflake/snowflake-arctic-embed-m")

from llama_index.core.node_parser import SemanticSplitterNodeParser

splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
)

In [95]:
nodes = splitter.get_nodes_from_documents(cleaned_documents)

In [96]:
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

# setup our storage (vector db)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store
)

index = VectorStoreIndex.from_documents(cleaned_documents, storage_context=storage_context, embed_model=embed_model)


[A
[A
[A
[A
[A
[A
Upserted vectors: 100%|██████████| 566/566 [00:04<00:00, 126.38it/s]
