## Data Structure

In [1]:
# Show the layout of the input corpus under data/ (shells out to the external `tree` command).
!tree data/

data/
├── llm-papers
│   └── openai-paper.pdf
├── ssi-docs
│   ├── boe-ssi.pdf
│   └── db-ssi.pdf
└── system-documentation
    ├── pandas-basics.html
    ├── pandas-cookbook.html
    └── pandas-ten-minutes.html

3 directories, 6 files


### Imports

In [2]:
# Display the value of every bare expression in a cell, not only the last one.
# The inspection cells below rely on this to show both a length and a sample
# Document from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import chromadb

from langchain_community.document_loaders import BSHTMLLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.ollama import OllamaEmbeddings #uses llama2 embeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

## system-documentation

In [4]:
# Read the pandas "basics" HTML page into LangChain documents.
loader = BSHTMLLoader(file_path="data/system-documentation/pandas-basics.html")
data = loader.load()

# Split the loaded page with chunk_size=1000 and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(data)

In [5]:
# Sanity check: how many chunks the splitter produced, then the first chunk.
# Both values are displayed because ast_node_interactivity is set to "all".
len(all_splits)
all_splits[0]


131

Document(page_content='Essential basic functionality — pandas 2.2.2 documentation\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to main content\n\n\n\n    Back to top\n  \n\n\n\n\n\n\n\n\n\n\nCtrl+K\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n    Site Navigation\n  \n\n\n\n                        Getting started\n                      \n\n\n\n                        User Guide\n                      \n\n\n\n                        API reference\n                      \n\n\n\n                        Development\n                      \n\n\n\n                        Release notes\n                      \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGitHub\n\n\n\nTwitter\n\n\n\nMastodon\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n    Site Navigation\n  \n\n\n\n                        Getting started\n                      \n\n\n\n                        User Guide\n                      \n\n\n\n                        API reference\n                      \n\n\n\n         

In [6]:
# Embed the pandas-basics chunks and persist them in the "system-documentation"
# collection of the on-disk Chroma store.
#
# Deterministic ids keep this cell idempotent: without `ids`, LangChain assigns a
# random UUID to every chunk, so each re-run of the notebook would append another
# full copy of the chunks to the persisted collection. With stable ids a re-run
# overwrites the existing entries instead.
# NOTE(review): if the chunk count ever shrinks, entries with higher indices from
# an earlier run remain in the collection and must be deleted separately.
system_doc_ids = [f"pandas-basics.html-{i}" for i in range(len(all_splits))]

vectorstore = Chroma.from_documents(
    documents=all_splits,
    ids=system_doc_ids,
    collection_name="system-documentation",
    embedding=OllamaEmbeddings(),
    collection_metadata={"topic":"system documentation","source": "pandas-basics.html"},
    persist_directory="./chroma_db_test"
)

## ssi-docs

In [7]:
# Load the Deutsche Bank SSI PDF and split it into documents using the
# loader's default splitting.
ssi_loader = PyPDFLoader(file_path="data/ssi-docs/db-ssi.pdf")
ssi_pages = ssi_loader.load_and_split()

In [8]:
# Sanity check: number of documents produced from the SSI PDF, then the first one.
# Both values are displayed because ast_node_interactivity is set to "all".
len(ssi_pages)
ssi_pages[0]

10

Document(page_content='Deutsche Bank AG, Frankfurt Cash Equities    \n                           \n \n  \n1 | P a g e   \nJanuary 2021  \n For internal use only                         Standard Settlement Instructions   \n  \nDEUTDEFFEEQ  is the Beneficiary BIC and 7LTWFZYICNSX8D621K86  is the Legal Entity  \nIdentifier (LEI) for all the following accounts   \n  \n  \n  \nCOUNTRY                SETTLEMENT INSTRUCTIONS                      SWIFT CODE   \n  \nARGENTINA                    Industrial & Commercial Bank of China (ARG)                        ICBKARBAGCD                                                                   \nA/C 367963 (stock)  \n                                         Place of settlement = CAVLARBAXXX  \n                                      \n  \n  \nAUSTRALIA                    National Australian Bank                                                            NATAAU3303X                                          \nCHESS PID 20006  \n                 Austraclea

In [9]:
# Embed the SSI pages and persist them in the "ssi-docs" collection of the same
# on-disk Chroma store. This rebinds `vectorstore`, which until now referred to
# the "system-documentation" collection.
#
# Deterministic ids keep this cell idempotent: without `ids`, LangChain assigns a
# random UUID to every document, so each re-run of the notebook would append
# another full copy of the pages to the persisted collection. With stable ids a
# re-run overwrites the existing entries instead.
# NOTE(review): if the page count ever shrinks, entries with higher indices from
# an earlier run remain in the collection and must be deleted separately.
ssi_doc_ids = [f"db-ssi.pdf-{i}" for i in range(len(ssi_pages))]

vectorstore = Chroma.from_documents(
    documents=ssi_pages,
    ids=ssi_doc_ids,
    collection_name="ssi-docs",
    embedding=OllamaEmbeddings(),
    collection_metadata={"topic":"ssi docs","source": "db-ssi.pdf"},
    persist_directory="./chroma_db_test"
)

## llm-papers

In [10]:
# Load the LLM paper PDF and split it into documents using the loader's
# default splitting.
llm_loader = PyPDFLoader(file_path="data/llm-papers/openai-paper.pdf")
llm_pages = llm_loader.load_and_split()

In [11]:
# Sanity check: number of documents produced from the paper, then the first one.
# Both values are displayed because ast_node_interactivity is set to "all".
len(llm_pages)
llm_pages[0]

192

Document(page_content='Sparks of Artiﬁcial General Intelligence:\nEarly experiments with GPT-4\nS´ ebastien Bubeck Varun Chandrasekaran Ronen Eldan Johannes Gehrke\nEric Horvitz Ece Kamar Peter Lee Yin Tat Lee Yuanzhi Li Scott Lundberg\nHarsha Nori Hamid Palangi Marco Tulio Ribeiro Yi Zhang\nMicrosoft Research\nAbstract\nArtiﬁcial intelligence (AI) researchers have been developing and reﬁning large language models (LLMs)\nthat exhibit remarkable capabilities across a variety of domains and tasks, challenging our understanding\nof learning and cognition. The latest model developed by OpenAI, GPT-4 [Ope23], was trained using an\nunprecedented scale of compute and data. In this paper, we report on our investigation of an early version\nof GPT-4, when it was still in active development by OpenAI. We contend that (this early version of) GPT-\n4 is part of a new cohort of LLMs (along with ChatGPT and Google’s PaLM for example) that exhibit\nmore general intelligence than previous AI models. 

In [12]:
# Embed the paper's pages and persist them in the "llm-papers" collection of the
# same on-disk Chroma store. This rebinds `vectorstore`, which until now referred
# to the "ssi-docs" collection.
#
# Deterministic ids keep this cell idempotent: without `ids`, LangChain assigns a
# random UUID to every document, so each re-run of the notebook would append
# another full copy of the pages to the persisted collection. With stable ids a
# re-run overwrites the existing entries instead.
# NOTE(review): if the page count ever shrinks, entries with higher indices from
# an earlier run remain in the collection and must be deleted separately.
llm_doc_ids = [f"openai-paper.pdf-{i}" for i in range(len(llm_pages))]

vectorstore = Chroma.from_documents(
    documents=llm_pages,
    ids=llm_doc_ids,
    collection_name="llm-papers",
    embedding=OllamaEmbeddings(),
    collection_metadata={"topic":"llm papers","source": "openai-paper.pdf"},
    persist_directory="./chroma_db_test"
)

### Check Collections

In [14]:
# Show what Chroma wrote to the persist directory (shells out to the external `tree` command).
!tree chroma_db_test/

chroma_db_test/
├── 25a646c7-4f14-470b-95a5-80304c9f1d6f
│   ├── data_level0.bin
│   ├── header.bin
│   ├── length.bin
│   └── link_lists.bin
├── 5783fbcf-95ef-40d5-9ef2-57935e07ecb2
│   ├── data_level0.bin
│   ├── header.bin
│   ├── length.bin
│   └── link_lists.bin
├── chroma.sqlite3
└── e581d79b-7adc-45ee-b8e6-635022156c2e
    ├── data_level0.bin
    ├── header.bin
    ├── length.bin
    └── link_lists.bin

3 directories, 13 files


In [13]:
# Re-open the persisted store directly through the chromadb client (bypassing the
# LangChain wrapper) to confirm which collections were written to disk.
# `chromadb` is imported in the notebook's import cell with the other dependencies.
persistent_client = chromadb.PersistentClient(path="./chroma_db_test")

# Last expression of the cell: displays the collections found in the store.
persistent_client.list_collections()

[Collection(name=system-documentation),
 Collection(name=llm-papers),
 Collection(name=ssi-docs)]