In [41]:
import numpy as np
import pandas as pd
import json
from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

In [2]:
# Corpus for this notebook: the "train" split of `not-lain/wikipedia`.
# Alternative (disabled): "jamescalam/llama-2-arxiv-papers-chunked",
# loaded with the same split.
dataset = load_dataset(path="not-lain/wikipedia", split="train")

In [17]:
# Spot-check a single record; the output below shows the fields
# 'id', 'url', 'title' and 'text'.
dataset[4]

{'id': '305',
 'url': 'https://en.wikipedia.org/wiki/Achilles',
 'title': 'Achilles',
 'text': 'In Greek mythology, Achilles ( ) or Achilleus () was a hero of the Trojan War who was known as being the greatest of all the Greek warriors. A central character in Homer\'s Iliad, he was the son of the Nereid Thetis and Peleus, king of Phthia and famous Argonaut. Achilles was raised in Phthia along his childhood companion Patroclus and received his education by the centaur Chiron. In the Iliad, he is presented as the commander of the mythical tribe of the Myrmidons. \n\nAchilles\' most notable feat during the Trojan War was the slaying of the Trojan prince Hector outside the gates of Troy. Although the death of Achilles is not presented in the Iliad, other sources concur that he was killed near the end of the Trojan War by Paris, who shot him with an arrow. Later legends (beginning with Statius\' unfinished epic Achilleid, written in the 1st century AD) state that Achilles was invulnerable i

In [25]:
# Walk the dataset, collecting each article body together with a one-key
# metadata dict carrying its title. The two lists stay index-aligned:
# metadatas[i] describes articles[i].
articles = []
metadatas = []
for record in dataset:
    articles.append(record["text"])
    metadatas.append({"title": record["title"]})

In [26]:
# Chunking policy for the article text.
# NOTE(review): sizes are presumably counted in characters (no custom length
# function is passed). An earlier comment estimated ~350–450 tokens per chunk;
# at the common ~4 characters/token heuristic, 1,024 characters is closer to
# ~250 tokens — verify with the tokenizer actually in use.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],  # coarsest break first: paragraph, line, word, character
    chunk_size=1_024,  # upper bound on the length of a chunk
    chunk_overlap=100,  # text shared between neighbouring chunks to keep some context
)

In [27]:
# Split every article into chunk documents; each metadata dict is paired with
# the article at the same list position.
docs = splitter.create_documents(texts=articles, metadatas=metadatas)

In [44]:
# NOTE: this rebinds `HuggingFaceEmbeddings`, replacing the class of the same
# name imported from `langchain_community.embeddings` in the first cell.
# Cells below use whichever import ran last.
from langchain_huggingface import HuggingFaceEmbeddings

In [45]:
# Embedding model handed to the vector store below.
# The model is named explicitly instead of relying on the class default
# (this value is the documented default of
# `langchain_huggingface.HuggingFaceEmbeddings`), so that vectors persisted
# to disk stay compatible with later sessions even if the library's default
# changes.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [46]:
# Embed every chunk and build a Chroma store whose persistence directory is
# "database" (relative to the working directory).
# NOTE(review): presumably re-running this cell against an existing
# directory adds to whatever is already stored there — confirm, and clear
# the directory when switching datasets.
vectorstore = Chroma.from_documents(
    documents=docs,
    persist_directory="database",
    embedding=embedding,
)

In [47]:
# Ask the store to write itself to its persistence directory.
# NOTE(review): the recorded output for this cell looks like the tail of a
# warning pointing at this line. Recent Chroma versions persist automatically
# and LangChain marks this call as deprecated — confirm against the installed
# versions and drop the call if it is redundant.
vectorstore.persist()

  vectorstore.persist()


In [48]:
# Retriever view over the store; each query asks for up to 10 chunks.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=10))

In [49]:
# Smoke-test the retriever with a one-word query and display the hits.
d = retriever.invoke(input="Achilles")
d

[Document(metadata={'title': 'Achilles'}, page_content="In Greek mythology, Achilles ( ) or Achilleus () was a hero of the Trojan War who was known as being the greatest of all the Greek warriors. A central character in Homer's Iliad, he was the son of the Nereid Thetis and Peleus, king of Phthia and famous Argonaut. Achilles was raised in Phthia along his childhood companion Patroclus and received his education by the centaur Chiron. In the Iliad, he is presented as the commander of the mythical tribe of the Myrmidons."),
 Document(metadata={'title': 'Achilles'}, page_content='Achilles\' most notable feat during the Trojan War was the slaying of the Trojan prince Hector outside the gates of Troy. Although the death of Achilles is not presented in the Iliad, other sources concur that he was killed near the end of the Trojan War by Paris, who shot him with an arrow. Later legends (beginning with Statius\' unfinished epic Achilleid, written in the 1st century AD) state that Achilles was 

In [37]:
# Print the text of the hit at index 9, i.e. the last one when the retriever
# returns all k=10 results (raises IndexError if fewer come back).
# NOTE(review): this cell's execution count (37) is lower than those of the
# cells that build the current store and `d` (46–49), and the text shown
# below is about video object detection rather than Achilles — the recorded
# output looks stale; re-run after the query cell.
print(d[9].page_content)

motion, which is learned to exclusively beneﬁt a video object detection task. In
our supplementary material, we include more of such results in the video form.
In Figure 6, we also illustrate object detections of the static SSN baseline,
and those of our full STSN model (zoom-in to see the probabilities and class
predictions). In all of these cases, we observe that incorporating temporal information helps STSN to correct the mistakes made by the static baseline. For
instance, in the third row of Figure 6, a static SSN baseline incorrectly labels
an object in the reference frame as a bird, which happens due to the occluded
head of the lizard. However, STSN ﬁxes this mistake by looking at the supporting frames, and by sampling around the lizard body and its head (See Row 3,
Column 1 in Figure 6). Furthermore, in the last row, a static SSN baseline fails
to detect one of the bicycles because it is occluded in the reference frame. STSN
ﬁxes this error, by sampling around the missed bicycle

In [54]:
# NOTE(review): `documents` is not assigned in any cell visible in this
# notebook, so this presumably relies on leftover kernel state (or on a cell
# outside this view) — confirm, or switch to a name defined above.
# The slice also displays up to 300 items at once; a much smaller slice
# would keep the notebook readable.
documents[-300:]

['We observe that after pretraining on 2T Tokens, the models still did not show any sign of saturation.\nTokenizer. Weusethesametokenizeras L/l.sc/a.sc/m.sc/a.sc /one.taboldstyle;itemploysabytepairencoding(BPE)algorithm(Sennrich\netal.,2016)usingtheimplementationfromSentencePiece(KudoandRichardson,2018). Aswith L/l.sc/a.sc/m.sc/a.sc /one.taboldstyle,\nwe split all numbers into individual digits and use bytes to decompose unknown UTF-8 characters. The total\nvocabulary size is 32k tokens.\n2.2.1 Training Hardware & Carbon Footprint\nTrainingHardware. WepretrainedourmodelsonMeta’sResearchSuperCluster(RSC)(LeeandSengupta,\n2022)aswellasinternalproductionclusters. BothclustersuseNVIDIAA100s. Therearetwokeydiﬀerences\nbetween the two clusters, with the ﬁrst being the type of interconnect available: RSC uses NVIDIA Quantum\nInﬁniBandwhileourproductionclusterisequippedwithaRoCE(RDMAoverconvergedEthernet)solution',
 'InﬁniBandwhileourproductionclusterisequippedwithaRoCE(RDMAoverconvergedEthern