In [1]:
## Data Ingestion
from langchain_community.document_loaders import TextLoader

# Read the local text file; load() yields a list of Document objects.
loader = TextLoader("test.txt")
text_documents = loader.load()
text_documents

[Document(metadata={'source': 'test.txt'}, page_content='In the ever-evolving field of machine learning, comparing model performance is more than just examining metrics — it’s about confidently determining whether one model truly outperforms another. The paired permutation test emerges as an essential tool in these scenarios, offering a rigorous method to assess whether observed differences in performance are statistically significant. Unlike traditional statistical tests, the paired permutation test is robust to data irregularities and designed to handle related samples, ensuring that your conclusions about model superiority are both reliable and sound.\n\nWhat is a Permutation Test in Machine Learning?\nA permutation test, also known as a randomization test, is a non-parametric statistical significance test. In machine learning, it’s often used to determine whether the difference in model performance (like accuracy, AUC, etc.) between two models or datasets is statistically significa

In [None]:
# Show the repr of the current loader object.
loader

In [None]:
from langchain_community.document_loaders import WebBaseLoader
import bs4

In [None]:
# Load the web page, limiting HTML parsing to elements with the "home-intro" class.
loader = WebBaseLoader(
    web_paths=('https://sanbuddhacharyas.github.io/',),
    bs_kwargs={'parse_only': bs4.SoupStrainer(class_='home-intro')},
)

text_document = loader.load()
text_document

In [None]:
# Load a PDF file with PyPDFLoader and display the loaded result.
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("CV_Professional_pure_data_science (4).pdf")
text_doc = loader.load()
text_doc

In [2]:
# Split the loaded text documents into overlapping chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
# `document` holds the list of chunked Document objects, not a single document.
document = text_splitter.split_documents(text_documents)
document

[Document(metadata={'source': 'test.txt'}, page_content='In the ever-evolving field of machine learning, comparing model performance is more than just examining metrics — it’s about confidently determining whether one model truly outperforms another. The paired permutation test emerges as an essential tool in these scenarios, offering a rigorous method to assess whether observed differences in performance are statistically significant. Unlike traditional statistical tests, the paired permutation test is robust to data irregularities and designed to handle related samples, ensuring that your conclusions about model superiority are both reliable and sound.'),
 Document(metadata={'source': 'test.txt'}, page_content='What is a Permutation Test in Machine Learning?\nA permutation test, also known as a randomization test, is a non-parametric statistical significance test. In machine learning, it’s often used to determine whether the difference in model performance (like accuracy, AUC, etc.) 

In [3]:
# Number of chunks produced by the text splitter.
len(document)

2

In [4]:
# Vector Embeddings and Vector store
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

In [5]:
# Embed every chunk with the default Ollama embedding model and index it in Chroma.
db = Chroma.from_documents(
    documents=document,
    embedding=OllamaEmbeddings(),
)

In [6]:
## Vector database
# similarity_search asks for k=4 results by default. When fewer chunks than
# that were indexed, Chroma logs "Number of requested results 4 is greater
# than number of elements in index" and shrinks the request itself. Cap k at
# the number of indexed chunks so the request never exceeds the index size.
query = "What is paired permutation"
result = db.similarity_search(query, k=min(4, len(document)))
result

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


[Document(metadata={'source': 'test.txt'}, page_content='In the ever-evolving field of machine learning, comparing model performance is more than just examining metrics — it’s about confidently determining whether one model truly outperforms another. The paired permutation test emerges as an essential tool in these scenarios, offering a rigorous method to assess whether observed differences in performance are statistically significant. Unlike traditional statistical tests, the paired permutation test is robust to data irregularities and designed to handle related samples, ensuring that your conclusions about model superiority are both reliable and sound.'),
 Document(metadata={'source': 'test.txt'}, page_content='What is a Permutation Test in Machine Learning?\nA permutation test, also known as a randomization test, is a non-parametric statistical significance test. In machine learning, it’s often used to determine whether the difference in model performance (like accuracy, AUC, etc.) 

In [7]:
# Text of the first document returned by the similarity search.
result[0].page_content

'In the ever-evolving field of machine learning, comparing model performance is more than just examining metrics — it’s about confidently determining whether one model truly outperforms another. The paired permutation test emerges as an essential tool in these scenarios, offering a rigorous method to assess whether observed differences in performance are statistically significant. Unlike traditional statistical tests, the paired permutation test is robust to data irregularities and designed to handle related samples, ensuring that your conclusions about model superiority are both reliable and sound.'

In [None]:
## FAISS Vector Database
from langchain_community.vectorstores import FAISS

# Build a FAISS index over the same chunks. Note: this rebinds `db`, so the
# Chroma-backed store created earlier is no longer reachable through that name.
db = FAISS.from_documents(
    documents=document,
    embedding=OllamaEmbeddings(),
)