### Data Ingestion or Data Loader
https://python.langchain.com/docs/integrations/document_loaders/

In [1]:
# Text Loader
# TextLoader reads a plain-text file and returns its contents as LangChain Document objects.
from langchain_community.document_loaders import TextLoader

# Point the loader at the local text file.
loader = TextLoader(file_path="data.txt")
# Read the file from disk.
documents = loader.load()
# Show the resulting list of Documents.
print(documents)

[Document(metadata={'source': 'data.txt'}, page_content='Retrieval-augmented generation (RAG) is a technique that enables large language models (LLMs) to retrieve and incorporate new information.[1] With RAG, LLMs do not respond to user queries until they refer to a specified set of documents. These documents supplement information from the LLM\'s pre-existing training data.[2] This allows LLMs to use domain-specific and/or updated information that is not available in the training data.[2][3] For example, this helps LLM-based chatbots access internal company data or generate responses based on authoritative sources.\n\nRAG improves large language models (LLMs) by incorporating information retrieval before generating responses.[4] Unlike traditional LLMs that rely on static training data, RAG pulls relevant text from databases, uploaded documents, or web sources.[1] According to Ars Technica, "RAG is a way of improving LLM performance, in essence by blending the LLM process with a web s

In [None]:
# Read a PDF File
# PyPDFLoader (from langchain_community.document_loaders) loads a PDF and extracts its
# text and metadata; the output below shows one Document per page.
from langchain_community.document_loaders import PyPDFLoader

# NOTE(review): "stroy.pdf" looks like a misspelling of "story.pdf", but the output shows
# the file on disk carries this exact name, so the path is left unchanged.
loader = PyPDFLoader(file_path="stroy.pdf")
# Parse the PDF.
documents = loader.load()
# Show the resulting list of Documents.
print(documents)

[Document(metadata={'producer': 'Acrobat 10.1.4', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2013-04-19T07:30:00+08:00', 'moddate': '2013-04-19T11:06:58+08:00', 'title': '', 'source': 'stroy.pdf', 'total_pages': 90, 'page': 0, 'page_label': '1'}, page_content='For students of English as a Foreign Language\nEdgar Allan Poe:\nstoryteller\namerican literary classics\nEdgar Allan Poe'), Document(metadata={'producer': 'Acrobat 10.1.4', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2013-04-19T07:30:00+08:00', 'moddate': '2013-04-19T11:06:58+08:00', 'title': '', 'source': 'stroy.pdf', 'total_pages': 90, 'page': 1, 'page_label': '2'}, page_content='Edgar Allan Poe:\nstoryteller\np\nSeven Stories  \nAdapted from \nEdgar Allan Poe\np\nA Ladder Edition at the 4,000-word level\np'), Document(metadata={'producer': 'Acrobat 10.1.4', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2013-04-19T07:30:00+08:00', 'moddate': '2013-04-19T11:06:58+08:00', 'tit

In [None]:
## Web based loader
# WebBaseLoader (from langchain_community.document_loaders) fetches a web page and returns
# its text as LangChain Documents. Here it loads the Wikipedia article on "Prompt engineering".
from langchain_community.document_loaders import WebBaseLoader

# Point the loader at the article URL.
loader = WebBaseLoader(web_path="https://en.wikipedia.org/wiki/Prompt_engineering")
# Fetch and parse the page.
documents = loader.load()

# Show the resulting list of Documents.
print(documents)

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Prompt_engineering', 'title': 'Prompt engineering - Wikipedia', 'language': 'en'}, page_content='\n\n\n\nPrompt engineering - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top

In [None]:
## Web based loader using SoupStrainer
# WebBaseLoader can be combined with BeautifulSoup's `SoupStrainer` to load only part of a page.
# The strainer filters the HTML (for example by class or id), so the loaded content is limited
# to the section of interest instead of the entire page.

from langchain_community.document_loaders import WebBaseLoader
import bs4

# Restrict parsing to elements carrying one of these CSS classes.
content_filter = bs4.SoupStrainer(class_=("post-content", "markdown-body", "markdown"))
page_loader = WebBaseLoader(
    web_path=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": content_filter},
)
# Note: `loader` holds the list of Documents returned by .load(), not the loader object.
loader = page_loader.load()
# Show the resulting list of Documents.
print(loader)

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final results.\n\n\nMemory\n\nShort-term memory: I would consi

In [12]:
# Arxiv Loader
# ArxivLoader is the LangChain document loader for the arXiv preprint repository.
# It fetches papers matching a query and returns them as Documents; the metadata in the
# output below includes the published date, title, authors and summary.
from langchain_community.document_loaders import ArxivLoader

# NOTE(review): "1706.03798" resolves to an astronomy paper (see the output below);
# confirm this is the intended arXiv ID.
arxiv_loader = ArxivLoader(query="1706.03798", load_max_docs=3)
# Note: `loader` holds the list of Documents returned by .load(), not the loader object.
loader = arxiv_loader.load()
print(loader)

[Document(metadata={'Published': '2017-06-12', 'Title': 'Ten Billion Years of Brightest Cluster Galaxy Alignments', 'Authors': 'Michael J. West, Roberto De Propris, Malcolm N. Bremer, Steven Phillipps', 'Summary': "A galaxy's orientation is one of its most basic observable properties.\nAstronomers once assumed that galaxies are randomly oriented in space, however\nit is now clear that some have preferred orientations with respect to their\nsurroundings. Chief among these are giant elliptical galaxies found in the\ncenters of rich galaxy clusters. Numerous studies have shown that the major\naxes of these galaxies often share the same orientation as the surrounding\nmatter distribution on larger scales. Using Hubble Space Telescope observations\nof 65 distant galaxy clusters, we show for the first time that similar\nalignments are seen at earlier epochs when the universe was only one-third its\ncurrent age. These results suggest that the brightest galaxies in clusters are\nthe product of

In [13]:
# Count the Documents in `loader`, which the ArxivLoader cell bound to the result of .load().
len(loader)

1

In [14]:
## Read a CSV File
# CSVLoader reads a CSV file; the output below shows one Document per row, with the
# row index recorded in each Document's metadata.
from langchain_community.document_loaders import CSVLoader

# Point the loader at the local CSV file.
loader = CSVLoader("data.csv")
# Read the file from disk.
documents = loader.load()
# Show the resulting list of Documents.
print(documents)

[Document(metadata={'source': 'data.csv', 'row': 0}, page_content='Industry: Accounting/Finance'), Document(metadata={'source': 'data.csv', 'row': 1}, page_content='Industry: Advertising/Public Relations'), Document(metadata={'source': 'data.csv', 'row': 2}, page_content='Industry: Aerospace/Aviation'), Document(metadata={'source': 'data.csv', 'row': 3}, page_content='Industry: Arts/Entertainment/Publishing'), Document(metadata={'source': 'data.csv', 'row': 4}, page_content='Industry: Automotive'), Document(metadata={'source': 'data.csv', 'row': 5}, page_content='Industry: Banking/Mortgage'), Document(metadata={'source': 'data.csv', 'row': 6}, page_content='Industry: Business Development'), Document(metadata={'source': 'data.csv', 'row': 7}, page_content='Industry: Business Opportunity'), Document(metadata={'source': 'data.csv', 'row': 8}, page_content='Industry: Clerical/Administrative'), Document(metadata={'source': 'data.csv', 'row': 9}, page_content='Industry: Construction/Faciliti

In [15]:
## Wikipedia Loader
# WikipediaLoader is the LangChain document loader for Wikipedia. It searches Wikipedia
# for the given query and returns the matching articles as Documents.
from langchain_community.document_loaders import WikipediaLoader

# Ask for at most three articles matching the query.
wiki_loader = WikipediaLoader(query="Generative AI", load_max_docs=3)
docs = wiki_loader.load()
print(docs)



Supply your own Notion API key and database ID to see the output of the code snippet below.

In [None]:
# NotionDB Loader
# NotionDBLoader is the LangChain document loader for Notion databases. Given an integration
# token and a database ID, it fetches the database's pages and returns them as Documents.
import os

from langchain_community.document_loaders import NotionDBLoader

# Credentials are read from the environment so real keys never have to be written into the
# notebook; the original placeholders are used only when the variables are unset.
notion_loader = NotionDBLoader(
    # The constructor's parameter is `integration_token`; passing `notion_api_key`
    # raises a TypeError before any request is made.
    integration_token=os.environ.get("NOTION_INTEGRATION_TOKEN", "your_notion_api_key"),
    database_id=os.environ.get("NOTION_DATABASE_ID", "your_database_id"),
)
# `loader` keeps its original binding: the list of Documents returned by .load().
loader = notion_loader.load()
# Print the loaded documents
print(loader)