# LangChain - Document Loaders

## CSV Loader

In [1]:
from langchain.document_loaders import CSVLoader

In [2]:
# Point a CSVLoader at the penguins CSV. The path is relative, so it is
# resolved against the notebook's current working directory.
csv_loader = CSVLoader('some_data/penguins.csv')

In [3]:
# Read the CSV into a list of Document objects -- one per CSV row, judging by
# the 'row' index each Document carries in its metadata.
data = csv_loader.load()

In [4]:
# Display the loaded Documents. Each page_content holds one row rendered as
# "column: value" lines; metadata records the source path and the row index.
# NOTE(review): this echoes every row -- consider a slice such as data[:5]
# to keep the saved output short.
data

[Document(page_content='species: Adelie\nisland: Torgersen\nbill_length_mm: 39.1\nbill_depth_mm: 18.7\nflipper_length_mm: 181\nbody_mass_g: 3750\nsex: MALE', metadata={'source': 'some_data/penguins.csv', 'row': 0}),
 Document(page_content='species: Adelie\nisland: Torgersen\nbill_length_mm: 39.5\nbill_depth_mm: 17.4\nflipper_length_mm: 186\nbody_mass_g: 3800\nsex: FEMALE', metadata={'source': 'some_data/penguins.csv', 'row': 1}),
 Document(page_content='species: Adelie\nisland: Torgersen\nbill_length_mm: 40.3\nbill_depth_mm: 18\nflipper_length_mm: 195\nbody_mass_g: 3250\nsex: FEMALE', metadata={'source': 'some_data/penguins.csv', 'row': 2}),
 Document(page_content='species: Adelie\nisland: Torgersen\nbill_length_mm: \nbill_depth_mm: \nflipper_length_mm: \nbody_mass_g: \nsex: ', metadata={'source': 'some_data/penguins.csv', 'row': 3}),
 Document(page_content='species: Adelie\nisland: Torgersen\nbill_length_mm: 36.7\nbill_depth_mm: 19.3\nflipper_length_mm: 193\nbody_mass_g: 3450\nsex: FE

In [5]:
# Confirm that load() handed back a plain Python list.
type(data)

list

In [6]:
# Each element of that list is a LangChain Document object.
type(data[0])

langchain.schema.document.Document

In [7]:
# Inspect the first Document (CSV row 0): the row text lives in page_content,
# the provenance (source file and row index) in metadata.
data[0]

Document(page_content='species: Adelie\nisland: Torgersen\nbill_length_mm: 39.1\nbill_depth_mm: 18.7\nflipper_length_mm: 181\nbody_mass_g: 3750\nsex: MALE', metadata={'source': 'some_data/penguins.csv', 'row': 0})

In [8]:
# The raw page_content string: the "column: value" pairs are joined with
# newline characters, which the repr shows escaped as \n.
data[0].page_content

'species: Adelie\nisland: Torgersen\nbill_length_mm: 39.1\nbill_depth_mm: 18.7\nflipper_length_mm: 181\nbody_mass_g: 3750\nsex: MALE'

In [9]:
# print() renders those newlines, giving one "column: value" pair per line.
print(data[0].page_content)

species: Adelie
island: Torgersen
bill_length_mm: 39.1
bill_depth_mm: 18.7
flipper_length_mm: 181
body_mass_g: 3750
sex: MALE


In [10]:
# Row 3 is missing its measurements and sex: those columns are still listed,
# just with nothing after the column name, rather than being dropped.
print(data[3].page_content)

species: Adelie
island: Torgersen
bill_length_mm: 
bill_depth_mm: 
flipper_length_mm: 
body_mass_g: 
sex: 


In [11]:
# metadata is a dict holding the source file path and the row index
# (the first Document is row 0, so data[2] reports row 2).
print(data[2].metadata)

{'source': 'some_data/penguins.csv', 'row': 2}


## HTML Loader

In [12]:
from langchain.document_loaders import BSHTMLLoader

In [13]:
# Point a BSHTMLLoader at a local HTML file (relative path).
# NOTE(review): presumably needs BeautifulSoup (bs4) installed -- verify.
html_loader = BSHTMLLoader('some_data/some_website.html')

In [14]:
# Load the HTML file into Documents.
# NOTE: this rebinds `data`, so the CSV Documents loaded earlier in the
# notebook are no longer reachable under that name.
data = html_loader.load()

In [15]:
# The result is a one-element list: a single Document for the file, whose
# metadata holds the source path and the page title (empty for this file).
data

[Document(page_content='Heading 1', metadata={'source': 'some_data/some_website.html', 'title': ''})]

In [16]:
# Pull out the single Document produced from the HTML file.
data[0]

Document(page_content='Heading 1', metadata={'source': 'some_data/some_website.html', 'title': ''})

In [17]:
# page_content holds the text taken from the page -- just 'Heading 1' here.
data[0].page_content

'Heading 1'

## PDF Loader

In [18]:
from langchain.document_loaders import PyPDFLoader

In [19]:
# Point a PyPDFLoader at a local PDF file (relative path).
# NOTE(review): presumably needs the pypdf package installed -- verify.
pdf_loader = PyPDFLoader('some_data/SomeReport.pdf')

In [20]:
# Load the PDF into a list of Documents. Each one carries a 'page' index in
# its metadata; this file yields a single Document, page 0.
pages = pdf_loader.load()

In [21]:
# Display the loaded page Documents (text in page_content, source path and
# page index in metadata).
pages

[Document(page_content='This\nis\nthe\nfirst\nline\nPDF.\nThis\nis\nthe\nsecond\nline\nin\nthe\nPDF.\nThis\nis\nthe\nthird\nline\nin\nthe\nPDF.', metadata={'source': 'some_data/SomeReport.pdf', 'page': 0})]

In [22]:
# Print the text of page 0. The extracted text has a newline after every
# word, so each word lands on its own line.
print(pages[0].page_content)

This
is
the
first
line
PDF.
This
is
the
second
line
in
the
PDF.
This
is
the
third
line
in
the
PDF.


In [23]:
# Swap each newline for a space so the words read as sentences again.
print(pages[0].page_content.replace('\n', ' '))

This is the first line PDF. This is the second line in the PDF. This is the third line in the PDF.
