# 2.1 Document Loading

## Setup

### Install dependencies

In [None]:
%pip install python-dotenv~=1.0 docarray~=0.40.0 pypdf~=5.1 --upgrade --quiet
%pip install "unstructured[md]~=0.16.5" nltk~=3.9 pypdf~=5.1 --upgrade --quiet
%pip install langchain~=0.3.7 langchain_openai~=0.2.6 langchain_community~=0.3.5 --upgrade --quiet

# If running locally, you can do this instead:
#%pip install -r ../requirements.txt

### Load environment variables

In [None]:
import os
from dotenv import find_dotenv, load_dotenv

# Locate a .env file with find_dotenv() and load its variables into the process environment.
# The result is bound to `_` so the cell does not echo load_dotenv's return value.
dotenv_file = find_dotenv()
_ = load_dotenv(dotenv_file)

# If running in Google Colab, you can use this code instead:
# from google.colab import userdata
# os.environ["AZURE_OPENAI_API_KEY"] = userdata.get("AZURE_OPENAI_API_KEY")
# os.environ["AZURE_OPENAI_ENDPOINT"] = userdata.get("AZURE_OPENAI_ENDPOINT")

### Set up the path to the data

In [None]:
# Directory holding the sample files loaded below (the PDF transcript and the Markdown file).
# The path is relative, so it resolves against the notebook's current working directory.
data_path: str = "../data"

## PDFs

Let's load a PDF [transcript](https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture01.pdf) from Andrew Ng's famous CS229 course! These documents are the result of automated transcription, so words and sentences are sometimes split unexpectedly.

In [None]:
# Document loaders live in langchain_community; the old `langchain.document_loaders`
# path is only a deprecated re-export and emits a deprecation warning.
from langchain_community.document_loaders import PyPDFLoader

# Load the lecture transcript PDF from the data directory.
loader = PyPDFLoader(f"{data_path}/MachineLearning-Lecture01.pdf")
pages = loader.load()

**Each page** is a `Document`.

A `Document` contains text (`page_content`) and `metadata`.

In [None]:
# Number of Document objects the PDF loader returned.
len(pages)

In [None]:
# Keep the first loaded Document for inspection in the following cells.
page = pages[0]

In [None]:
# Preview the first 500 characters of the page text.
print(page.page_content[:500])

In [None]:
# Display the metadata attached to this Document.
page.metadata

## Markdown

In [None]:
from langchain_core.documents import Document
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# Load the sample Markdown file from the data directory.
markdown_path = f"{data_path}/markdown.md"
loader = UnstructuredMarkdownLoader(markdown_path)
data = loader.load()

# Sanity checks: UnstructuredMarkdownLoader is expected to yield exactly one Document here.
assert len(data) == 1
assert isinstance(data[0], Document)

# Preview the first 250 characters of the loaded text.
readme_content = data[0].page_content
print(readme_content[:250])

## YouTube

In [None]:
from langchain_community.document_loaders import YoutubeLoader


In [None]:
%pip install --upgrade --quiet youtube-transcript-api

In [None]:
# Fetch the transcript of a YouTube video as LangChain documents.
#url="https://www.youtube.com/watch?v=XC7BeLRm7ak"
url="https://www.youtube.com/watch?v=tflYCulLYiI"
# language="sv" requests the Swedish transcript; add_video_info=False skips extra video metadata.
# NOTE(review): confirm the selected video actually offers an "sv" transcript.
loader = YoutubeLoader.from_youtube_url(
    url, language="sv", add_video_info=False
)
docs = loader.load()
# Check the YouTube result itself (`docs`), not `data` from the Markdown section.
assert len(docs) == 1 # Only one document will be created when using YoutubeLoader

In [None]:
# Show the first 500 characters of the transcript.
docs[0].page_content[:500]

In [None]:
# In case you want to save the contents to a file... 
# all_content = "".join([doc.page_content for doc in docs])
 
# Save the concatenated content to a file
#with open("output.txt", "w") as file:
#    file.write(all_content)


## Web Page

In [None]:
# Document loaders live in langchain_community; the old `langchain.document_loaders`
# path is only a deprecated re-export and emits a deprecation warning.
from langchain_community.document_loaders import WebBaseLoader

# Blog post used as the example web page in this section.
page_url = "https://world.hey.com/dhh/open-source-royalty-and-mad-kings-a8f79d16"
loader = WebBaseLoader(page_url)

In [None]:
# Run the loader; `docs` now holds the result for the web page (replacing any earlier `docs`).
docs = loader.load()

In [None]:
# Preview the first 500 characters of the first loaded document.
print(docs[0].page_content[:500])

### Slightly more advanced parsing, using prior knowledge of the page structure

In [None]:
import bs4
loader = WebBaseLoader(
    web_paths=[page_url],
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(name="article"),
    },
    #bs_get_text_kwargs={"separator": " | ", "strip": True},
)

docs = []
async for doc in loader.alazy_load():
    docs.append(doc)

assert len(docs) == 1
doc = docs[0]

In [None]:
# Show the document's metadata, a blank line, then the first 500 characters of its text.
print(f"{doc.metadata}\n")
print(doc.page_content[:500])