## 1. Document Loaders 示例

### 1. 文本文件加载器

In [14]:
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
import os

def text_loader_examples():
    """Demonstrate loading a plain-text file with ``TextLoader``.

    Writes a small UTF-8 sample file named ``sample.txt`` into the current
    working directory, loads it, and prints the document count, the content
    of the first document and its metadata.

    Returns:
        The list of documents loaded from ``sample.txt`` so that callers can
        aggregate them with documents from other loaders.
    """
    sample_path = "sample.txt"

    # 1.1 Basic text loading
    with open(sample_path, "w", encoding="utf-8") as f:
        f.write("人工智能是计算机科学的一个分支。\n机器学习是AI的子集。")

    loader = TextLoader(sample_path, encoding="utf-8")
    documents = loader.load()
    print(f"文档数量: {len(documents)}")
    print(f"内容: {documents[0].page_content}")
    print(f"元数据: {documents[0].metadata}")

    # 1.2 Handling a large file (disabled: needs an existing large_file.txt)
    # loader_large = TextLoader("large_file.txt", encoding="utf-8")
    # try:
    #     docs = loader_large.load()
    #     print(f"大文件加载成功，文档数: {len(docs)}")
    # except Exception as e:
    #     print(f"加载失败: {e}")

    # 1.3 Automatic encoding detection (disabled: needs an existing file.txt)
    # loader_auto = TextLoader("file.txt", autodetect_encoding=True)
    # docs = loader_auto.load()

    return documents
text_loader_examples()

文档数量: 1
内容: 人工智能是计算机科学的一个分支。
机器学习是AI的子集。
元数据: {'source': 'sample.txt'}


### 2. PDF 文档加载器

In [20]:
from langchain_community.document_loaders import PyPDFLoader, PDFMinerLoader, PDFPlumberLoader

def pdf_loader_examples():
    """Walk through the PDF loaders using one sample paper.

    Loads the same PDF with ``PyPDFLoader``, ``PDFMinerLoader`` and
    ``PDFPlumberLoader``, prints the page count plus a preview of the first
    two pages, and finally calls ``load_and_split`` on a fresh
    ``PyPDFLoader``. Nothing is returned.
    """
    pdf_path = "docs/Multi-level Wavelet-CNN for Image Restoration.pdf"

    # 2.1 PyPDFLoader - the most commonly used loader
    pypdf_pages = PyPDFLoader(pdf_path).load()
    print(f"PDF页数: {len(pypdf_pages)}")

    # Preview only the first two pages to keep the output short.
    for page_number, page in enumerate(pypdf_pages[:2], start=1):
        print(f"第{page_number}页内容: {page.page_content[:100]}...")
        print(f"页面元数据: {page.metadata}")

    # 2.2 PDFMinerLoader - alternative text extraction backend
    miner = PDFMinerLoader(pdf_path)
    miner.load()

    # 2.3 PDFPlumberLoader - alternative backend, described as better for tables
    plumber = PDFPlumberLoader(pdf_path)
    plumber.load()

    # 2.4 Load and split in one call
    splitting_loader = PyPDFLoader(pdf_path)
    splitting_loader.load_and_split()

    # 2.5 Password-protected PDF (disabled example)
    # protected_loader = PyPDFLoader("docs/Multi-level Wavelet-CNN for Image Restoration.pdf", password="password123")
    # docs = protected_loader.load()

pdf_loader_examples()

PDF页数: 10
第1页内容: Multi-level Wavelet-CNN for Image Restoration
Pengju Liu1, Hongzhi Zhang ∗1, Kai Zhang1, Liang Lin2,...
页面元数据: {'producer': 'pdfTeX-1.40.17', 'creator': 'LaTeX with hyperref package', 'creationdate': '2018-05-23T00:19:39+00:00', 'author': '', 'keywords': '', 'moddate': '2018-05-23T00:19:39+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'docs/Multi-level Wavelet-CNN for Image Restoration.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1'}
第2页内容: is adopted to enlarge receptive ﬁeld without the sacriﬁce
of computational cost. Dilated ﬁltering, h...
页面元数据: {'producer': 'pdfTeX-1.40.17', 'creator': 'LaTeX with hyperref package', 'creationdate': '2018-05-23T00:19:39+00:00', 'author': '', 'keywords': '', 'moddate': '2018-05-23T00:19:39+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) kpathsea version 6.2.

### 3. CSV 数据加载器

In [None]:
from langchain_community.document_loaders import CSVLoader
import pandas as pd

def csv_loader_examples():
    """Demonstrate the main ``CSVLoader`` options on a generated CSV file.

    Creates ``employees.csv`` in the current working directory from a small
    pandas DataFrame, then loads it four ways: with defaults, with a source
    column, with custom ``csv_args`` and with a restricted set of content
    columns.

    Returns:
        The documents produced by the basic (default options) load, one per
        CSV data row.
    """
    csv_path = "employees.csv"

    # Build the sample CSV
    df = pd.DataFrame({
        'name': ['张三', '李四', '王五'],
        'age': [25, 30, 35],
        'department': ['技术部', '销售部', '市场部'],
        'description': ['Python开发工程师', '销售经理', '市场专员']
    })
    df.to_csv(csv_path, index=False, encoding="utf-8")

    # 3.1 Basic CSV loading
    csv_loader = CSVLoader(csv_path, encoding="utf-8")
    basic_docs = csv_loader.load()
    print(f"CSV文档数量: {len(basic_docs)}")
    print(f"第一条记录: {basic_docs[0].page_content}")

    # 3.2 Use a column as each document's source
    csv_loader_with_source = CSVLoader(
        csv_path,
        source_column="name",
        encoding="utf-8"
    )
    docs = csv_loader_with_source.load()

    # 3.3 Custom CSV arguments
    csv_loader_custom = CSVLoader(
        csv_path,
        csv_args={
            'delimiter': ',',
            'quotechar': '"',
            'fieldnames': ['姓名', '年龄', '部门', '描述']
        },
        # The file is written as UTF-8 above; be explicit so the platform
        # default encoding is never used to read the Chinese text.
        encoding="utf-8"
    )
    # When explicit fieldnames are supplied, the CSV reader treats the file's
    # own header line as a data row, so drop that first document.
    docs = csv_loader_custom.load()[1:]

    # 3.4 Restrict the page content to specific columns
    csv_loader_filtered = CSVLoader(
        csv_path,
        content_columns=['name', 'description'],
        encoding="utf-8"
    )
    docs = csv_loader_filtered.load()

    return basic_docs
csv_loader_examples()

### 4. JSON 数据加载器

In [None]:
from langchain_community.document_loaders import JSONLoader
import json

def json_loader_examples():
    """Demonstrate ``JSONLoader`` on flat, nested and JSON Lines inputs.

    Writes ``articles.json``, ``complex.json`` and ``data.jsonl`` into the
    current working directory and loads each of them with a matching
    ``jq_schema``.

    Returns:
        The documents extracted from ``articles.json`` in step 4.1 (one per
        article ``content`` field).
    """
    # Build the sample JSON data
    data = [
        {
            "id": 1,
            "title": "Python编程指南",
            "content": "Python是一种高级编程语言，语法简洁优雅。",
            "author": "张三",
            "tags": ["编程", "Python", "教程"]
        },
        {
            "id": 2,
            "title": "机器学习入门",
            "content": "机器学习是人工智能的一个重要分支。",
            "author": "李四",
            "tags": ["AI", "机器学习", "数据科学"]
        }
    ]

    with open("articles.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # 4.1 Extract a single field from every array element
    json_loader = JSONLoader(
        "articles.json",
        jq_schema=".[].content",
        text_content=False
    )
    content_docs = json_loader.load()
    print(f"JSON文档数量: {len(content_docs)}")

    # 4.2 Select whole objects and pick the content via content_key
    json_loader_multi = JSONLoader(
        "articles.json",
        jq_schema=".[]",
        content_key="content"
    )
    docs = json_loader_multi.load()

    # 4.3 Nested JSON structure
    complex_data = {
        "articles": {
            "tech": [
                {"title": "AI发展", "body": "人工智能快速发展"},
                {"title": "云计算", "body": "云计算改变了IT架构"}
            ],
            "business": [
                {"title": "数字化转型", "body": "企业数字化转型势在必行"}
            ]
        }
    }

    with open("complex.json", "w", encoding="utf-8") as f:
        json.dump(complex_data, f, ensure_ascii=False, indent=2)

    # Extract the nested values
    json_loader_nested = JSONLoader(
        "complex.json",
        jq_schema=".articles.tech[].body"
    )
    docs = json_loader_nested.load()

    # 4.4 JSON Lines format
    jsonl_data = [
        {"text": "第一行数据", "label": "A"},
        {"text": "第二行数据", "label": "B"}
    ]

    with open("data.jsonl", "w", encoding="utf-8") as f:
        for item in jsonl_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    # There is no separate JSON Lines loader class; JSONLoader reads the file
    # line by line when json_lines=True.
    jsonl_loader = JSONLoader("data.jsonl", jq_schema=".text", json_lines=True)
    docs = jsonl_loader.load()

    return content_docs

### 5. 网页内容加载器

In [None]:
from langchain_community.document_loaders import WebBaseLoader, AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer

def web_loader_examples():
    """Demonstrate the web page loaders.

    Covers single and multi-URL ``WebBaseLoader`` usage, custom request
    headers, restricting parsing with a ``SoupStrainer``, an async HTML
    loading coroutine and ``SeleniumURLLoader``. All URLs are placeholders,
    and every loader except the async one performs network access when its
    ``load`` is called. Nothing is returned.
    """
    # 5.1 Basic web page loading
    web_loader = WebBaseLoader("https://example.com")
    docs = web_loader.load()
    print(f"网页文档: {docs[0].page_content[:200]}...")

    # 5.2 Batch loading of several URLs
    urls = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3"
    ]
    web_loader_multi = WebBaseLoader(urls)
    docs = web_loader_multi.load()

    # 5.3 Custom request headers
    web_loader_headers = WebBaseLoader(
        "https://api.example.com/data",
        header_template={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Authorization": "Bearer your-token"
        }
    )
    docs = web_loader_headers.load()

    # 5.4 Restrict parsing to matching elements
    # SoupStrainer is exported by the bs4 package itself; it is not an
    # attribute of the BeautifulSoup class.
    from bs4 import SoupStrainer

    web_loader_css = WebBaseLoader(
        "https://news.example.com",
        bs_kwargs={
            "parse_only": SoupStrainer("div", {"class": "article-content"})
        }
    )
    docs = web_loader_css.load()

    # 5.5 Asynchronous web loading
    # NOTE: this coroutine is only defined here; it is not awaited or
    # scheduled anywhere in this function.
    async def async_web_loading():
        urls = ["https://example.com/1", "https://example.com/2"]
        async_loader = AsyncHtmlLoader(urls)
        # Await the async API instead of calling the blocking load() from
        # inside a coroutine.
        html_docs = await async_loader.aload()

        # Convert HTML to text
        html2text = Html2TextTransformer()
        text_docs = html2text.transform_documents(html_docs)
        return text_docs

    # 5.6 Pages rendered by JavaScript
    from langchain_community.document_loaders import SeleniumURLLoader

    selenium_loader = SeleniumURLLoader(
        urls=["https://spa-example.com"],
        browser="chrome",
        headless=True
    )
    docs = selenium_loader.load()

### 6. 目录批量加载器

In [None]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader, PyPDFLoader, CSVLoader

def directory_loader_examples():
    """Demonstrate ``DirectoryLoader`` on a generated directory tree.

    Creates ``documents/texts``, ``documents/pdfs`` and ``documents/data``
    under the current working directory, writes three text files into
    ``documents/texts`` and loads them with several ``DirectoryLoader``
    configurations.

    Returns:
        The documents loaded from ``documents/texts`` in step 6.1.
    """
    # Create the test directory structure
    os.makedirs("documents/texts", exist_ok=True)
    os.makedirs("documents/pdfs", exist_ok=True)
    os.makedirs("documents/data", exist_ok=True)

    # Create the test files
    for i in range(3):
        with open(f"documents/texts/doc_{i}.txt", "w", encoding="utf-8") as f:
            f.write(f"这是文档{i}的内容，包含重要信息。")

    # 6.1 Load files of one specific type
    txt_loader = DirectoryLoader(
        "documents/texts",
        glob="*.txt",
        loader_cls=TextLoader,
        loader_kwargs={"encoding": "utf-8"}
    )
    txt_docs = txt_loader.load()
    print(f"文本文档数量: {len(txt_docs)}")

    # 6.2 Mixed loading of several file types
    from langchain_community.document_loaders import UnstructuredFileLoader

    mixed_loader = DirectoryLoader(
        "documents",
        glob="**/*",  # recursive search
        loader_cls=UnstructuredFileLoader,
        recursive=True,
        show_progress=True
    )
    mixed_docs = mixed_loader.load()

    # 6.3 Custom mapping from file extension to loader
    # NOTE: this helper is only defined as an illustration; nothing in this
    # function calls it.
    def get_loader_for_file(file_path: str):
        if file_path.endswith('.txt'):
            return TextLoader(file_path, encoding="utf-8")
        elif file_path.endswith('.pdf'):
            return PyPDFLoader(file_path)
        elif file_path.endswith('.csv'):
            return CSVLoader(file_path, encoding="utf-8")
        else:
            return UnstructuredFileLoader(file_path)

    # 6.4 Filtering and excluding files
    # The text files are in the "texts" sub-directory, so the pattern has to
    # descend into sub-directories; a bare "*.txt" on "documents" matches
    # nothing.
    filtered_loader = DirectoryLoader(
        "documents",
        glob="**/*.txt",
        exclude=["temp_*", "*.tmp"],
        loader_cls=TextLoader,
        loader_kwargs={"encoding": "utf-8"}
    )
    filtered_docs = filtered_loader.load()

    # 6.5 Parallel loading
    parallel_loader = DirectoryLoader(
        "documents",
        glob="**/*",
        loader_cls=UnstructuredFileLoader,
        use_multithreading=True,
        max_concurrency=4
    )
    parallel_docs = parallel_loader.load()

    return txt_docs

### 7. 数据库加载器

In [None]:
from langchain_community.document_loaders import SQLDatabaseLoader
from sqlalchemy import create_engine, text

def database_loader_examples():
    """Demonstrate loading documents from a SQL database.

    Creates (or reuses) the SQLite file ``example.db`` in the current
    working directory, ensures an ``articles`` table with two rows exists,
    and loads those rows with ``SQLDatabaseLoader``.

    Returns:
        The documents built from the ``articles`` rows; page content holds
        the title and content columns, metadata holds the author.
    """
    # SQLDatabaseLoader expects a LangChain SQLDatabase wrapper, not a bare
    # SQLAlchemy engine.
    from langchain_community.utilities import SQLDatabase

    # 7.1 Loading from SQLite
    engine = create_engine("sqlite:///example.db")

    # Create the sample table and data. engine.begin() commits when the
    # block exits without an error.
    with engine.begin() as conn:
        conn.execute(text("""
            CREATE TABLE IF NOT EXISTS articles (
                id INTEGER PRIMARY KEY,
                title TEXT,
                content TEXT,
                author TEXT,
                created_at TIMESTAMP
            )
        """))

        conn.execute(text("""
            INSERT OR REPLACE INTO articles VALUES
            (1, 'Python教程', 'Python是一种编程语言', '张三', '2024-01-01'),
            (2, 'AI发展', '人工智能快速发展', '李四', '2024-01-02')
        """))

    def build_page_content(row):
        # Page content is built from the title and content columns.
        return f"title: {row['title']}\ncontent: {row['content']}"

    def build_metadata(row):
        # Only the author column is kept as metadata.
        return {"author": row["author"]}

    # Load the database content
    db_loader = SQLDatabaseLoader(
        query="SELECT title, content, author FROM articles",
        db=SQLDatabase(engine),
        page_content_mapper=build_page_content,
        metadata_mapper=build_metadata
    )
    docs = db_loader.load()
    print(f"数据库文档数量: {len(docs)}")

    # 7.2 PostgreSQL example (disabled: needs a running server)
    # pg_engine = create_engine("postgresql://user:password@localhost/dbname")
    # pg_loader = SQLDatabaseLoader(
    #     query="SELECT * FROM documents WHERE category = 'tech'",
    #     db=SQLDatabase(pg_engine)
    # )
    # pg_docs = pg_loader.load()

    # 7.3 MongoDB loader
    from langchain_community.document_loaders import MongodbLoader

    # mongodb_loader = MongodbLoader(
    #     connection_string="mongodb://localhost:27017/",
    #     db_name="mydb",
    #     collection_name="documents",
    #     filter_criteria={"status": "published"}
    # )
    # mongo_docs = mongodb_loader.load()

    return docs

### 8. 云存储加载器

In [None]:
def cloud_storage_examples():
    """Show how the cloud storage loaders are constructed and invoked.

    Runs the two AWS S3 loaders against placeholder bucket names and
    credentials, and imports the Google Drive and Azure Blob loaders whose
    usage is shown only as disabled examples. Nothing is returned.
    """
    # 8.1 AWS S3 loaders
    from langchain_community.document_loaders import S3DirectoryLoader, S3FileLoader

    # A single S3 object
    single_object_loader = S3FileLoader(bucket="my-bucket", key="documents/report.pdf")
    single_object_loader.load()

    # Every object under an S3 prefix
    prefix_loader = S3DirectoryLoader(
        bucket="my-bucket",
        prefix="documents/",
        aws_access_key_id="your-access-key",
        aws_secret_access_key="your-secret-key",
    )
    prefix_loader.load()

    # 8.2 Google Drive loader (usage disabled: needs real credentials)
    from langchain_community.document_loaders import GoogleDriveLoader

    # gdrive_loader = GoogleDriveLoader(
    #     folder_id="your-folder-id",
    #     credentials_path="path/to/credentials.json",
    #     token_path="path/to/token.json"
    # )
    # gdrive_docs = gdrive_loader.load()

    # 8.3 Azure Blob Storage (usage disabled: needs a connection string)
    from langchain_community.document_loaders import AzureBlobStorageContainerLoader

    # azure_loader = AzureBlobStorageContainerLoader(
    #     conn_str="your-connection-string",
    #     container="documents"
    # )
    # azure_docs = azure_loader.load()

### 9. 自定义文档加载器

In [None]:
from typing import Iterator, List, Optional

import requests
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document

class CustomAPILoader(BaseLoader):
    """Load documents from a JSON HTTP API.

    The endpoint's JSON body is read with ``.get('items', [])``; each item
    becomes one ``Document`` whose page content is the item's ``content``
    value (empty string when absent).
    """

    def __init__(self, api_url: str, headers: Optional[dict] = None, timeout: float = 30.0):
        """Store the request settings.

        Args:
            api_url: URL that is fetched with an HTTP GET.
            headers: Optional request headers; defaults to no extra headers.
            timeout: Seconds to wait for the server before ``requests``
                raises a timeout error, so a stalled endpoint cannot hang
                the caller indefinitely.
        """
        self.api_url = api_url
        self.headers = headers or {}
        self.timeout = timeout

    def load(self) -> List[Document]:
        """Fetch the API once and return all documents as a list.

        Raises:
            requests.HTTPError: If the response status indicates an error.
        """
        # Delegate to lazy_load so both entry points build identical documents.
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Fetch the API once and yield one document per item.

        Raises:
            requests.HTTPError: If the response status indicates an error.
        """
        response = requests.get(self.api_url, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()

        data = response.json()
        for item in data.get('items', []):
            yield Document(
                page_content=item.get('content', ''),
                metadata={
                    'source': self.api_url,
                    'id': item.get('id'),
                    'title': item.get('title'),
                    'timestamp': item.get('created_at')
                }
            )

class DatabaseStreamLoader(BaseLoader):
    """Stream documents from a SQL query in fixed-size batches.

    The query is paged by appending ``LIMIT``/``OFFSET`` to it, so it must
    not already contain those clauses. It should include an ``ORDER BY`` so
    that consecutive pages are stable.
    """

    def __init__(self, connection_string: str, query: str, batch_size: int = 1000):
        """Store the connection settings.

        Args:
            connection_string: SQLAlchemy database URL.
            query: SELECT statement whose first column is used as the
                document id and whose second column is used as the content.
            batch_size: Number of rows fetched per page; must be positive.

        Raises:
            ValueError: If ``batch_size`` is less than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.connection_string = connection_string
        self.query = query
        self.batch_size = batch_size

    def lazy_load(self) -> Iterator[Document]:
        """Yield one document per row, fetching ``batch_size`` rows at a time."""
        from sqlalchemy import create_engine, text

        engine = create_engine(self.connection_string)
        # Page size and offset are bound parameters rather than being
        # formatted into the SQL string.
        paginated_query = text(f"{self.query} LIMIT :limit OFFSET :offset")
        offset = 0

        try:
            while True:
                # Fetch the whole page, then release the connection before
                # yielding, so it is not held while the consumer works.
                with engine.connect() as conn:
                    result = conn.execute(
                        paginated_query,
                        {"limit": self.batch_size, "offset": offset},
                    )
                    rows = result.fetchall()

                if not rows:
                    break

                batch_index = offset // self.batch_size
                for row in rows:
                    yield Document(
                        page_content=str(row[1]),  # second column is assumed to be the content
                        metadata={
                            'id': row[0],  # first column is assumed to be the id
                            'source': 'database',
                            'batch': batch_index
                        }
                    )

                # A short page means the result set is exhausted; skip the
                # extra empty query.
                if len(rows) < self.batch_size:
                    break

                offset += self.batch_size
        finally:
            # Runs on exhaustion, on error, and when the consumer stops
            # early and the generator is closed.
            engine.dispose()

def custom_loader_examples():
    """Exercise the two custom loaders defined in this file.

    Calls ``CustomAPILoader`` against a placeholder URL and
    ``DatabaseStreamLoader`` against ``sqlite:///large_db.db``. Both are
    expected to fail in a fresh environment, so failures are printed instead
    of raised. Nothing is returned.
    """
    from itertools import islice

    from sqlalchemy.exc import SQLAlchemyError

    # Use the custom API loader
    api_loader = CustomAPILoader(
        api_url="https://api.example.com/articles",
        headers={"Authorization": "Bearer your-token"}
    )

    try:
        api_docs = api_loader.load()
        print(f"API文档数量: {len(api_docs)}")
    except Exception as e:
        print(f"API加载失败: {e}")

    # Use the streaming database loader
    db_stream_loader = DatabaseStreamLoader(
        connection_string="sqlite:///large_db.db",
        query="SELECT id, content FROM large_table",
        batch_size=500
    )

    # Lazily process only the first 10 documents as a sample. The database
    # or table may not exist, so report the failure like the API example
    # does instead of aborting.
    try:
        for i, doc in enumerate(islice(db_stream_loader.lazy_load(), 10)):
            print(f"文档 {i}: {doc.page_content[:50]}...")
    except SQLAlchemyError as e:
        print(f"数据库加载失败: {e}")

### 10. 完整使用示例

In [None]:
def complete_document_loader_example():
    """Run the loader examples in sequence and print aggregate statistics.

    Calls the text, CSV, JSON, directory and database examples, collects
    whatever documents they return, runs the custom loader example, and then
    prints the total count, a per-source breakdown and content length
    statistics.

    Returns:
        The combined list of documents returned by the individual examples.
    """
    from collections import Counter

    print("🚀 LangChain 0.3 Document Loaders 完整示例")
    print("=" * 60)

    # (banner, example function) pairs, run in this order.
    stages = [
        ("\n📄 加载文本文件...", text_loader_examples),
        ("\n📊 加载CSV数据...", csv_loader_examples),
        ("\n🔧 加载JSON数据...", json_loader_examples),
        ("\n📁 批量加载目录...", directory_loader_examples),
        ("\n🗄️ 加载数据库...", database_loader_examples),
    ]

    all_documents = []
    for banner, run_example in stages:
        print(banner)
        # An example that returns nothing contributes no documents instead
        # of making extend() raise TypeError on None.
        all_documents.extend(run_example() or [])

    # The custom loader example only prints; it has no documents to collect.
    print("\n⚙️ 自定义加载器...")
    custom_loader_examples()

    # Statistics
    print("\n📈 加载统计:")
    print(f"总文档数量: {len(all_documents)}")

    # Group by source
    sources = Counter(doc.metadata.get('source', 'unknown') for doc in all_documents)

    print("按来源分布:")
    for source, count in sources.items():
        print(f"  {source}: {count} 个文档")

    # Content length statistics (skipped when nothing was loaded)
    lengths = [len(doc.page_content) for doc in all_documents]
    if lengths:
        print("内容长度统计:")
        print(f"  平均长度: {sum(lengths) / len(lengths):.0f} 字符")
        print(f"  最短: {min(lengths)} 字符")
        print(f"  最长: {max(lengths)} 字符")

    return all_documents

if __name__ == "__main__":
    import shutil

    try:
        documents = complete_document_loader_example()
    finally:
        # Clean up every temporary artifact the examples create, even when
        # one of them fails part-way through.
        temp_paths = [
            "sample.txt",
            "employees.csv",
            "articles.json",
            "complex.json",
            "data.jsonl",
            "example.db",
            "large_db.db",
            "documents",
        ]
        for path in temp_paths:
            if os.path.exists(path):
                if os.path.isdir(path):
                    shutil.rmtree(path)
                else:
                    os.remove(path)

        print("\n🧹 临时文件已清理")

### 总结
LangChain 0.3 的 Document Loaders 提供了丰富的数据源支持：
#### 主要特点：
- 统一的 Document 接口
- 丰富的文件格式支持
- 云存储集成
- 自定义加载器扩展
- 批量和流式处理
- 元数据保留

#### 选择建议：
- 简单文本：使用 TextLoader
- PDF文档：推荐 PyPDFLoader
- 结构化数据：使用 CSVLoader 或 JSONLoader
- 网页内容：使用 WebBaseLoader
- 大量文件：使用 DirectoryLoader
- 云存储：使用对应的云存储加载器
- 特殊需求：实现自定义加载器