forked from langchain-ai/langchain
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
273 additions
and
0 deletions.
There are no files selected for viewing
199 changes: 199 additions & 0 deletions
199
docs/modules/indexes/document_loaders/examples/git.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,199 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Git\n", | ||
"\n", | ||
"This notebook shows how to load text files from a Git repository." | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Load existing repository from disk" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from git import Repo\n", | ||
"\n", | ||
"repo = Repo.clone_from(\n", | ||
" \"https://github.com/hwchase17/langchain\", to_path=\"./example_data/test_repo1\"\n", | ||
")\n", | ||
"branch = repo.head.reference" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders.git import GitLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"ename": "TypeError", | ||
"evalue": "__init__() got an unexpected keyword argument 'path'", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", | ||
"Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m loader \u001b[39m=\u001b[39m GitLoader(path\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m./example_data/test_repo1/\u001b[39;49m\u001b[39m\"\u001b[39;49m, branch\u001b[39m=\u001b[39;49mbranch)\n", | ||
"\u001b[0;31mTypeError\u001b[0m: __init__() got an unexpected keyword argument 'path'" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"loader = GitLoader(repo_path=\"./example_data/test_repo1/\", branch=branch)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"1040" | ||
] | ||
}, | ||
"execution_count": 15, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"len(data)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"page_content='.venv\\n.github\\n.git\\n.mypy_cache\\n.pytest_cache\\nDockerfile' metadata={'file_path': '.dockerignore', 'file_name': '.dockerignore', 'file_type': ''}\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(data[0])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Clone repository from URL" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders.git import GitLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = GitLoader(\n", | ||
" clone_url=\"https://github.com/hwchase17/langchain\",\n", | ||
" repo_path=\"./example_data/test_repo2/\",\n", | ||
" branch=\"master\",\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"1040" | ||
] | ||
}, | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"len(data)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "ai", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.6" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import os | ||
from typing import List, Optional | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.document_loaders.base import BaseLoader | ||
|
||
|
||
class GitLoader(BaseLoader):
    """Load the text files of a Git repository into a list of documents.

    The repository is either an existing local checkout at `repo_path`, or a
    remote repository at `clone_url` that is cloned into `repo_path` first.
    `branch` names the branch that is checked out before files are read; it
    defaults to `main`.

    Each document represents one file in the repository. Only files whose
    content decodes as UTF-8 are loaded; all other files are skipped.
    """

    def __init__(
        self,
        repo_path: str,
        clone_url: Optional[str] = None,
        branch: Optional[str] = "main",
    ):
        """Store the loader configuration; no Git operation happens here.

        Args:
            repo_path: Local directory of the repository (also the clone
                destination when `clone_url` is given).
            clone_url: Optional URL to clone the repository from.
            branch: Branch to check out before loading. `None` leaves the
                current checkout untouched.
        """
        self.repo_path = repo_path
        self.clone_url = clone_url
        self.branch = branch

    def load(self) -> List[Document]:
        """Check out the configured branch and read its files as documents.

        Returns:
            One `Document` per UTF-8 decodable file in the checked-out tree,
            with `file_path` (relative to `repo_path`), `file_name` and
            `file_type` (the file extension) in its metadata.

        Raises:
            ImportError: If the GitPython package is not installed.
            ValueError: If `repo_path` does not exist and no `clone_url` was
                given, or if `repo_path` already holds a clone whose remotes
                do not include `clone_url`.
        """
        try:
            from git import Blob, Repo
        except ImportError as ex:
            raise ImportError(
                "Could not import git python package. "
                "Please install it with `pip install GitPython`."
            ) from ex

        if not os.path.exists(self.repo_path) and self.clone_url is None:
            raise ValueError(f"Path {self.repo_path} does not exist")
        elif self.clone_url:
            if os.path.isdir(os.path.join(self.repo_path, ".git")):
                # A previous load() (or the user) already cloned into this
                # directory. Cloning again would fail because the destination
                # is not empty, so reuse the checkout -- but only when it
                # really is the requested repository (exact URL comparison).
                repo = Repo(self.repo_path)
                if not any(
                    remote.url == self.clone_url for remote in repo.remotes
                ):
                    raise ValueError(
                        f"A different repository is already cloned at "
                        f"{self.repo_path}"
                    )
            else:
                repo = Repo.clone_from(self.clone_url, self.repo_path)
        else:
            repo = Repo(self.repo_path)

        # With no branch requested there is nothing to switch to.
        if self.branch is not None:
            repo.git.checkout(self.branch)

        docs: List[Document] = []

        for item in repo.tree().traverse():
            # Only blobs are read; other tree entries are not files.
            if not isinstance(item, Blob):
                continue

            file_path = os.path.join(self.repo_path, item.path)

            # Best effort: an unreadable file is reported and skipped instead
            # of aborting the whole load.
            try:
                with open(file_path, "rb") as f:
                    content = f.read()
            except OSError as e:
                print(f"Error reading file {file_path}: {e}")
                continue

            # loads only text files
            try:
                text_content = content.decode("utf-8")
            except UnicodeDecodeError:
                continue

            metadata = {
                "file_path": os.path.relpath(file_path, self.repo_path),
                "file_name": item.name,
                "file_type": os.path.splitext(item.name)[1],
            }
            docs.append(Document(page_content=text_content, metadata=metadata))

        return docs