In [1]:
!pip install -q langchain

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m990.6/990.6 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m384.0/384.0 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.2/140.2 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.9/141.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
import os
import requests
from abc import ABC
from urllib.parse import quote
from typing import Callable, Dict, Iterator, List, Optional

from langchain.document_loaders.base import BaseLoader
from langchain.docstore.document import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator
from langchain_core.utils import get_from_dict_or_env

In [27]:
class BaseGitLabLoader(BaseLoader, BaseModel, ABC):
    project_id: str
    access_token: str
    gitlab_api_url: str = "https://gitlab.com/api/v4"

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        values["access_token"] = get_from_dict_or_env(values, "access_token", "GITLAB_PERSONAL_ACCESS_TOKEN")
        return values

    @property
    def headers(self) -> Dict[str, str]:
        return {"PRIVATE-TOKEN": self.access_token}

In [30]:
class GitLabFileLoader(BaseGitLabLoader):
    branch: str = "master"
    file_filter: Optional[Callable[[str], bool]] = None

    def fetch_files_recursive(self, path="") -> List[Dict]:
        """Fetch files recursively from a given directory."""
        encoded_project_id = quote(self.project_id, safe='')
        base_url = f"{self.gitlab_api_url}/projects/{encoded_project_id}/repository/tree?ref={self.branch}&path={quote(path)}&recursive=false"
        print(f"Fetching from: {base_url}")
        response = requests.get(base_url, headers=self.headers)
        response.raise_for_status()
        items = response.json()

        files = []
        for item in items:
            if item['type'] == 'tree':  # Directory found, recurse into it
                files.extend(self.fetch_files_recursive(item['path']))
            elif item['type'] == 'blob':
                files.append(item)
        return files

    def get_file_paths(self) -> List[Dict]:
        all_files = self.fetch_files_recursive()
        print(f"Files fetched: {all_files}")
        return [file for file in all_files if not (self.file_filter and not self.file_filter(file["path"]))]

    def get_file_content_by_path(self, path: str) -> str:
        encoded_project_id = quote(self.project_id, safe='')
        encoded_path = quote(path, safe='')
        base_url = f"{self.gitlab_api_url}/projects/{encoded_project_id}/repository/files/{encoded_path}/raw?ref={self.branch}"
        print(f"Fetching content from: {base_url}")
        response = requests.get(base_url, headers=self.headers)
        response.raise_for_status()
        return response.text

    def lazy_load(self) -> Iterator[Document]:
        files = self.get_file_paths()
        for file in files:
            content = self.get_file_content_by_path(file["path"])
            if content == "":
                continue

            metadata = {
                "path": file["path"],
                "id": file.get("id"),
                "source": f"{self.gitlab_api_url}/projects/{quote(self.project_id, safe='')}/repository/files/{quote(file['path'], safe='')}/raw?ref={self.branch}"
            }
            print(f"File Loaded: {metadata['path']}")
            yield Document(page_content=content, metadata=metadata)





In [33]:
def py_file_filter(file_path: str) -> bool:
    return file_path.endswith('.py')

project_id = '60281633'  # Use URL-encoded form of the project path
access_token = 'private-token'  # Replace with your GitLab personal access token

loader = GitLabFileLoader(project_id=project_id, access_token=access_token, file_filter=py_file_filter, branch='development')

documents = loader.load()
for doc in documents:
    print("---")
    print(f"Document Path: {doc.metadata['path']}")
    print(f"Document Content (first 100 chars): {doc.page_content[:1000]}")


Fetching from: https://gitlab.com/api/v4/projects/60281633/repository/tree?ref=development&path=&recursive=false
Fetching from: https://gitlab.com/api/v4/projects/60281633/repository/tree?ref=development&path=.github&recursive=false
Fetching from: https://gitlab.com/api/v4/projects/60281633/repository/tree?ref=development&path=.github/workflows&recursive=false
Fetching from: https://gitlab.com/api/v4/projects/60281633/repository/tree?ref=development&path=backend&recursive=false
Fetching from: https://gitlab.com/api/v4/projects/60281633/repository/tree?ref=development&path=backend/.github&recursive=false
Fetching from: https://gitlab.com/api/v4/projects/60281633/repository/tree?ref=development&path=backend/.github/workflows&recursive=false
Fetching from: https://gitlab.com/api/v4/projects/60281633/repository/tree?ref=development&path=backend/.vscode&recursive=false
Fetching from: https://gitlab.com/api/v4/projects/60281633/repository/tree?ref=development&path=backend/alembic&recursive=f