In [5]:
import io
from typing import Iterable, Callable
import zipfile
import traceback
from dataclasses import dataclass

import requests


@dataclass
class RawRepositoryFile:
    filename: str
    content: str
class GithubRepositoryDataReader:
    """
    Downloads and parses markdown and code files from a GitHub repository.
    """

    def __init__(self,
                repo_owner: str,
                repo_name: str,
                allowed_extensions: Iterable[str] | None = None,
                filename_filter: Callable[[str], bool] | None = None
        ):
        """
        Initialize the GitHub repository data reader.
        
        Args:
            repo_owner: The owner/organization of the GitHub repository
            repo_name: The name of the GitHub repository
            allowed_extensions: Optional set of file extensions to include
                    (e.g., {"md", "py"}). If not provided, all file types are included
            filename_filter: Optional callable to filter files by their path
        """
        prefix = "https://codeload.github.com"
        self.url = (
            f"{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/main"
        )

        if allowed_extensions is not None:
            self.allowed_extensions = {ext.lower() for ext in allowed_extensions}

        if filename_filter is None:
            self.filename_filter = lambda filepath: True
        else:
            self.filename_filter = filename_filter

    def read(self) -> list[RawRepositoryFile]:
        """
        Download and extract files from the GitHub repository.
        
        Returns:
            List of RawRepositoryFile objects for each processed file
            
        Raises:
            Exception: If the repository download fails
        """
        resp = requests.get(self.url)
        if resp.status_code != 200:
            raise Exception(f"Failed to download repository: {resp.status_code}")

        zf = zipfile.ZipFile(io.BytesIO(resp.content))
        repository_data = self._extract_files(zf)
        zf.close()

        return repository_data
    def _extract_files(self, zf: zipfile.ZipFile) -> list[RawRepositoryFile]:
        """
        Extract and process files from the zip archive.
        
        Args:
            zf: ZipFile object containing the repository data

        Returns:
            List of RawRepositoryFile objects for each processed file
        """
        data = []
        for file_info in zf.infolist():
            filepath = self._normalize_filepath(file_info.filename)

            if self._should_skip_file(filepath):
                continue

            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode("utf-8", errors="ignore")
                    if content is not None:
                        content = content.strip()

                    file = RawRepositoryFile(
                        filename=filepath,
                        content=content
                    )
                    data.append(file)

            except Exception as e:
                print(f"Error processing {file_info.filename}: {e}")
                traceback.print_exc()
                continue

        return data

        
    def _should_skip_file(self, filepath: str) -> bool:
        """
        Determine whether a file should be skipped during processing.
        
        Args:
            filepath: The file path to check
            
        Returns:
            True if the file should be skipped, False otherwise
        """
        filepath = filepath.lower()

        # directory
        if filepath.endswith("/"):
            return True

        # hidden file
        filename = filepath.split("/")[-1]
        if filename.startswith("."):
            return True

        if self.allowed_extensions:
            ext = self._get_extension(filepath)
            if ext not in self.allowed_extensions:
                return True

        if not self.filename_filter(filepath):
            return True

        return False
    def _get_extension(self, filepath: str) -> str:
        """
        Extract the file extension from a filepath.
        
        Args:
            filepath: The file path to extract extension from
            
        Returns:
            The file extension (without dot) or empty string if no extension
        """
        filename = filepath.lower().split("/")[-1]
        if "." in filename:
            return filename.rsplit(".", maxsplit=1)[-1]
        else:
            return ""

    def _normalize_filepath(self, filepath: str) -> str:
        """
        Removes the top-level directory from the file path inside the zip archive.
        'repo-main/path/to/file.py' -> 'path/to/file.py'
        
        Args:
            filepath: The original filepath from the zip archive
            
        Returns:
            The normalized filepath with top-level directory removed
        """
        parts = filepath.split("/", maxsplit=1)
        if len(parts) > 1:
            return parts[1]
        else:
            return parts[0]

In [36]:
def read_github_data():
    repo_owner = 'evidentlyai'
    repo_name = 'docs'

    
    allowed_extensions = {"md", "mdx"}

    reader = GithubRepositoryDataReader(
        repo_owner,
        repo_name,
        allowed_extensions=allowed_extensions,
      
    )
    
    return reader.read()

In [7]:
github_data = read_github_data()

In [8]:
print(github_data[40].content)

---
title: "Evidently and GitHub actions"
description: "Testing LLM outputs as part of the CI/CD flow."
---

You can use Evidently together with GitHub Actions to automatically test the outputs of your LLM agent or application - as part of every code push or pull request.

## How the integration work:

- You define a test dataset of inputs (e.g. test prompts with or without reference answers). You can store it as a file, or save the dataset at Evidently Cloud callable by Dataset ID.
- Run your LLM system or agent against those inputs inside CI.
- Evidently automatically evaluates the outputs using the user-specified config (which defines the Evidently descriptors, tests and Report composition), including methods like:
  - LLM judges (e.g., tone, helpfulness, correctness)
  - Custom Python functions
  - Dataset-level metrics like classification quality
- If any test fails, the CI job fails.
- You get a detailed test report with pass/fail status and metrics.

![](/images/examples/github_

In [9]:
!uv add python-frontmatter

[2K[2mResolved [1m122 packages[0m [2min 2.36s[0m[0m                                       [0m
[2K[2mPrepared [1m1 package[0m [2min 219ms[0m[0m                                              
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2K[2mInstalled [1m1 package[0m [2min 1ms[0m[0m=1.1.0                            [0m
 [32m+[39m [1mpython-frontmatter[0m[2m==1.1.0[0m


In [10]:
import frontmatter

def parse_data(data_raw):
    data_parsed = []
    for f in data_raw:
        post = frontmatter.loads(f.content)
        data = post.to_dict()
        data['filename'] = f.filename
        data_parsed.append(data)

    return data_parsed

In [11]:
parsed_data = parse_data(github_data)

In [12]:
parsed_data[:10]

[{'title': 'Create Plant',
  'openapi': 'POST /plants',
  'content': '',
  'filename': 'api-reference/endpoint/create.mdx'},
 {'title': 'Delete Plant',
  'openapi': 'DELETE /plants/{id}',
  'content': '',
  'filename': 'api-reference/endpoint/delete.mdx'},
 {'title': 'Get Plants',
  'openapi': 'GET /plants',
  'content': '',
  'filename': 'api-reference/endpoint/get.mdx'},
 {'title': 'Introduction',
  'description': 'Example section for showcasing API endpoints',
  'content': '<Note>\n  If you\'re not looking to build API reference documentation, you can delete\n  this section by removing the api-reference folder.\n</Note>\n\n## Welcome\n\nThere are two ways to build API documentation: [OpenAPI](https://mintlify.com/docs/api-playground/openapi/setup) and [MDX components](https://mintlify.com/docs/api-playground/mdx/configuration). For the starter kit, we are using the following OpenAPI specification.\n\n<Card\n  title="Plant Store Endpoints"\n  icon="leaf"\n  href="https://github.com/m

In [17]:
parsed_data[3]['content']

'<Note>\n  If you\'re not looking to build API reference documentation, you can delete\n  this section by removing the api-reference folder.\n</Note>\n\n## Welcome\n\nThere are two ways to build API documentation: [OpenAPI](https://mintlify.com/docs/api-playground/openapi/setup) and [MDX components](https://mintlify.com/docs/api-playground/mdx/configuration). For the starter kit, we are using the following OpenAPI specification.\n\n<Card\n  title="Plant Store Endpoints"\n  icon="leaf"\n  href="https://github.com/mintlify/starter/blob/main/api-reference/openapi.json"\n>\n  View the OpenAPI specification file\n</Card>\n\n## Authentication\n\nAll API endpoints are authenticated using Bearer tokens and picked up from the specification file.\n\n```json\n"security": [\n  {\n    "bearerAuth": []\n  }\n]\n```'

In [18]:
from typing import Any, Dict, Iterable, List


def sliding_window(
        seq: Iterable[Any],
        size: int,
        step: int
    ) -> List[Dict[str, Any]]:
    """
    Create overlapping chunks from a sequence using a sliding window approach.

    Args:
        seq: The input sequence (string or list) to be chunked.
        size (int): The size of each chunk/window.
        step (int): The step size between consecutive windows.

    Returns:
        list: A list of dictionaries, each containing:
            - 'start': The starting position of the chunk in the original sequence
            - 'content': The chunk content

    Raises:
        ValueError: If size or step are not positive integers.

    Example:
        >>> sliding_window("hello world", size=5, step=3)
        [{'start': 0, 'content': 'hello'}, {'start': 3, 'content': 'lo wo'}]
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        batch = seq[i:i+size]
        result.append({'start': i, 'content': batch})
        if i + size > n:
            break

    return result

In [19]:
def chunk_documents(
        documents: Iterable[Dict[str, str]],
        size: int = 2000,
        step: int = 1000,
        content_field_name: str = 'content'
) -> List[Dict[str, str]]:
    """
    Split a collection of documents into smaller chunks using sliding windows.

    Takes documents and breaks their content into overlapping chunks while preserving
    all other document metadata (filename, etc.) in each chunk.

    Args:
        documents: An iterable of document dictionaries. Each document must have a content field.
        size (int, optional): The maximum size of each chunk. Defaults to 2000.
        step (int, optional): The step size between chunks. Defaults to 1000.
        content_field_name (str, optional): The name of the field containing document content.
                                          Defaults to 'content'.

    Returns:
        list: A list of chunk dictionaries. Each chunk contains:
            - All original document fields except the content field
            - 'start': Starting position of the chunk in original content
            - 'content': The chunk content
    Example:
        >>> documents = [{'content': 'long text...', 'filename': 'doc.txt'}]
        >>> chunks = chunk_documents(documents, size=100, step=50)
        >>> # Or with custom content field:
        >>> documents = [{'text': 'long text...', 'filename': 'doc.txt'}]
        >>> chunks = chunk_documents(documents, content_field_name='text')
    """
    results = []

    for doc in documents:
        doc_copy = doc.copy()
        doc_content = doc_copy.pop(content_field_name)
        chunks = sliding_window(doc_content, size=size, step=step)
        for chunk in chunks:
            chunk.update(doc_copy)
        results.extend(chunks)

    return results


In [20]:
chunks = chunk_documents(parsed_data)

In [22]:
chunks[3]

{'start': 2000,
 'content': 'eleases/tag/v0.7.6).\n</Update>\n\n<Update label="2025-05-09" description="Evidently v0.7.5">\n  ## **Evidently 0.7.5**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.7.5).\n</Update>\n\n<Update label="2025-05-05" description="Evidently v0.7.4">\n  ## **Evidently 0.7.4**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.7.4).\n</Update>\n\n<Update label="2025-04-25" description="Evidently v0.7.3">\n  ## **Evidently 0.7.3**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.7.3).\n</Update>\n\n<Update label="2025-04-22" description="Evidently v0.7.2">\n  ## **Evidently 0.7.2**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.7.2).\n</Update>\n\n<Update label="2025-04-21" description="Evidently v0.7.1">\n  ## **Evidently 0.7.1**\n\n  Full release notes on [Github](https://github.com/evid

In [23]:
from minsearch import Index

In [24]:
index = Index(
    text_fields=["content", "filename", "title", "description"],
)

index.fit(chunks)

<minsearch.minsearch.Index at 0x78524c9e52e0>

In [25]:
search_results = index.search('how do I use llm-as-a-judge for evals')

In [26]:
def search(query):
    return index.search(
        query=query,
        num_results=15
    )

In [27]:
question = 'how do I use llm-as-a-judge for evals'

In [28]:
instructions = """
You're an assistant that helps with the documentation.
Answer the QUESTION based on the CONTEXT from the search engine of our documentation.

Use only the facts from the CONTEXT when answering the QUESTION.

When answering the question, provide the reference to the file with the source.
Use the filename field for that. The repo url is: https://github.com/evidentlyai/docs/
Include code examples when relevant. 
If the question is discussed in multiple documents, cite all of them.

Don't use markdown or any formatting in the output.
""".strip()

prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

In [29]:
import json

def build_prompt(question, search_results):
    context = json.dumps(search_results)

    prompt = prompt_template.format(
        question=question,
        context=context
    ).strip()
    
    return prompt

In [30]:
from openai import OpenAI

openai_client = OpenAI()

def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    messages = []

    if instructions:
        messages.append({
            "role": "system",
            "content": instructions
        })

    messages.append({
        "role": "user",
        "content": user_prompt
    })

    response = openai_client.responses.create(
        model=model,
        input=messages
    )

    return response.output_text

In [31]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    response = llm(prompt)
    return response

In [32]:
result = rag('How can I build an eval report with llm as a judge?')

In [33]:
print(result)

To build an evaluation report with an LLM (Large Language Model) as a judge, you can follow these steps:

### 1. **Installation and Imports**
First, ensure you have the `evidently` library installed:

```bash
pip install evidently
```

Then, import the necessary modules:

```python
import pandas as pd
import numpy as np

from evidently import Dataset, DataDefinition, Report, BinaryClassification
from evidently.presets import TextEvals
from evidently.metrics import *
from evidently.llm.templates import BinaryClassificationPromptTemplate
```

### 2. **Set Up Your OpenAI Key**
Before using the LLM, set your OpenAI API key in your environment:

```python
import os
os.environ["OPENAI_API_KEY"] = "YOUR_KEY"
```

### 3. **Create the Evaluation Dataset**
Create a dataset with your specific evaluations, including questions, approved answers, new responses, and manual labels:

```python
data = [
    ["Question 1?", "Approved Answer 1", "New Response 1", "incorrect", "Reason 1"],
    ["Question 2