In [14]:
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='main', timeout=30):
    """
    Download and parse all markdown files from a GitHub repository.

    The branch is fetched as one zip archive from codeload.github.com and
    read entirely in memory; nothing is written to disk.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch whose archive is downloaded (defaults to 'main')
        timeout: Seconds to wait on the HTTP request, passed to requests.get

    Returns:
        List of dictionaries, one per .md/.mdx file, holding whatever
        frontmatter's to_dict() yields for that file plus a 'filename' key
        with the file's path inside the archive

    Raises:
        RuntimeError: if the download does not answer with HTTP 200
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    # Without a timeout, requests waits indefinitely on a stalled connection.
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise RuntimeError(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    # The context manager closes the archive even if the loop is interrupted.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            # Only markdown sources are kept; the extension match ignores case.
            if not filename.lower().endswith(('.md', '.mdx')):
                continue

            try:
                with zf.open(file_info) as f_in:
                    # Undecodable bytes are dropped instead of failing the file.
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                # Best effort: report the problematic file and keep going.
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data

In [15]:
# Fetch the two documentation sets used in the rest of the notebook
# (each call downloads a repository archive, so network access is required).
dtc_faq = read_repo_data(repo_owner='DataTalksClub', repo_name='faq')
evidently_docs = read_repo_data(repo_owner='evidentlyai', repo_name='docs')

In [16]:
# Report how many markdown documents were parsed from each repository.
print(f"FAQ documents: {len(dtc_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")

# Optional: uncomment to inspect a single parsed entry.
# print(dtc_faq[0].keys())     # see what fields each entry has
# print(dtc_faq[0]['filename'])
# print(dtc_faq[0]['content'][:300])  # preview first 300 characters


FAQ documents: 1224
Evidently documents: 95


Note: Splitting on level-1 headings may not work perfectly when a document contains Python code, because `#` comment lines look like headings. In general, though, this is not a big problem for documentation.

If we want to split by second-level headers, that's what we do:


# 🧠 Day 2 – Chunking and Intelligent Processing for Data

In this notebook, you’ll learn to prepare long documents for AI systems by *chunking* them — breaking large text into smaller, meaningful pieces.  
We’ll implement and compare two practical methods:
- **Sliding-window chunking** (for unstructured text)  
- **Section-based splitting** (for markdown documents with headings)  
and then build a **hybrid function** that picks the right one automatically.

## 🧩 1️⃣ Setup Environment


In [39]:
from dotenv import load_dotenv
load_dotenv()   # loads OPENAI_API_KEY and other vars from .env (safe, local)

import os, re, json
from tqdm.auto import tqdm


## 📥 2️⃣ Load or Reuse Your Data

If you followed Day 1, you already have your repository data stored in a list such as `evidently_docs` or `repository_data`.  
Let’s assume you’re continuing from Day 1:


In [40]:
# Example placeholder if you don't have Day 1 loaded in this notebook:
# from day_01 import read_repo_data
# evidently_docs = read_repo_data("evidentlyai", "docs")

# If you already have the data loaded from previous steps:
# show how many documents are available and which fields the first one has.
print(f"Number of docs: {len(evidently_docs)}")
print(evidently_docs[0].keys())


Number of docs: 95
dict_keys(['title', 'openapi', 'content', 'filename'])


## 🪟 3️⃣ Simple Sliding-Window Chunking

Use this when the text has no headings and you simply want evenly sized overlapping chunks.


In [41]:
def sliding_window(seq, size=2000, step=1000):
    """
    Cut a sequence into windows of at most `size` items, starting every `step` items.

    Args:
        seq: Any sliceable sequence with a length (for example a string)
        size: Maximum number of items per window
        step: Distance between the starts of consecutive windows

    Returns:
        List of dicts, each with 'start' (offset of the window in `seq`) and
        'chunk' (the slice beginning at that offset). An empty `seq` gives [].

    Raises:
        ValueError: if `size` or `step` is zero or negative
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []
    for start in range(0, total, step):
        end = start + size
        windows.append({'start': start, 'chunk': seq[start:end]})
        # Once a window reaches the end of the input, later windows would
        # only repeat its tail, so stop here.
        if end >= total:
            break
    return windows


In [42]:
# Quick test: 200 characters, 20-character windows that advance 10 at a time
text = "0123456789" * 20
chunks = sliding_window(text, size=20, step=10)
print(f"Chunks created: {len(chunks)}")
print(chunks[:2])

# Pin the expected result so a regression fails loudly instead of only printing.
assert len(chunks) == 19
assert [c['start'] for c in chunks] == list(range(0, 181, 10))
assert all(c['chunk'] == text[c['start']:c['start'] + 20] for c in chunks)


Chunks created: 19
[{'start': 0, 'chunk': '01234567890123456789'}, {'start': 10, 'chunk': '01234567890123456789'}]


## 🪶 4️⃣ Section-Based Splitting (for Markdown)

Use this when your files contain headings such as `## Section Title`.
