In [1]:


from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language

# Input for every splitter in this notebook: the Gmail pytest suite kept under ./docs.
py_file = Path("./docs/gmail_pytest_suite.py")
source = py_file.read_text(encoding="utf-8")

# Recursive character splitter configured for Python source.
# chunk_size / chunk_overlap are counted in characters, not tokens.
splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=500, chunk_overlap=50
)

# Cut the file into overlapping, language-aware character chunks.
lan_chunks = splitter.split_text(source)

# Report how many chunks came out and show the second one as a sample.
print(f"🧩 Chunks created: {len(lan_chunks)}")
print(lan_chunks[1])

# Alias under which the chunks are exposed to later steps.
language_documents = lan_chunks


🧩 Chunks created: 150
def test_login_unregistered_email_002(requests_mock):
    """unregistered email rejected"""
    url = "https://accounts.google.com/v3/signin/verify"
    payload = {"email": "nouser@example.com", "password": "any_pw"}
    requests_mock.post(url, status_code=404)
    import requests
    resp = requests.post(url, json=payload)
    assert resp.status_code == 404


In [2]:


from langchain_text_splitters import TokenTextSplitter
import tiktoken

# Read the Gmail pytest suite from docs again and report its size in characters.
py_file = Path("./docs/gmail_pytest_suite.py")
source = py_file.read_text(encoding="utf-8")
print(f"Characters in file: {len(source)}")

# tiktoken encoder for the same encoding the splitter uses; it tokenizes the
# whole file here and each chunk further down.
enc = tiktoken.get_encoding("cl100k_base")
source_tokens = enc.encode(source)

# Token-based splitter: chunk_size and chunk_overlap are counted in tokens
# of the cl100k_base tiktoken encoding.
splitter = TokenTextSplitter(encoding_name="cl100k_base", chunk_size=100, chunk_overlap=20)

# Produce overlapping token chunks and record the token length of each one.
tik_chunks = splitter.split_text(source)
chunk_token_counts = [len(tokens) for tokens in map(enc.encode, tik_chunks)]

# Report how many chunks came out and show the first one as a sample.
print(f"🧩 Chunks created: {len(tik_chunks)}")
print(tik_chunks[0])

# Alias under which the chunks are exposed to downstream steps.
token_documents = tik_chunks

Characters in file: 52648
🧩 Chunks created: 161
# Gmail Login – 150 PyTest Cases

def test_login_valid_001(requests_mock):
    """valid login redirects to inbox"""
    url = "https://accounts.google.com/v3/signin/verify"
    payload = {"email": "user@example.com", "password": "correct_pw"}
    requests_mock.post(url, status_code=200)
    import requests
    resp = requests.post(url, json=payload)
    assert resp.status_code == 200


def test_login


## Embedding-based SemanticChunker (LangChain Experimental)

Now we use the embedding-driven SemanticChunker from langchain_experimental with HuggingFace embeddings and compare results against earlier strategies.

In [3]:
# Install the packages needed for the embedding-based SemanticChunker cell into the kernel's environment.
# NOTE(review): versions are unpinned here; consider pinning them so the notebook stays reproducible.
%pip install -q langchain langchain-huggingface langchain-experimental sentence-transformers transformers torch


Note: you may need to restart the kernel to use updated packages.


In [4]:


from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers model used to embed each text segment.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create the semantic chunker (tune params as needed).
# breakpoint_threshold_type must be one of the values SemanticChunker supports:
# "percentile", "standard_deviation", "interquartile" or "gradient". Any other
# value raises ValueError as soon as a text with more than one sentence is split.
# NOTE(review): the default sentence_split_regex only splits on whitespace that
# follows '.', '?' or '!'. Source code rarely contains such boundaries, so a
# Python file may come back as a single chunk; pass sentence_split_regex to
# segment code differently.
sem_chunker = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=95,  # break where the distance exceeds the 95th percentile
)

# Produce semantic chunks from `source`
sem_chunks = sem_chunker.split_text(source)


print(f"Semantic chunks: {len(sem_chunks)}")
print(f"First chunk preview:\n{sem_chunks[0][:1000]}")


# Expose for downstream steps
semantic_documents = sem_chunks


  from .autonotebook import tqdm as notebook_tqdm


Semantic chunks: 1
First chunk preview:
# Gmail Login – 150 PyTest Cases

def test_login_valid_001(requests_mock):
    """valid login redirects to inbox"""
    url = "https://accounts.google.com/v3/signin/verify"
    payload = {"email": "user@example.com", "password": "correct_pw"}
    requests_mock.post(url, status_code=200)
    import requests
    resp = requests.post(url, json=payload)
    assert resp.status_code == 200


def test_login_unregistered_email_002(requests_mock):
    """unregistered email rejected"""
    url = "https://accounts.google.com/v3/signin/verify"
    payload = {"email": "nouser@example.com", "password": "any_pw"}
    requests_mock.post(url, status_code=404)
    import requests
    resp = requests.post(url, json=payload)
    assert resp.status_code == 404


def test_login_valid_003(requests_mock):
    """valid login redirects to inbox"""
    url = "https://accounts.google.com/v3/signin/verify"
    payload = {"email": "user@example.com", "password": "correct_pw"}

### Why Semantic Chunking Produces Only One Chunk

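SemanticChunker first splits the text into "sentences" with a regular expression (by default, whitespace that follows `.`, `?`, or `!`) and only then compares the embeddings of neighbouring sentences. Python source code contains almost no such sentence boundaries, so the whole file is most likely treated as a single sentence and returned unchanged as one chunk. Even with a code-aware split pattern, the pytest test cases have very similar semantic content - they all follow the same pattern (mock setup, API call, assertion). Since the embeddings would be nearly identical for adjacent test functions, SemanticChunker would detect few meaningful breakpoints and would still tend to treat the content as one semantically coherent chunk. For test code, syntactic chunking (by function boundaries) is more effective than semantic chunking.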