# Testing LangChain and LangSmith on LLM Models

This notebook documents the process of integrating and testing LangChain and LangSmith with large language models (LLMs). We demonstrate how to load the necessary environment configurations, connect with APIs, and perform model interactions while tracing execution using LangSmith. The workflow involves setting up API keys, configuring environment variables, and running test cases to ensure end-to-end functionality with our LLM models.

## Load environment variables

In [None]:
import getpass
import os

try:
    # load environment variables from .env file (requires `python-dotenv`)
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    pass

# Set up LangSmith environment variables if not already set
os.environ["LANGSMITH_TRACING"] = "true"
if "LANGSMITH_API_KEY" not in os.environ:
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass(
        prompt="Enter your LangSmith API key (optional): "
    )
if "LANGSMITH_PROJECT" not in os.environ:
    os.environ["LANGSMITH_PROJECT"] = getpass.getpass(
        prompt='Enter your LangSmith Project Name (default = "default"): '
    )
    if not os.environ.get("LANGSMITH_PROJECT"):
        os.environ["LANGSMITH_PROJECT"] = "default"

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch, time, gc

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

max_memory = {0: "5GiB", "cpu": "25GiB"}

tok_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
model_id = tok_id

tokenizer = AutoTokenizer.from_pretrained(tok_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    low_cpu_mem_usage=True,
    attn_implementation="sdpa",
    quantization_config=bnb_config,
    max_memory=max_memory,
    device_map="auto",
)
model.eval()

# Safety: some Qwen-family tokenizers miss pad id; generation needs it
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
from typing import List, Dict, Any, Iterator
from langchain_core.runnables import Runnable, RunnableLambda, RunnableGenerator
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage
from langsmith import traceable, get_current_run_tree


def _apply_chat_template(messages: List[Dict[str, str]], max_prompt_tokens: int = 512):
    """Your exact formatting path: messages -> chat template -> tokenized tensors (on correct device)."""
    prompt_inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    )
    device = next(model.parameters()).device
    return {k: v.to(device) for k, v in prompt_inputs.items()}


def _decode_new_tokens(generated_ids, input_len):
    return tokenizer.decode(generated_ids[0][input_len:], skip_special_tokens=False)


@traceable(name="deepseek_r1_generate")  # LangSmith span
def _generate_once(
    messages: List[Dict[str, str]],
    max_new_tokens: int = 128,
    temperature: float = 0.2,
    top_p: float = 0.95,
    **gen_kwargs,
) -> str:
    """Single-shot generate with your timing + CUDA sync, traced in LangSmith."""
    inputs = _apply_chat_template(messages)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.perf_counter()

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=(temperature > 0),
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.pad_token_id,
        **gen_kwargs,
    )

    if torch.cuda.is_available():
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    # Collect basic perf stats
    gen_len = int(outputs.shape[-1] - inputs["input_ids"].shape[-1])
    toks_per_s = gen_len / dt if dt > 0 else float("inf")
    vram_alloc = (
        torch.cuda.memory_allocated() / 1e6 if torch.cuda.is_available() else 0.0
    )
    vram_reserved = (
        torch.cuda.memory_reserved() / 1e6 if torch.cuda.is_available() else 0.0
    )

    # `traceable` automatically creates the span; we can add metadata via return value
    text = _decode_new_tokens(outputs, inputs["input_ids"].shape[-1])

    # Add metadata directly to this run
    rt = get_current_run_tree()
    if rt:
        rt.metadata.update(
            {
                "model": "DeepSeek-R1-Distill-Qwen-7B",
                "quant": "4bit-nf4",
                "device": str(next(model.parameters()).device),
            }
        )
        rt.set(
            usage_metadata={
                "output_tokens": outputs.shape[-1] - inputs["input_ids"].shape[-1]
            }
        )

    return {
        "text": text,
        "metrics": {
            "latency_s": round(dt, 4),
            "gen_tokens": gen_len,
            "tokens_per_s": round(toks_per_s, 2),
            "vram_alloc_mb": round(vram_alloc, 1),
            "vram_reserved_mb": round(vram_reserved, 1),
        },
        "gen_params": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_p": top_p,
            **{
                k: v
                for k, v in gen_kwargs.items()
                if k in ("repetition_penalty", "top_k")
            },
        },
        "device_map": str(getattr(model, "hf_device_map", "unknown")),
        "dtype": str(getattr(model.config, "torch_dtype", "auto")),
    }


# LangChain Runnable (non-streaming)
lc_generate: Runnable[List[Dict[str, str]], Dict[str, Any]] = RunnableLambda(
    _generate_once
)


# Streaming variant using Transformers’ TextStreamer-like incremental decode
def _stream_tokens(
    messages: List[Dict[str, str]],
    max_new_tokens: int = 128,
    temperature: float = 0.2,
    top_p: float = 0.95,
    **gen_kwargs,
) -> Iterator[str]:
    inputs = _apply_chat_template(messages)
    input_len = inputs["input_ids"].shape[-1]

    # Greedy-ish incremental loop using `model.generate` streaming via `stopping_criteria` is messy;
    # simplest is `generate` then chunk decode. For true token-level streaming, prefer HF TextStreamer.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=(temperature > 0),
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.pad_token_id,
        **gen_kwargs,
    )
    decoded = _decode_new_tokens(outputs, input_len)
    # yield in chunks so LangChain callbacks still get pieces
    chunk_sz = 40
    for i in range(0, len(decoded), chunk_sz):
        yield decoded[i : i + chunk_sz]


lc_stream: Runnable[List[Dict[str, str]], Iterator[str]] = RunnableGenerator(
    _stream_tokens
)

In [10]:
# Your original messages
messages = [
    {
        "role": "user",
        "content": "Imagine you are asked to consider the following question: What are the key differences between supervised and unsupervised learning in machine learning? Please provide a detailed explanation.",
    }
]

# Single response
result = lc_generate.invoke(messages)
print(result["text"])
print(result["metrics"])  # latency, toks/s, VRAM, etc.

# Streaming response
for chunk in lc_stream.stream(messages):
    print(chunk, end="", flush=True)

Okay, so I need to figure out the key differences between supervised and unsupervised learning in machine learning. Hmm, I remember from my studies that these are two main types of machine learning, but I'm a bit fuzzy on the details. Let me try to break it down.

First, I think supervised learning involves using labeled data. That means the data has the correct answers or outputs already provided. So, the model is trained on this data to predict the outcomes. For example, if I'm trying to predict whether an email is spam or not, I would have a dataset where each email is labeled as spam or not spam.
{'latency_s': 11.0349, 'gen_tokens': 128, 'tokens_per_s': 11.6, 'vram_alloc_mb': 5588.1, 'vram_reserved_mb': 5943.3}
Okay, so I need to figure out how to solve this problem. Hmm, let me start by understanding what the problem is asking. It says, "Solve the equation 2x + 3 = 7." Alright, that seems straightforward. I've done similar problems before, but I want to make sure I approach it cor

In [11]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages(
    [("system", "You are a precise reasoning assistant."), ("human", "{question}")]
)


# Turn LC messages -> your HF messages dict format
def to_hf_messages(lc_messages) -> List[Dict[str, str]]:
    out = []
    for m in lc_messages:
        if m.type == "human":
            out.append({"role": "user", "content": m.content})
        elif m.type == "system":
            out.append({"role": "system", "content": m.content})
        elif m.type == "ai":
            out.append({"role": "assistant", "content": m.content})
    return out


chain = (
    prompt
    | (lambda vars: to_hf_messages(vars.to_messages()))  # LCEL lambda
    | lc_generate
)

res = chain.invoke({"question": "Summarize the Chinese Room critique in 6 lines."})
print(res["text"])

Okay, so I need to summarize the Chinese Room critique in six lines. Hmm, I remember the Chinese Room is a thought experiment by John Searle, right? It's about whether a machine can truly understand or have consciousness. Let me think about how it works.

So, the setup is a room with a set of physical symbols, like cards, and a person who follows rules to manipulate these symbols. The person doesn't understand the meaning behind the symbols, they just do the tasks. From the outside, it looks like the machine is understanding and processing information, but really, it's just following procedures.

The critique, I


In [13]:
def prompt_token_count(messages):
    enc = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,  # <— important
        truncation=True,
        max_length=512,
    )
    ids = enc["input_ids"]
    # handle (seq,) or (1, seq)
    if ids.dim() == 1:
        return int(ids.size(0))
    elif ids.dim() == 2:
        return int(ids.size(1))
    else:
        return int(ids.numel())


prompt_messages = messages  # from above
result = lc_generate.invoke(
    prompt_messages,
    config={
        "metadata": {
            "model": model_id,
            "quant": "4bit-nf4",
            "attn": "sdpa",
            "device": str(next(model.parameters()).device),
            "max_memory": {str(k): v for k, v in max_memory.items()},  # JSON-safe
            "prompt_tokens": prompt_token_count(prompt_messages),
        },
        "tags": ["exp", "deepseek-r1", "local-hf"],
    },
)

In [14]:
res = lc_generate.invoke(messages)
print(res["text"])
print("Perf:", res["metrics"])

# Your existing cleanup
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
del res
gc.collect()

Okay, so I need to figure out the key differences between supervised and unsupervised learning. Hmm, I remember from my studies that both are types of machine learning, but I'm a bit fuzzy on the exact distinctions. Let me try to break this down.

First, I think supervised learning involves using labeled data. That means the data has known outcomes or answers. Like, if I'm trying to predict whether an email is spam or not, each email would have a label: spam or not spam. The model learns from these labeled examples to make predictions on new, unseen data. So, the key here is that the data has
Perf: {'latency_s': 11.2741, 'gen_tokens': 128, 'tokens_per_s': 11.35, 'vram_alloc_mb': 5588.1, 'vram_reserved_mb': 5943.3}


3237