# 07. Software Development and Data Analysis Agents

# 安装依赖

In [None]:
%uv pip install langchain~=0.3 langchain-core~=0.3 langchain-community~=0.3 langchain-openai~=0.3 langgraph~=0.6

In [None]:
%uv pip install langchain-anthropic~=0.3

In [None]:
%uv pip install python-dotenv~=1.1

工具类

In [None]:
import os

import dotenv
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic


class Config:
    """Read model settings from the environment / a ``.env`` file and build LLM clients.

    Required variables (``ValueError`` is raised when one is missing or empty):
    ``OPENAI_API_KEY``, ``OPENAI_API_BASE_URL``, ``OPENAI_MODEL`` and
    ``OPENAI_CODER_MODEL``.

    Optional variables: ``ANTHROPIC_API_KEY``, ``ANTHROPIC_BASE_URL``,
    ``ANTHROPIC_MODEL``, ``OPENAI_EMBEDDINGS_MODEL`` (only checked when an
    embeddings client is requested) and ``HF_PRETRAINED_EMBEDDINGS_MODEL``
    (falls back to ``"Qwen/Qwen3-Embedding-8B"``).

    Every ``new_*`` factory forwards ``**kwargs`` to the client constructor;
    a keyword supplied by the caller takes precedence over the configured
    default, so e.g. ``new_openai_like(model="other-model")`` is valid.
    """

    def __init__(self):
        # By default, load_dotenv doesn't override existing environment variables.
        # The .env file is located first so that a missing file is reported explicitly.
        dotenv_path = dotenv.find_dotenv(usecwd=True)
        if not dotenv_path:
            raise ValueError("No .env file found")
        dotenv.load_dotenv(dotenv_path=dotenv_path)

        # Required settings; each failure names the variable that is actually missing.
        self.api_key = self._require_env("OPENAI_API_KEY")
        self.base_url = self._require_env("OPENAI_API_BASE_URL")
        self.model = self._require_env("OPENAI_MODEL")
        self.coder_model = self._require_env("OPENAI_CODER_MODEL")

        # Optional settings; these stay None when the variable is not defined.
        self.anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
        self.anthropic_base_url = os.getenv("ANTHROPIC_BASE_URL")
        self.anthropic_model = os.getenv("ANTHROPIC_MODEL")

        self.embeddings_model = os.getenv("OPENAI_EMBEDDINGS_MODEL")

        # An unset or empty value falls back to the default tokenizer model.
        self.hf_pretrained_embeddings_model = (
            os.getenv("HF_PRETRAINED_EMBEDDINGS_MODEL") or "Qwen/Qwen3-Embedding-8B"
        )

    @staticmethod
    def _require_env(name: str) -> str:
        """Return the value of environment variable *name*.

        Raises:
            ValueError: if the variable is unset or empty.
        """
        value = os.getenv(name)
        if not value:
            raise ValueError(f"{name} is not set")
        return value

    def new_anthropic(self, **kwargs) -> ChatAnthropic:
        """Build a ``ChatAnthropic`` client from the ``ANTHROPIC_*`` settings.

        Keyword arguments override the configured defaults.
        """
        # Reference: https://bailian.console.aliyun.com/?tab=api#/api/?type=model&url=2587654
        # Reference: https://help.aliyun.com/zh/model-studio/models
        defaults = {
            "api_key": self.anthropic_api_key,
            "base_url": self.anthropic_base_url,
            "model": self.anthropic_model,
        }
        # Merging (instead of passing both explicitly) avoids a
        # "multiple values for keyword argument" TypeError on overrides.
        return ChatAnthropic(**{**defaults, **kwargs})

    def new_openai_like_coder(self, **kwargs) -> ChatOpenAI:
        """Build a ``ChatOpenAI`` client that uses the coder model.

        Keyword arguments override the configured defaults.
        """
        # Reference: https://bailian.console.aliyun.com/?tab=api#/api/?type=model&url=2587654
        # Reference: https://help.aliyun.com/zh/model-studio/models
        # ChatOpenAI docs: https://python.langchain.com/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html#langchain_openai.chat_models.base.ChatOpenAI
        defaults = {
            "api_key": self.api_key,
            "base_url": self.base_url,
            "model": self.coder_model,
        }
        return ChatOpenAI(**{**defaults, **kwargs})

    def new_openai_like(self, **kwargs) -> ChatOpenAI:
        """Build a ``ChatOpenAI`` client that uses the general chat model.

        Keyword arguments override the configured defaults.
        """
        # Reference: https://bailian.console.aliyun.com/?tab=api#/api/?type=model&url=2587654
        # Reference: https://help.aliyun.com/zh/model-studio/models
        # ChatOpenAI docs: https://python.langchain.com/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html#langchain_openai.chat_models.base.ChatOpenAI
        defaults = {
            "api_key": self.api_key,
            "base_url": self.base_url,
            "model": self.model,
        }
        return ChatOpenAI(**{**defaults, **kwargs})

    def new_openai_like_embeddings(self, **kwargs) -> OpenAIEmbeddings:
        """Build an ``OpenAIEmbeddings`` client for the configured embeddings model.

        Keyword arguments override the configured defaults.

        Raises:
            ValueError: if ``OPENAI_EMBEDDINGS_MODEL`` is not set.
        """
        if not self.embeddings_model:
            raise ValueError("OPENAI_EMBEDDINGS_MODEL is not set")

        # Reference: https://python.langchain.com/api_reference/openai/embeddings/langchain_openai.embeddings.base.OpenAIEmbeddings.html#langchain_openai.embeddings.base.OpenAIEmbeddings
        defaults = {
            "api_key": self.api_key,
            "base_url": self.base_url,
            "model": self.embeddings_model,
            # https://python.langchain.com/api_reference/openai/embeddings/langchain_openai.embeddings.base.OpenAIEmbeddings.html#langchain_openai.embeddings.base.OpenAIEmbeddings.tiktoken_enabled
            # For implementations other than the official OpenAI one, set this to False;
            # tokenization then falls back to the Hugging Face transformers AutoTokenizer.
            "tiktoken_enabled": False,
            # https://python.langchain.com/api_reference/openai/embeddings/langchain_openai.embeddings.base.OpenAIEmbeddings.html#langchain_openai.embeddings.base.OpenAIEmbeddings.model
            # Tokenizer model name. Jina's https://huggingface.co/jinaai/jina-embeddings-v4
            # was suggested as the closest match to text-embedding-ada-002; the default
            # here is Qwen/Qwen3-Embedding-8B by personal preference.
            "tiktoken_model_name": self.hf_pretrained_embeddings_model,
        }
        return OpenAIEmbeddings(**{**defaults, **kwargs})


def must_get_hfh_api_token() -> str:
    """Return the Hugging Face Hub API token from the environment or a ``.env`` file.

    Raises:
        ValueError: if no ``.env`` file can be found, or if
            ``HUGGINGFACEHUB_API_TOKEN`` is unset or empty after loading it.
    """
    # By default, load_dotenv doesn't override existing environment variables.
    env_file = dotenv.find_dotenv(usecwd=True)
    if not env_file:
        raise ValueError("No .env file found")
    dotenv.load_dotenv(dotenv_path=env_file)

    token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    if token:
        return token

    raise ValueError("HUGGINGFACEHUB_API_TOKEN is not set")

## LLMs in software development
### The future of development
### Implementation considerations
### Evolution of code LLMs
### Benchmarks for code LLMs
### LLM-based software engineering approaches
### Security and risk mitigation
### Validation framework for LLM-generated code
### LangChain integrations
## Writing code with LLMs
### Google generative AI
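Google Generative AI is replaced here by Qwen, accessed through the OpenAI-compatible client (`Config().new_openai_like()`).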

In [None]:
# FizzBuzz task statement, sent to the chat model verbatim.
question = """
Given an integer n, return a string array answer (1-indexed) where:

answer[i] == "FizzBuzz" if i is divisible by 3 and 5.
answer[i] == "Fizz" if i is divisible by 3.
answer[i] == "Buzz" if i is divisible by 5.
answer[i] == i (as a string) if none of the above conditions are true.
"""

llm = Config().new_openai_like()

# Print only the reply text so that it is rendered with its line breaks.
response = llm.invoke(question)
print(response.content)

### Hugging Face [没跑通]

In [None]:
# Input prompt for the code models in the following cells: the header and
# docstring of an unfinished function, left for the model to complete.
text = """
def calculate_primes(n):
    \"\"\"Create a list of consecutive integers from 2 up to N.

    For example:
    >>> calculate_primes(20)
    Output: [2, 3, 5, 7, 11, 13, 17, 19]
    \"\"\"
"""

In [None]:
%uv pip install huggingface-hub~=0.35

In [None]:
# HuggingFaceHub is provided by langchain_community; importing it through
# ``langchain.llms`` only works via a deprecated compatibility alias.
from langchain_community.llms import HuggingFaceHub

# Code-generation model hosted on the Hugging Face Hub
repo_id = "bigcode/starcoder"

# Initialize the HuggingFaceHub LLM
llm = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    model_kwargs={"temperature": 0.5, "max_length": 1000},
    huggingfacehub_api_token=must_get_hfh_api_token(),
)

# Use the LangChain LLM to complete the function stub in ``text``
output = llm.invoke(text)
print(output)

In [None]:
%uv pip install langchain-huggingface~=0.3

In [None]:
from langchain_huggingface import HuggingFaceEndpoint

# Code-generation model hosted on the Hugging Face Hub
repo_id = "bigcode/starcoder"

# HuggingFaceEndpoint replaces the HuggingFaceHub wrapper used in the previous cell.
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    temperature=0.5,
    huggingfacehub_api_token=must_get_hfh_api_token(),
    # "huggingface" is not a valid provider id; Hugging Face's own inference
    # service is named "hf-inference". Other ids (e.g. "hyperbolic", "nebius",
    # "together") or "auto" can be used instead, depending on what is enabled
    # at hf.co/settings/inference-providers.
    provider="hf-inference",
)

# Use the LangChain LLM to complete the function stub in ``text``
output = llm.invoke(text)
print(output)

### Anthropic

In [None]:
from langchain_core.prompts.prompt import PromptTemplate

# Step-by-step wrapper around the single ``question`` variable.
template = (
    "Question: {question}\n"
    "Let's think step by step.\n"
    "\n"
    "Answer:\n"
)

prompt = PromptTemplate(input_variables=["question"], template=template)
llm = Config().new_anthropic()

# Pipe the rendered prompt into the model and print only the reply text.
llm_chain = prompt | llm
answer = llm_chain.invoke(text)
print(answer.content)

### Agentic approach

In [None]:
%uv pip install langchain-experimental~=0.3

In [None]:
from langchain.agents import initialize_agent, AgentType
from langchain_experimental.tools import PythonREPLTool

llm = Config().new_openai_like()

# The agent gets a single tool: a Python REPL it can use to evaluate code.
tools = [PythonREPLTool()]
agent = initialize_agent(
    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True
)
# ``invoke`` replaces the deprecated direct call ``agent(...)``; it returns
# the same dict of inputs and outputs.
result = agent.invoke({"input": "What is 2 + 2?"})

print(result)

In [None]:
from langchain.agents import initialize_agent, AgentType

llm = Config().new_openai_like()

# Reuses the ``tools`` list (Python REPL) created in the previous cell.
agent = initialize_agent(
    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True
)
# ``invoke`` replaces the deprecated direct call ``agent(...)``; it returns
# the same dict of inputs and outputs.
result = agent.invoke({"input": "What are the prime numbers until 20?"})
print(result)

### Documentation RAG

In [None]:
%uv pip install beautifulsoup4~=4.14 lxml~=6.0 nest-asyncio~=1.6

In [None]:
import nest_asyncio
from langchain_community.document_loaders import DocusaurusLoader

# Applied before loading; presumably needed because the loader uses asyncio
# inside the notebook's already-running event loop (confirm).
nest_asyncio.apply()

docs_site_url = "https://python.langchain.com"
loader = DocusaurusLoader(docs_site_url)
# TODO: fix the very slow loading caused by insufficient memory
documents = loader.load()

In [None]:
# Show the first loaded document to sanity-check the loader output.
documents[0]

In [None]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore


# Embedding vectors are persisted under ./_cache/ on the local filesystem.
store = LocalFileStore("./_cache/")

underlying_embeddings = Config().new_openai_like_embeddings()

# Cache the embeddings to avoid paying for the same text twice; the
# namespace is the embeddings model name.
cache_namespace = underlying_embeddings.model
embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings,
    store,
    namespace=cache_namespace,
)

In [None]:
%uv pip install langchain-text-splitters~=0.3

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Chunking parameters: pieces of up to 1000 characters (measured with len)
# that overlap by 20 characters; separators are treated as literal text.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 20,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
splits = text_splitter.split_documents(documents)

In [None]:
%uv pip install langchain-chroma~=0.2

In [None]:
from langchain_chroma import Chroma

# Embed every chunk (through the caching embedder) and index it in Chroma.
vectorstore = Chroma.from_documents(
    embedding=embeddings,
    documents=splits,
)

In [None]:
# Build the client directly so that ``model`` is specified exactly once:
# supplying it both explicitly and through **kwargs in the same constructor
# call raises "TypeError: got multiple values for keyword argument 'model'".
config = Config()
llm = ChatOpenAI(
    api_key=config.api_key,
    base_url=config.base_url,
    model="qwen-plus-2025-09-11",
)

In [None]:
from langchain import hub

# Pull the shared RAG prompt from the hub and expose the vector store as a retriever.
prompt = hub.pull("rlm/rag-prompt")
retriever = vectorstore.as_retriever()

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Inputs of the prompt: the retrieved documents rendered as one text block,
# and the user's question passed through unchanged.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Prompt -> model -> plain string output.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [None]:
# Ask a question that is answered from the indexed documentation.
rag_chain.invoke("What is Task Decomposition?")

### Repository RAG

In [None]:
%uv pip install GitPython~=3.1

In [None]:
import os
from git import Repo


# Local checkout directory, relative to the current working directory.
repo_path = "_generative_ai_with_langchain"

if os.path.isdir(repo_path):
    # Cloning into an existing directory fails, so reuse the checkout when
    # the cell is run again.
    repo = Repo(repo_path)
else:
    # Clone the book repository from GitHub
    repo = Repo.clone_from(
        "https://github.com/benman1/generative_ai_with_langchain", to_path=repo_path
    )

In [None]:
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# Load every .py file found anywhere below the cloned repository.
python_parser = LanguageParser(language="python", parser_threshold=500)
loader = GenericLoader.from_filesystem(
    repo_path,
    glob="**/*",
    suffixes=[".py"],
    parser=python_parser,
)
documents = loader.load()

# Split the documents into chunks for embedding and vector storage,
# using Python-specific separators.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=0,
)
texts = python_splitter.split_documents(documents)

In [None]:
%uv pip install transformers~=4.56

In [None]:
# TODO: fix the execution failure of this cell

from langchain_core.prompts import ChatPromptTemplate

# Use the dedicated langchain_chroma package, as the documentation RAG section
# does; the Chroma class in langchain_community.vectorstores is deprecated.
from langchain_chroma import Chroma

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

# Store the code chunks in a vector store
db = Chroma.from_documents(texts, Config().new_openai_like_embeddings())
# Retrieve with maximal marginal relevance ("mmr"), 8 chunks per query.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 8})

# Create a retrieval chain for Q&A over code
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Answer the user's questions based on the below context:\n\n{context}",
        ),
        ("placeholder", "{chat_history}"),
        ("user", "{input}"),
    ]
)
llm = Config().new_openai_like()

# The retrieved documents are inserted into {context} of the prompt.
document_chain = create_stuff_documents_chain(llm, prompt)
qa = create_retrieval_chain(retriever, document_chain)

In [None]:
# Ask a question from the book
question = "What examples are in the code related to software development?"
result = qa.invoke({"input": question})
print(result["answer"])

## Applying LLM agents for data science
### Training an ML model
#### Setting up a Python-capable agent

In [None]:
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits.python.base import create_python_agent
from langchain_experimental.tools.python.tool import PythonREPLTool


# The coder model drives an agent whose tool is a Python REPL.
llm = Config().new_openai_like_coder()
python_repl = PythonREPLTool()

agent_executor = create_python_agent(
    llm=llm,
    tool=python_repl,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

#### Asking the agent to build a neural network

In [None]:
# Task description handed to the Python agent.
task = """Understand, write a single neuron neural network in PyTorch.
Take synthetic data for y=2x. Train for 1000 epochs and print every 100 epochs.
Return prediction for x = 5"""

# ``invoke`` replaces the deprecated ``run``; the final answer is stored
# under the "output" key of the returned dict.
result = agent_executor.invoke({"input": task})["output"]

print(result)

#### Agent execution and results

### Analyzing a dataset
#### Creating a pandas DataFrame agent

In [None]:
%uv pip install pandas~=2.3 scikit-learn~=1.7 tabulate~=0.9

In [None]:
from sklearn.datasets import load_iris

# Keep only the feature table of the iris dataset as a pandas DataFrame.
iris = load_iris(as_frame=True)
df = iris["data"]

# Write a CSV copy without the index column.
df.to_csv("iris.csv", index=False)

In [None]:
# Print the iris feature table for a quick look at the data.
print(df)

In [None]:
from langchain_experimental.agents.agent_toolkits.pandas.base import (
    create_pandas_dataframe_agent,
)

# Import PromptTemplate from langchain_core, as the Anthropic section does;
# the top-level ``from langchain import PromptTemplate`` is a deprecated alias.
from langchain_core.prompts import PromptTemplate

# Wrapper applied to every query sent to the DataFrame agent.
PROMPT = (
    "If you do not know the answer, say you don't know.\n"
    "Think step by step.\n"
    "\n"
    "Below is the query.\n"
    "Query: {query}\n"
)
prompt = PromptTemplate(template=PROMPT, input_variables=["query"])

# The general chat model is used here; Config().new_openai_like_coder()
# can be substituted to try the coder model instead.
llm = Config().new_openai_like()

# allow_dangerous_code=True is passed explicitly: the agent executes
# model-generated Python code against ``df``.
agent = create_pandas_dataframe_agent(llm, df, verbose=True, allow_dangerous_code=True)

#### Asking questions about the dataset

In [None]:
# ``invoke`` replaces the deprecated ``run``; "output" holds the final answer.
agent.invoke({"input": prompt.format(query="What's this dataset about?")})["output"]

In [None]:
# ``invoke`` replaces the deprecated ``run``; "output" holds the final answer.
agent.invoke(
    {
        "input": prompt.format(
            query="Which row has the biggest difference between petal length and petal width?"
        )
    }
)["output"]

In [None]:
%uv pip install matplotlib~=3.10

In [None]:
# ``invoke`` replaces the deprecated ``run``; "output" holds the final answer.
agent.invoke(
    {"input": prompt.format(query="Show the distributions for each column visually!")}
)["output"]