# 02.2 - Information Extraction using LangChain

In this notebook, we explore how to use LangChain's structured-output support with Pydantic models for information extraction. We define a simple Pydantic model that describes disease modeling information and use it to extract structured fields from paper abstracts.

In [None]:
# Install or upgrade LangChain and its community, experimental, OpenAI and Ollama
# packages in the kernel's environment (quietly).
# NOTE(review): versions are not pinned, so results may differ between runs — consider pinning.
%pip install --upgrade --quiet langchain langchain-community langchain-experimental langchain-openai langchain-ollama

In [None]:
# Install or upgrade python-dotenv, which a later cell imports as `dotenv` to read the .env file.
# NOTE(review): version is not pinned — consider pinning.
%pip install --upgrade --quiet python-dotenv

Load the papers classified as modeling papers from a JSON file into a list of dictionaries, for later use in the pipeline.

In [None]:
import json
from genscai import paths

# Read the modeling-paper records from `paths.data`.
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the JSON file.
with open(paths.data / "training_modeling_papers.json", "r", encoding="utf-8") as f:
    papers = json.load(f)

# Show how many records were loaded as the cell output.
len(papers)

In [None]:
import dotenv
from genscai import paths

# Load environment variables from the .env file located one level above
# `paths.root`; the call's return value is displayed as the cell output.
dotenv.load_dotenv(dotenv_path=paths.root / "../.env")

In [None]:
from langchain.chat_models import init_chat_model

# Option 1: hosted OpenAI chat model.
# NOTE(review): presumably relies on an OpenAI API key loaded from the .env file — confirm.
llm = init_chat_model(model="gpt-4o-mini", model_provider="openai")

In [None]:
from langchain.chat_models import init_chat_model

# Option 2: chat model served through Ollama.
# This rebinds `llm`, so when the notebook is run top to bottom the cells below
# use this model rather than the OpenAI one created in the previous cell.
llm = init_chat_model(model="gemma3:12b", model_provider="ollama")

In [None]:
from pydantic import BaseModel, Field
from typing import List


class DiseaseModelingInformation(BaseModel):
    """Disease modeling information extracted from the abstract of a scientific paper."""

    # The class docstring and each Field description become part of the schema
    # that is handed to the LLM via `with_structured_output`, so they serve as
    # extraction guidance for the model as well as documentation for readers.
    diseases_modeled: List[str] = Field(description="diseases modeled using a disease modeling technique")
    disease_modeling_goals: List[str] = Field(description="goals of disease modeling")
    disease_modeling_techniques: List[str] = Field(description="disease modeling techniques used to model diseases")
    disease_modeling_locations: List[str] = Field(
        description="continents, countries, regions, and cities where disease modeling was performed"
    )


# Number of abstracts to run through the extractor; each one triggers a model call.
MAX_PAPERS = 5

# Bind the output schema once: the wrapped model does not depend on the paper,
# so there is no need to rebuild it on every iteration.
structured_llm = llm.with_structured_output(DiseaseModelingInformation)

for paper in papers[:MAX_PAPERS]:
    abstract = paper["abstract"]

    # Show the source text first so the extracted fields can be checked against it.
    print(abstract)
    print()

    results = structured_llm.invoke(abstract)

    print(results)
    print("\n\n")