## PandasDataFrameOutputParser

In [1]:
from dotenv import load_dotenv

# Load settings from a .env file into the environment before any client is created.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

True

In [2]:
import pprint
from typing import Any, Dict

import pandas as pd
from langchain.output_parsers import PandasDataFrameOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

In [3]:
# Chat model used by the chain built later in this notebook.
# temperature=0 is passed to ask for the least random sampling the model offers.
model = ChatOpenAI(model="gpt-5-nano", temperature=0)

In [4]:
def format_parser_output(parser_output: Dict[str, Any]) -> None:
    """Pretty-print a parser result without modifying it.

    Values that expose a ``to_dict()`` method (e.g. pandas Series/DataFrame)
    are converted to plain dictionaries for display; any other value, such as
    a scalar aggregate, is printed as-is.

    Args:
        parser_output: Mapping returned by the output parser.

    Returns:
        None. The formatted mapping is written to standard output.
    """
    # Build a separate mapping instead of overwriting the caller's values, so
    # the cell can be re-run and `parser_output` stays usable afterwards.
    printable = {
        key: value.to_dict() if hasattr(value, "to_dict") else value
        for key, value in parser_output.items()
    }
    # A very narrow width forces nested entries onto their own lines.
    pprint.PrettyPrinter(width=4, compact=True).pprint(printable)

In [5]:
# Read the sample passenger data from a CSV located next to the notebook.
df = pd.read_csv("./titanic.csv")
# Preview the first rows to check the columns the parser will be allowed to query.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Create the output parser, handing it the DataFrame it will run queries against.
parser = PandasDataFrameOutputParser(dataframe=df)

# Print the format instructions the parser generates; the same text is inserted
# into the prompt in a later cell.
print(parser.get_format_instructions())

The output should be formatted as a string as the operation, followed by a colon, followed by the column or row to be queried on, followed by optional array parameters.
1. The column names are limited to the possible columns below.
2. Arrays must either be a comma-separated list of numbers formatted as [1,3,5], or it must be in range of numbers formatted as [0..4].
3. Remember that arrays are optional and not necessarily required.
4. If the column is not in the possible columns or the operation is not a valid Pandas DataFrame operation, return why it is invalid as a sentence starting with either "Invalid column" or "Invalid operation".

As an example, for the formats:
1. String "column:num_legs" is a well-formatted instance which gets the column num_legs, where num_legs is a possible column.
2. String "row:1" is a well-formatted instance which gets row 1.
3. String "column:num_legs[1,2]" is a well-formatted instance which gets the column num_legs for rows 1 and 2, where num_legs is a p

In [7]:
# Natural-language request (in Korean) asking to look up the Age column.
df_query = "Age column 을 조회해 주세요."

# The prompt has one runtime input (`query`); the parser's format instructions
# are fixed up front as a partial variable.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={
        "format_instructions": parser.get_format_instructions()             # partial variable
    }
)

# Pipeline: fill the prompt -> call the chat model -> parse the reply with `parser`.
chain = prompt | model | parser

parser_output = chain.invoke({"query": df_query})

# Display the parsed result with the notebook's pretty-printing helper.
format_parser_output(parser_output)

{'Age': {0: 22.0,
         1: 38.0,
         2: 26.0,
         3: 35.0,
         4: 35.0}}


In [8]:
# Ask for a single row through the same chain.
# NOTE(review): the recorded output below is keyed '1' (PassengerId 2), i.e. the
# model answered with index 1 rather than index 0 — name the index explicitly
# (e.g. "row 0") if the zero-based first row is what you want.
df_query = "Retrieve the first row."

parser_output = chain.invoke({"query":df_query})

format_parser_output(parser_output)

{'1': {'Age': 38.0,
       'Cabin': 'C85',
       'Embarked': 'C',
       'Fare': 71.2833,
       'Name': 'Cumings, '
               'Mrs. '
               'John '
               'Bradley '
               '(Florence '
               'Briggs '
               'Thayer)',
       'Parch': 0,
       'PassengerId': 2,
       'Pclass': 1,
       'Sex': 'female',
       'SibSp': 1,
       'Survived': 1,
       'Ticket': 'PC '
                 '17599'}}


In [9]:
# Reference value computed directly with pandas: mean Age of the first five rows,
# for comparison with the chain's answer in the next cell.
df["Age"].head(5).mean()

np.float64(31.2)

In [10]:
# Ask the chain for an aggregate over an explicit row range.
df_query = "Retrieve the average of the Ages from row 0 to 4."

parser_output = chain.invoke({"query":df_query})

# The recorded result holds a scalar under 'mean', so it is shown with plain print.
print(parser_output)

{'mean': np.float64(31.2)}


In [11]:
# Ask the chain for the mean of the Fare column with no row range given.
df_query = "Calculate average `Fare` rate."

parser_output = chain.invoke({"query":df_query})

# The recorded result holds a scalar under 'mean', so it is shown with plain print.
print(parser_output)

{'mean': np.float64(29.521660000000004)}


In [12]:
# Reference value computed directly with pandas, to check the chain's answer above.
df["Fare"].mean()

np.float64(29.521660000000004)