In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
os.environ["LANGCHAIN_API_KEY"]=os.environ.get('LANGCHAIN_API_KEY')
os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_PROJECT"]="Q&A_over_CSV"

In [2]:
import pandas as pd

df = pd.read_csv("titanic.csv")
print(df.shape)
print(df.columns.tolist())

(891, 12)
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [3]:
from langchain_community.utilities import SQLDatabase
from sqlalchemy import create_engine

engine = create_engine("sqlite:///titanic.db")
df.to_sql("titanic", engine, index=False)

ValueError: Table 'titanic' already exists.

In [4]:
db = SQLDatabase(engine=engine)
print(db.dialect)
print(db.get_usable_table_names())
print(db.run("SELECT * FROM titanic WHERE Age < 2;"))

sqlite
['titanic']
[(79, 1, 2, 'Caldwell, Master. Alden Gates', 'male', 0.83, 0, 2, '248738', 29.0, None, 'S'), (165, 0, 3, 'Panula, Master. Eino Viljami', 'male', 1.0, 4, 1, '3101295', 39.6875, None, 'S'), (173, 1, 3, 'Johnson, Miss. Eleanor Ileen', 'female', 1.0, 1, 1, '347742', 11.1333, None, 'S'), (184, 1, 2, 'Becker, Master. Richard F', 'male', 1.0, 2, 1, '230136', 39.0, 'F4', 'S'), (306, 1, 1, 'Allison, Master. Hudson Trevor', 'male', 0.92, 1, 2, '113781', 151.55, 'C22 C26', 'S'), (382, 1, 3, 'Nakid, Miss. Maria ("Mary")', 'female', 1.0, 0, 2, '2653', 15.7417, None, 'C'), (387, 0, 3, 'Goodwin, Master. Sidney Leonard', 'male', 1.0, 5, 2, 'CA 2144', 46.9, None, 'S'), (470, 1, 3, 'Baclini, Miss. Helene Barbara', 'female', 0.75, 2, 1, '2666', 19.2583, None, 'C'), (645, 1, 3, 'Baclini, Miss. Eugenie', 'female', 0.75, 2, 1, '2666', 19.2583, None, 'C'), (756, 1, 2, 'Hamalainen, Master. Viljo', 'male', 0.67, 1, 1, '250649', 14.5, None, 'S'), (789, 1, 3, 'Dean, Master. Bertram Vere', 'mal

In [5]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

# Pandas

Instead of SQL we can also use data analysis libraries like pandas and the code generating abilities of LLMs to interact with CSV data. Again, this approach is not fit for production use cases unless you have extensive safeguards in place. For this reason, our code-execution utilities and constructors live in the langchain-experimental package.



# Chain
Most LLMs have been trained on enough pandas Python code that they can generate it just by being asked to:

In [6]:
ai_msg = llm.invoke(
    "I have a pandas DataFrame 'df' with columns 'Age' and 'Fare'. Write code to compute the correlation between the two columns. Return Markdown for a Python code snippet and nothing else."
)
print(ai_msg.content)

```python
import pandas as pd

# Assuming df is your DataFrame
correlation = df['Age'].corr(df['Fare'])
print(correlation)
```


# Create data analytics Chain
We can combine this ability with a Python-executing tool to create a simple data analysis chain. We'll first want to load our CSV table as a dataframe, and give the tool access to this dataframe:


In [7]:
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_experimental.tools import PythonAstREPLTool

df = pd.read_csv("titanic.csv")
tool = PythonAstREPLTool(locals={"df": df})
tool.invoke("df['Fare'].mean()")

32.204207968574636

# Create Tool

To help enforce proper use of our Python tool, we'll using tool calling:



In [8]:
llm_with_tools = llm.bind_tools([tool], tool_choice=tool.name)
response = llm_with_tools.invoke(
    "I have a dataframe 'df' and want to know the correlation between the 'Age' and 'Fare' columns"
)
response

AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_4LnqqXbXwdGl0fP5qlKMgk5u', 'function': {'arguments': '{"query":"df[[\'Age\', \'Fare\']].corr().iloc[0, 1]"}', 'name': 'python_repl_ast'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 122, 'total_tokens': 143}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_5bd87c427a', 'finish_reason': 'stop', 'logprobs': None}, id='run-46215e35-f6f3-4249-831d-29b502c85cb5-0', tool_calls=[{'name': 'python_repl_ast', 'args': {'query': "df[['Age', 'Fare']].corr().iloc[0, 1]"}, 'id': 'call_4LnqqXbXwdGl0fP5qlKMgk5u', 'type': 'tool_call'}], usage_metadata={'input_tokens': 122, 'output_tokens': 21, 'total_tokens': 143})

In [9]:
response.tool_calls

[{'name': 'python_repl_ast',
  'args': {'query': "df[['Age', 'Fare']].corr().iloc[0, 1]"},
  'id': 'call_4LnqqXbXwdGl0fP5qlKMgk5u',
  'type': 'tool_call'}]

In [10]:
from langchain_core.output_parsers.openai_tools import JsonOutputKeyToolsParser

parser = JsonOutputKeyToolsParser(key_name=tool.name, first_tool_only=True)
(llm_with_tools | parser).invoke(
    "I have a dataframe 'df' and want to know the correlation between the 'Age' and 'Fare' columns"
)

{'query': "df[['Age', 'Fare']].corr().iloc[0, 1]"}

In [11]:
system = f"""You have access to a pandas dataframe `df`. \
Here is the output of `df.head().to_markdown()`:
{df.head().to_markdown()}
Given a user question, write the Python code to answer it. \
Return ONLY the valid Python code and nothing else. \
Don't assume you have access to any libraries other than built-in Python ones and pandas."""

In [12]:

prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{question}")])
code_chain = prompt | llm_with_tools | parser
code_chain.invoke({"question": "What's the correlation between age and fare"})

{'query': "df['Age'].corr(df['Fare'])"}

In [13]:
chain = prompt | llm_with_tools | parser | tool
chain.invoke({"question": "What's the correlation between Age and Fare"})

0.09606669176903887

In [23]:
from operator import itemgetter

from langchain_core.messages import ToolMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough

system = f"""You have access to a pandas dataframe `df`. \
Here is the output of `df.head().to_markdown()`:
{df.head().to_markdown()}
Given a user question, write the Python code to answer it. \
Don't assume you have access to any libraries other than built-in Python ones and pandas.
Respond directly to the question once you have enough information to answer it."""
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            system,
        ),
        ("human", "{question}"),
        # This MessagesPlaceholder allows us to optionally append an arbitrary number of messages
        # at the end of the prompt using the 'chat_history' arg.
        MessagesPlaceholder("chat_history", optional=True),
    ]
)


def _get_chat_history(x: dict) -> list:
    """Parse the chain output up to this point into a list of chat history messages to insert in the prompt."""
    ai_msg = x["ai_msg"]
    tool_call_id = x["ai_msg"].additional_kwargs["tool_calls"][0]["id"]
    tool_msg = ToolMessage(tool_call_id=tool_call_id, content=str(x["tool_output"]))
    return [ai_msg, tool_msg]


chain = (
    RunnablePassthrough.assign(ai_msg=prompt | llm_with_tools)
    .assign(tool_output=itemgetter("ai_msg") | parser | tool)
    .assign(chat_history=_get_chat_history)
    .assign(response=prompt | llm | StrOutputParser())
    .pick(["tool_output", "response"])
)

In [24]:
chain.invoke({"question": "What's the correlation between Age and Fare"})



{'tool_output': 0.09606669176903912,
 'response': 'The correlation between Age and Fare is approximately 0.096. This indicates a weak positive correlation between the two variables.'}

# Printing only the final output

In [25]:
def print_final_generation(agent_input):
    # Extract the final answer from the output dictionary
    agent_output = chain.invoke({"question": agent_input})
    print(agent_output.get('response', ''))


In [26]:
while True:
    user_input = input("User: ")
    if user_input.lower() in ["quit", "exit", "q"]:
        print("Goodbye!")
        break
    print_final_generation(user_input)


Elon Musk is a business magnate, industrial designer, and engineer. He was born on June 28, 1971, in Pretoria, South Africa. Musk is the founder, CEO, and chief engineer of SpaceX; early investor, CEO, and product architect of Tesla, Inc.; and founder of The Boring Company. He was also co-founder of PayPal and Neuralink, and has proposed the Hyperloop, a high-speed vactrain transportation system. Musk is known for his ambitious vision of the future, including the colonization of Mars and the development of sustainable energy solutions.
Goodbye!
