# Package Installer with Streamlit

Streamlit turns data scripts into shareable web apps in minutes.

We can use LLMs to generate data scripts that use Streamlit, but there is one issue with this approach.

Oftentimes, LLMs use packages that are not installed, and these then have to be installed manually. This is where the package installer comes in.

Let's say that we want to convert natural language data queries into Streamlit data scripts.


In [34]:
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import HumanMessagePromptTemplate, SystemMessagePromptTemplate, MessagesPlaceholder
from langchain.chains import LLMChain
from langchain_core.messages import SystemMessage
from dotenv import load_dotenv
import pandas as pd
import io

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by
# ChatOpenAI below comes from — confirm.
load_dotenv()

True

In [270]:
# prompt = ChatPromptTemplate.from_messages(
#     [
#         SystemMessagePromptTemplate.from_template(
#             template_format="jinja2",
#             template="""You are a data analyst. Your role is to provide a code snippet in Python tailored to data analysis query for use in Streamlit and install any imported packages using the provided tools.
#
# The following libraries are pre-imported and installed in the environment:
# ```
# import streamlit as st
# import pandas as pd
# import numpy as np
# ```
#
# The dataset has been already imported:
# ```python
# df = pd.read_csv("path_to_your_csv_file.csv")
# ```
#
# Dataset Information:{{dataset_info}}"""
#         ),
#         HumanMessagePromptTemplate.from_template("{query}"),
#         MessagesPlaceholder(variable_name="agent_scratchpad")
#     ]
# )

In [315]:
# Chat prompt given to the agent. It is made of three parts: the system
# instructions, the user's query, and the agent scratchpad placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        # System message, declared as a jinja2 template; {{dataset_info}} is
        # the only template variable it contains.
        SystemMessagePromptTemplate.from_template(
            template_format="jinja2",
            template="""You are a data analyst. Your role is to provide a code snippet in Python tailored to data analysis query for use in Streamlit and install any imported packages using the provided tools. Your output will only contain Python code snippet without any text explanation.
Dataset Information:{{dataset_info}}
Variable Name for DataFrame: df
Libraries pre-imported include: pandas as pd, streamlit as st"""
        ),
        # The user's natural-language question, supplied under the "query" key.
        HumanMessagePromptTemplate.from_template("{query}"),
        # Slot named "agent_scratchpad" — presumably filled by the agent with
        # its intermediate tool calls and results at run time; confirm against
        # the create_openai_functions_agent documentation.
        MessagesPlaceholder(variable_name="agent_scratchpad")
    ]
)

In [316]:
# prompt = ChatPromptTemplate.from_messages(
#     [
#         SystemMessagePromptTemplate.from_template(
#             template_format="jinja2",
#             template="""The GPT is designed to function as a data analyst, specializing in Python. Its primary role is to provide code snippets that are specifically tailored for use in Streamlit, and install any imported packages using the provided tools.
#
# In the environment, the following libraries are pre-installed and ready to use:
# ```python
# import streamlit as st
# import pandas as pd
# import numpy as np
# ```
#
# The dataset has been already imported, as below:
# ```python
# df = pd.read_csv("path_to_your_csv_file.csv")
# ```
#
# Dataset Information:{{dataset_info}}"""
#         ),
#         HumanMessagePromptTemplate.from_template("{query}"),
#         MessagesPlaceholder(variable_name="agent_scratchpad")
#     ]
# )

In [317]:
import subprocess
import sys
from typing import List, Optional, Type, Union

from langchain_core.callbacks import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.tools import BaseTool, Tool


class PackageInstallInput(BaseModel):
    """Arguments for the PackageInstallTool."""

    # Either a single package name or a list of names. The field is required:
    # its default is Ellipsis, i.e. no default value is provided.
    package_names: Union[str, List[str]] = Field(
        default=...,
        examples=["pandas", ["pandas", "numpy"]],
        description="List of package name(s) to install",
    )


class PackageInstallTool(BaseTool):
    """Tool that installs Python packages in runtime.

    Packages are installed with ``pip`` into the interpreter that is running
    this code (``sys.executable``), so they become importable in the current
    environment.

    NOTE(review): the package names come from the language model and
    ``pip install`` runs arbitrary package code. Option-like arguments are
    rejected in ``_run``, but the names themselves are not otherwise vetted.
    """

    name: str = "package_install"
    args_schema: Type[BaseModel] = PackageInstallInput
    description: str = "Install Python packages dynamically during runtime as they are imported"

    def _run(
            self,
            package_names: Union[str, List[str]],
            run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> bool:
        """Install the given package(s) with pip.

        Args:
            package_names: A single package name or a list of package names.
            run_manager: Optional callback manager; not used here.

        Returns:
            True when pip exits successfully, False on any failure (the
            error is printed rather than raised).
        """
        try:
            if isinstance(package_names, str):
                package_names = [package_names]

            # Anything starting with "-" would be parsed by pip as a
            # command-line option (e.g. "--index-url=..." or "-r file")
            # instead of a requirement. Since the names are model-generated,
            # refuse them rather than letting them alter pip's behavior.
            option_like = [
                str(name) for name in package_names
                if str(name).lstrip().startswith("-")
            ]
            if option_like:
                print(
                    "Error: refusing to pass option-like arguments to pip: "
                    + ", ".join(option_like)
                )
                return False

            # Use the running interpreter's pip so the packages land in the
            # same environment; list form avoids any shell interpretation.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", *package_names]
            )
            print(f"Packages successfully installed: {', '.join(package_names)}.")
            return True
        except Exception as e:
            # Deliberately broad: this is the tool boundary, and a failed
            # install is reported as a False result instead of an exception.
            print("Error: " + str(e))
            return False

    async def _arun(
            self,
            package_names: Union[str, List[str]],
            run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> bool:
        """Async variant of ``_run``; not supported.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(f"{self.name} does not support async")

    def as_tool(self) -> Tool:
        """Wrap this tool's ``_run`` in a ``Tool`` built via ``Tool.from_function``.

        The wrapper reuses this tool's name, description and argument schema.

        NOTE(review): newer langchain-core releases may define their own
        ``as_tool`` on the base classes; confirm this override does not clash
        with the installed version.
        """
        return Tool.from_function(
            func=self._run,
            name=self.name,
            description=self.description,
            args_schema=self.args_schema,
        )

In [318]:
# prompt = ChatPromptTemplate.from_messages(
#     [
#         SystemMessagePromptTemplate.from_template(
#             template_format="jinja2",
#             template="""Your are a data analyst. Your role is to provide code snippets in Python tailored to various data analysis queries using streamlit.io, and install any imported packages using the provided tools.
#
# Dataset Information:{{dataset_info}}"""
#         ),
#         HumanMessagePromptTemplate.from_template("{query}"),
#         MessagesPlaceholder(variable_name="agent_scratchpad")
#     ]
# )

In [319]:
# Chat model used by the agent below.
# NOTE(review): temperature=0 is presumably chosen to keep the generated
# code as repeatable as possible — confirm intent.
llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)

In [320]:
# Dataset that the queries are asked about; its `info()` summary is what gets
# passed to the prompt. The path is relative to the current working directory.
df = pd.read_csv("../california_housing_train.csv")

In [321]:
def dataset_info(frame: "Optional[pd.DataFrame]" = None) -> str:
    """Return the summary printed by ``DataFrame.info`` as a string.

    ``DataFrame.info`` writes to a stream instead of returning a value, so
    its output is captured in an in-memory text buffer.

    Args:
        frame: DataFrame to describe. When omitted (``None``), the
            module-level ``df`` is used, so existing ``dataset_info()``
            calls behave exactly as before.

    Returns:
        The captured summary text.
    """
    # Only look up the global ``df`` when no frame was supplied.
    target = df if frame is None else frame
    with io.StringIO() as buffer:
        target.info(buf=buffer)
        return buffer.getvalue()

In [322]:
# The only tool exposed to the agent: the package installer, wrapped as a
# `Tool` via `as_tool()`.
tools = [PackageInstallTool().as_tool()]

In [323]:
from langchain.agents import AgentExecutor, create_openai_functions_agent
# Build the agent from the chat model, the tool list and the prompt defined above.
agent = create_openai_functions_agent(llm, tools, prompt)

In [324]:
# from langchain_community.tools.convert_to_openai import format_tool_to_openai_function
# llm_with_tools = llm.bind(functions=[format_tool_to_openai_function(t) for t in tools])

In [325]:
# def function(x):
#     print(x)
#     return format_to_openai_function_messages(x["intermediate_steps"])

In [326]:
# from langchain.agents.format_scratchpad import format_to_openai_function_messages
# from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
#
# agent = (
#         {
#             "dataset_info": lambda x: x["dataset_info"],
#             "query": lambda x: x["query"],
#             "agent_scratchpad": lambda x: function(x),
#         }
#         | prompt
#         | llm_with_tools
#         | OpenAIFunctionsAgentOutputParser()
# )

In [327]:
from langchain.agents import AgentExecutor

# Executor that runs the agent with the same tool list it was created with.
# verbose=True prints the run trace; return_intermediate_steps=True adds an
# 'intermediate_steps' entry (the tool calls and their results) to the output.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    return_intermediate_steps=True
)

In [328]:
# Inputs for the prompt: "dataset_info" fills {{dataset_info}} in the system
# message and "query" fills {query} in the human message.
params = {
    "dataset_info": dataset_info(),
    "query": "How can I plot population against long, latitude ?",
}

In [329]:
# Run the agent. The returned dict echoes the inputs and adds 'output' and
# 'intermediate_steps' (see the `result.keys()` cell below).
result = agent_executor.invoke(params)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `package_install` with `{'package_names': ['matplotlib', 'seaborn']}`


Collecting seaborn
  Downloading seaborn-0.13.1-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.1-py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: seaborn
Successfully installed seaborn-0.13.1
Packages successfully installed: matplotlib, seaborn.
[36;1m[1;3mTrue[0m[32;1m[1;3m```python
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style of seaborn
sns.set_style("whitegrid")

# Create a scatter plot of population against longitude and latitude
plt.figure(figsize=(10, 8))
sns.scatterplot(x='longitude', y='latitude', size='population', sizes=(20, 200), data=df, legend=False, alpha=0.5)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Population Distributi

In [238]:
result.keys()

dict_keys(['dataset_info', 'query', 'output', 'intermediate_steps'])

In [239]:
result['intermediate_steps']

[(AgentActionMessageLog(tool='package_install', tool_input={'package_names': ['matplotlib', 'plotly']}, log="\nInvoking: `package_install` with `{'package_names': ['matplotlib', 'plotly']}`\n\n\n", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"package_names":["matplotlib","plotly"]}', 'name': 'package_install'}})]),
  True)]

In [70]:
result

{'dataset_info': "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 17000 entries, 0 to 16999\nData columns (total 9 columns):\n #   Column              Non-Null Count  Dtype  \n---  ------              --------------  -----  \n 0   longitude           17000 non-null  float64\n 1   latitude            17000 non-null  float64\n 2   housing_median_age  17000 non-null  float64\n 3   total_rooms         17000 non-null  float64\n 4   total_bedrooms      17000 non-null  float64\n 5   population          17000 non-null  float64\n 6   households          17000 non-null  float64\n 7   median_income       17000 non-null  float64\n 8   median_house_value  17000 non-null  float64\ndtypes: float64(9)\nmemory usage: 1.2 MB\n",
 'query': 'How can I plot population against long, latitude ?',
 'output': "```python\nimport matplotlib.pyplot as plt\n\nst.map(df[['latitude', 'longitude', 'population']])\n\nplt.figure(figsize=(10,6))\nplt.scatter(df['longitude'], df['latitude'], c=df['population'], cmap='