Create a test SQLite database from a CSV file (adapted from the LangChain CSV tutorial linked below, which uses the Titanic dataset).

https://python.langchain.com/docs/use_cases/sql/csv/

In [40]:
import pandas as pd
from pyprojroot import here

In [41]:
# Resolve the CSV location relative to the project root, then load it.
csv_path = here("data/csv_xlsx/lengths_0_planefit_wide_None.csv")
df = pd.read_csv(csv_path)

# Quick sanity check: dimensions, column names, and the first three rows.
column_names = df.columns.tolist()
print(df.shape)
print(column_names)
display(df.head(3))

(426, 28)
['Unnamed: 0', 'timestamp', 'time', 'sample', 'tooth_number', 'tooth_length_pixel', 'reference_length_pixel', 'length_best_reported', 'length_mean_reported', 'tooth_length_disp', 'tooth_length_depth', 'tooth_length_disp_gt', 'tooth_length_depth_gt', 'z_ratio', 'tilt_angle', 'plane_normal_angle', 'input_gmp_pth', 'bucket_width_m', 'bucket_width_cm_gt', 'bucket_width_pixel_tip_gt', 'bucket_width_pixel_base_gt', 'bucket_width_pixel_tip_edge', 'bucket_width_pixel_base_edge', 'z_value', 'disparity_available', 'plane_angle', 'quantisation_levels', 'pix_to_cm']


Unnamed: 0.1,Unnamed: 0,timestamp,time,sample,tooth_number,tooth_length_pixel,reference_length_pixel,length_best_reported,length_mean_reported,tooth_length_disp,...,bucket_width_cm_gt,bucket_width_pixel_tip_gt,bucket_width_pixel_base_gt,bucket_width_pixel_tip_edge,bucket_width_pixel_base_edge,z_value,disparity_available,plane_angle,quantisation_levels,pix_to_cm
0,0,[1.70193019e+09 1.70193032e+09],2023-12-07 06:25:19.372005+00:00,1,1,124.352666,,78,71.5,,...,0.0,807.4,820.32,807,821,7.125961,True,0.436517,0,
1,1,[1.70193019e+09 1.70193032e+09],2023-12-07 06:25:19.372005+00:00,1,2,122.355162,85.793313,76,69.75,,...,0.0,807.4,820.32,807,821,7.066157,True,0.436517,0,66.288197
2,2,[1.70193019e+09 1.70193032e+09],2023-12-07 06:25:19.372005+00:00,1,3,118.232639,85.755397,70,65.75,,...,0.0,807.4,820.32,807,821,7.010433,True,0.436517,0,64.083064


### **SQL**

Using SQL to interact with CSV data is the recommended approach because it is easier to limit permissions and sanitize queries than with arbitrary Python.

Most SQL databases make it easy to load a CSV file in as a table (DuckDB, SQLite, etc.). Once you’ve done this you can use all of the chain and agent-creating techniques outlined in the SQL use case guide. Here’s a quick example of how we might do this with SQLite:

In [74]:
from langchain_community.utilities import SQLDatabase
from sqlalchemy import create_engine, inspect

# Build the SQLAlchemy URL for a SQLite file inside the project's "data"
# directory. `db_file` is the filesystem path; `db_path` keeps its original
# final value (the full "sqlite:///..." URL).
db_file = str(here("data")) + "/test_sqldb.db"
db_path = f"sqlite:///{db_file}"

engine = create_engine(db_path)

# Write the DataFrame to the database only when the table is missing. This
# makes the cell safe to re-run (no "table already exists" error, no silent
# overwrite) and lets the notebook work against a fresh database file.
# NOTE(review): other cells also reference tables "wm" and "wm_new", which are
# not created anywhere in this notebook - confirm how they are populated.
table_name = "wm_new2"
if not inspect(engine).has_table(table_name):
    df.to_sql(table_name, engine, index=False)

For multiple CSV files, we can create a single SQL database with one table per file:
```
df1.to_sql("csv1_name", engine, index=False)
df2.to_sql("csv2_name", engine, index=False)
```

In [75]:
# Wrap the SQLAlchemy engine in LangChain's SQLDatabase helper.
db = SQLDatabase(engine=engine)

# Report the SQL dialect and the tables the wrapper exposes.
dialect_name = db.dialect
table_names = db.get_usable_table_names()
print(dialect_name)
print(table_names)

# Smoke-test query; as the cell's last expression, its result is displayed.
sample_query = "SELECT * FROM wm WHERE sample < 2;"
db.run(sample_query)

sqlite
['wm', 'wm_new', 'wm_new2']


"[(0, '[1.70193019e+09 1.70193032e+09]', '2023-12-07 06:25:19.372005+00:00', 1, 1, 124.35266623599183, None, 78, 71.5, None, None, 76.33, None, 0.9925191, 99.88246036430064, 0.4365168202351145, '/AIR/Projects/ShovelMetrics/SMG3/WM_labeling/mmpro_gmps/1339/1701930320-88cd7eef454c.gmp', 5.129058872810518, 0.0, 807.4000000000001, 820.32, 807, 821, 7.1259613, 1, 0.4365168202351145, 0, None), (1, '[1.70193019e+09 1.70193032e+09]', '2023-12-07 06:25:19.372005+00:00', 1, 2, 122.3551617219314, 85.79331267645519, 76, 69.75, None, None, 74.03, None, 0.9925191, 99.88246036430064, 0.4365168202351145, '/AIR/Projects/ShovelMetrics/SMG3/WM_labeling/mmpro_gmps/1339/1701930320-88cd7eef454c.gmp', 5.129058872810518, 0.0, 807.4000000000001, 820.32, 807, 821, 7.066157, 1, 0.4365168202351145, 0, 66.28819673705365), (2, '[1.70193019e+09 1.70193032e+09]', '2023-12-07 06:25:19.372005+00:00', 1, 3, 118.23263931757594, 85.7553974977668, 70, 65.75, None, None, 71.19, None, 0.9925191, 99.88246036430064, 0.43651682

**Equivalent in Pandas**

In [44]:
# Pandas counterpart of the SQL filter: keep rows whose "sample" is below 2.
sample_below_two = df["sample"] < 2
df.loc[sample_below_two]

Unnamed: 0.1,Unnamed: 0,timestamp,time,sample,tooth_number,tooth_length_pixel,reference_length_pixel,length_best_reported,length_mean_reported,tooth_length_disp,...,bucket_width_cm_gt,bucket_width_pixel_tip_gt,bucket_width_pixel_base_gt,bucket_width_pixel_tip_edge,bucket_width_pixel_base_edge,z_value,disparity_available,plane_angle,quantisation_levels,pix_to_cm
0,0,[1.70193019e+09 1.70193032e+09],2023-12-07 06:25:19.372005+00:00,1,1,124.352666,,78,71.50,,...,0.000000,807.40,820.32,807,821,7.125961,True,0.436517,0,
1,1,[1.70193019e+09 1.70193032e+09],2023-12-07 06:25:19.372005+00:00,1,2,122.355162,85.793313,76,69.75,,...,0.000000,807.40,820.32,807,821,7.066157,True,0.436517,0,66.288197
2,2,[1.70193019e+09 1.70193032e+09],2023-12-07 06:25:19.372005+00:00,1,3,118.232639,85.755397,70,65.75,,...,0.000000,807.40,820.32,807,821,7.010433,True,0.436517,0,64.083064
3,3,[1.70193019e+09 1.70193032e+09],2023-12-07 06:25:19.372005+00:00,1,4,118.250281,,69,65.75,,...,0.000000,807.40,820.32,807,821,6.980426,True,0.436517,0,
4,4,[1.70193019e+09 1.70193032e+09],2023-12-07 06:25:19.372005+00:00,1,5,121.212574,80.748769,73,70.00,,...,0.000000,807.40,820.32,807,821,6.977421,True,0.436517,0,69.771669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,115,[1.70193345e+09 1.70193349e+09],2023-12-07 07:18:06.664179+00:00,1,2,109.705528,88.663014,67,67.00,,...,5.076952,743.27,789.90,750,787,7.445333,True,0.882899,0,57.511325
116,116,[1.70193345e+09 1.70193349e+09],2023-12-07 07:18:06.664179+00:00,1,3,105.038131,88.489186,63,63.00,,...,5.076952,743.27,789.90,750,787,7.459494,True,0.882899,0,55.172686
117,117,[1.70193345e+09 1.70193349e+09],2023-12-07 07:18:06.664179+00:00,1,4,103.005352,87.221996,64,64.00,,...,5.076952,743.27,789.90,750,787,7.414657,True,0.882899,0,54.890997
118,118,[1.70193345e+09 1.70193349e+09],2023-12-07 07:18:06.664179+00:00,1,5,112.281319,87.687943,70,70.00,,...,5.076952,743.27,789.90,750,787,7.296549,True,0.882899,0,59.516169


### **Create an agent to interact with the Database**

In [45]:
import os
from dotenv import load_dotenv
import warnings
# Silence all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# Load variables from the .env file and report whether any were found.
env_loaded = load_dotenv()
print("Environment variables are loaded:", env_loaded)

# Read one variable back to confirm the environment is populated.
api_type = os.getenv("OPENAI_API_TYPE")
print("test by reading a variable:", api_type)

Environment variables are loaded: True
test by reading a variable: openai


In [None]:
# Alternative LLM backend (disabled): a Hugging Face Inference Endpoint.
# Everything in this cell is commented out, so running it defines nothing;
# uncomment the lines below to use it.
# from langchain_huggingface import HuggingFaceEndpoint

# llm = HuggingFaceEndpoint(
#     repo_id="meta-llama/Meta-Llama-3-8B-instruct",
#     task="text-generation",
#     max_new_tokens=250,
#     temperature=0.1,
# )
# llm.invoke("Hugging Face is")

" a popular open-source library for natural language processing (NLP) tasks. It provides a wide range of pre-trained models and a simple interface for using them. In this tutorial, we'll explore how to use Hugging Face's Transformers library to perform sentiment analysis on a text dataset.\n\n### Installing the required libraries\n\nBefore we start, make sure you have the following libraries installed:\n```\npip install transformers torch\n```\n### Loading the dataset\n\nFor this tutorial, we'll use the IMDB dataset, which is a popular dataset for sentiment analysis. The dataset contains 50,000 movie reviews from IMDB, with each review labeled as either positive or negative.\n\nYou can download the dataset using the following code:\n```python\nimport pandas as pd\n\n# Download the IMDB dataset\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\n# Load the dataset\ntrain_data = fetch_20newsgroups(subset='train')\ntest_data = f

In [118]:
from langchain_community.llms import Ollama

# Configure the Ollama-backed LLM that the SQL agent will use.
ollama_settings = {
    "model": "llama3",
    "temperature": 0,
    "top_p": 0.5,
}
llm = Ollama(**ollama_settings)

In [126]:
from langchain_community.agent_toolkits import create_sql_agent
from langchain.agents import AgentType

# Extra options forwarded to the underlying agent executor.
executor_options = {"handle_parsing_errors": True}

# Build the SQL agent over `db`, capped at 10 iterations, with verbose tracing.
agent_executor = create_sql_agent(
    llm,
    db=db,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    max_iterations=10,
    agent_executor_kwargs=executor_options,
)


In [131]:
# Run the aggregate directly in SQL to get a reference value for the agent's answer.
max_length_query = "SELECT MAX(length_best_reported) FROM wm_new2 WHERE sample < 2;"
db.run(max_length_query)

'[(88,)]'

In [133]:
# Ask the agent the same question in natural language; the returned dict is displayed.
max_length_question = "For data with number of samples less than 2, what's the maximum value of length_best_reported in wm_new?"
agent_executor.invoke(max_length_question)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3mLet's start by listing all the tables in the database.

Action: sql_db_list_tables
Action Input: empty string[0m[38;5;200m[1;3mwm, wm_new, wm_new2[0m[32;1m[1;3mThought: Now that I know the tables in the database, I should focus on the table "wm_new" since it's relevant to the question. Let me check the schema of this table.

Action: sql_db_schema
Action Input: wm_new[0m[33;1m[1;3m
CREATE TABLE wm_new (
	"Unnamed: 0" BIGINT, 
	timestamp TEXT, 
	time TEXT, 
	sample BIGINT, 
	tooth_number BIGINT, 
	tooth_length_pixel FLOAT, 
	reference_length_pixel FLOAT, 
	length_best_reported BIGINT, 
	length_mean_reported FLOAT, 
	tooth_length_disp FLOAT, 
	tooth_length_depth FLOAT, 
	tooth_length_disp_gt FLOAT, 
	tooth_length_depth_gt FLOAT, 
	z_ratio FLOAT, 
	tilt_angle FLOAT, 
	plane_normal_angle FLOAT, 
	input_gmp_pth TEXT, 
	bucket_width_m FLOAT, 
	bucket_width_cm_gt FLOAT, 
	bucket_width_pixel_tip_gt FLOAT, 
	bucket_width_

{'input': "For data with number of samples less than 2, what's the maximum value of length_best_reported in wm_new?",
 'output': 'The maximum value of length_best_reported in wm_new for data with number of samples less than 2 is 88.'}

In [110]:
# Ask the agent for the minimum without naming a specific table; the returned dict is displayed.
min_length_question = "What's the minimum value of length_best_reported in the wm database?"
agent_executor.invoke(min_length_question)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3mLet's start by listing the tables in the database.

Action: sql_db_list_tables
Action Input: empty string[0m[38;5;200m[1;3mwm, wm_new, wm_new2[0m[32;1m[1;3mThought: Now that I know the tables in the database, I should query the schema of the relevant tables to see what columns they have. Since I'm looking for the minimum value of length_best_reported, I'll focus on the tables that might have this column.

Action: sql_db_schema
Action Input: wm, wm_new, wm_new2[0m[33;1m[1;3m
CREATE TABLE wm (
	"Unnamed: 0" BIGINT, 
	timestamp TEXT, 
	time TEXT, 
	sample BIGINT, 
	tooth_number BIGINT, 
	tooth_length_pixel FLOAT, 
	reference_length_pixel FLOAT, 
	length_best_reported BIGINT, 
	length_mean_reported FLOAT, 
	tooth_length_disp FLOAT, 
	tooth_length_depth FLOAT, 
	tooth_length_disp_gt FLOAT, 
	tooth_length_depth_gt FLOAT, 
	z_ratio FLOAT, 
	tilt_angle FLOAT, 
	plane_normal_angle FLOAT, 
	input_gmp_pth TEXT, 
	bucket_wi

{'input': "What's the minimum value of length_best_reported in the wm database?",
 'output': 'Agent stopped due to iteration limit or time limit.'}