# Introduction

This notebook tests a LangChain SQL agent, used as a tool, together with a quantized Llama 2 model (from Kaggle). The objective is to see whether such a quantized model can replace OpenAI for exploring SQL databases.

# Install and import packages

In [None]:
!pip install -q -U langchain
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes

In [None]:
!pip install torch==2.1

In [None]:
import torch
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.agents import create_sql_agent 
from langchain.agents.agent_toolkits import SQLDatabaseToolkit 
from langchain.sql_database import SQLDatabase 
from langchain.agents import AgentExecutor 
from langchain.agents.agent_types import AgentType
from time import time

In [None]:
# Show the torch version actually loaded (an earlier cell installs torch==2.1).
torch.__version__

# Setup the database connection

In [None]:
# Hand-written table descriptions that are passed to SQLDatabase.from_uri as
# `custom_table_info`. Each value is a CREATE TABLE statement followed by a
# commented block holding three sample rows (tab-separated).
# NOTE(review): the "artists" key wraps a statement that creates `artists2`,
# and the sanity-check queries in this notebook read from `artists2` — confirm
# which table name the database really uses, since key and DDL disagree.
# NOTE(review): the `xxx_id [BR]` prefixes in the artists column list do not
# look like valid SQL — confirm whether they are intentional.
custom_table_info = {
    "artists": """CREATE TABLE artists2 (
        artist_id integer NOT NULL,
        nme_id [BR] name character varying(200),
        nat_id [BR] nationality character varying(50),
        gen_id [BR] gender character varying(25),
        dt1_id [BR] birth_year integer,
        dt2_id [BR] death_year integer,
        CONSTRAINT artists_pk PRIMARY KEY (artist_id))

/*
3 rows from artists table:
"artist_id"	"name"	"nationality"	"gender"	"birth_year"	"death_year"
12	"Jüri Arrak"	"Estonian"	"Male"	1936	
19	"Richard Artschwager"	"American"	"Male"	1923	2013
22	"Isidora Aschheim"	"Israeli"	"Female"		
*/""",
    "artworks": """CREATE TABLE artworks (
        artwork_id integer NOT NULL,
        title character varying(500),
        artist_id integer NOT NULL,
        name character varying(500),
        date integer,
        medium character varying(250),
        dimensions text,
        acquisition_date text,
        credit text,
        catalogue character varying(250),
        department character varying(250),
        classification character varying(250),
        object_number text,
        diameter_cm text,
        circumference_cm text,
        height_cm text,
        length_cm text,
        width_cm text,
        depth_cm text,
        weight_kg text,
        durations integer,
        CONSTRAINT artworks_pk PRIMARY KEY (artwork_id))

/*
3 rows from artworks table:
"artwork_id"	"title"	"artist_id"	"name"	"date"	"medium"	"dimensions"	"acquisition_date"	"credit"	"catalogue"	"department"	"classification"	"object_number"	"diameter_cm"	"circumference_cm"	"height_cm"	"length_cm"	"width_cm"	"depth_cm"	"weight_kg"	"durations"
102312	"Watching the Game"	2422	"John Gutmann"	1934	"Gelatin silver print"	"9 3/4 x 6 7/16' (24.8 x 16.4 cm)"	"2006-05-11"	"Purchase"	"N"	"Photography"	"Photograph"	"397.2006"			"24.8"		"16.4"			
103321	"Untitled (page from Sump)"	25520	"Jerome Neuner"	1994	"Page with chromogenic color print and text"	"12 x 9 1/2' (30.5 x 24.1 cm)"	"2006-05-11"	"E.T. Harmax Foundation Fund"	"N"	"Photography"	"Photograph"	"415.2006.12"			"30.4801"		"24.13"			
10	"The Manhattan Transcripts Project, New York, New York, Episode 1: The Park"	7056	"Bernard Tschumi"		"Gelatin silver photograph"	"14 x 18' (35.6 x 45.7 cm)"	"1995-01-17"	"Purchase and partial gift of the architect in honor of Lily Auchincloss"	"Y"	"Architecture & Design"	"Architecture"	"3.1995.11"			"35.6"		"45.7"			
*/""",
}

In [None]:
# Open the SQLite file through LangChain's SQLDatabase wrapper, supplying the
# hand-written table descriptions defined in the previous cell.
sqlite_path = "/kaggle/input/moma-db/moma2c.db"
sqlite_uri = "sqlite:///" + sqlite_path
db = SQLDatabase.from_uri(
    sqlite_uri,
    custom_table_info=custom_table_info,
)

In [None]:
# Sanity check of the connection: count the rows of the artists2 table.
db.run("select count(*) from artists2")

# Initialize the model

In [None]:
import os

# Read the Hugging Face access token from the environment instead of
# hard-coding it: a token written into a notebook is exposed to every reader
# and has to be revoked. Set HF_TOKEN (for example through the platform's
# secret manager) before running this cell; the value is None when the
# variable is unset.
access_token = os.environ.get("HF_TOKEN")

In [None]:
# Load the Llama 2 chat tokenizer and model, and time how long it takes.
model_id = 'meta-llama/Llama-2-7b-chat-hf'
time_1 = time()
# `token=` is the current keyword for the access token; `use_auth_token=` is
# deprecated in recent transformers releases.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
# NOTE: despite its name, `model_name` holds the loaded model object; the name
# is kept because the pipeline cell reads it.
# NOTE(review): no quantization config is passed here although the
# introduction speaks of a quantized model — confirm this is intended.
model_name = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    token=access_token,
)
# This cell loads the tokenizer and the model; the pipeline is built and timed
# separately, so the label says "model".
print(f"Tokenizer & model: {round(time() - time_1)} sec.")

# Test the model

Let's wrap the model into a HuggingFace pipeline.

In [None]:
# Build the text-generation pipeline around the already-loaded model and
# tokenizer, and report how long the construction took.
time_1 = time()
query_pipeline = pipeline(
    "text-generation",
    # What to run.
    model=model_name,
    tokenizer=tokenizer,
    # Where / in which precision.
    # NOTE(review): the model was loaded with torch.bfloat16 while float16 is
    # requested here — confirm which dtype is intended.
    torch_dtype=torch.float16,
    device_map="auto",
    # Sampling settings.
    do_sample=True,
    temperature=0.001,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
    # Length limits and stop token.
    min_length=10,
    max_new_tokens=2048,
    eos_token_id=tokenizer.eos_token_id,
)
time_2 = time()
print(f"Prepare pipeline: {round(time_2 - time_1, 3)} sec.")

In [None]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so the
# toolkit, agent and chain created later in this notebook can use it as `llm`.
llm_hfp = HuggingFacePipeline(pipeline=query_pipeline)

In [None]:
# Smoke test: send a single prompt through the wrapped model and print only
# the first 20 items of the reply; the full reply stays in `output`.
output = llm_hfp("Name one famous rock band.")
print(output[:20])

In [None]:
# Display the complete reply produced by the smoke test.
output

# Prepare the SQLAgent 

In [None]:
from langchain.prompts.prompt import PromptTemplate

# Prompt for the SQL chain. It takes three variables:
#   {dialect}    - SQL dialect name inserted into the first instruction,
#   {table_info} - the table descriptions the model may use,
#   {input}      - the user's question.
# The "Use the following format" section spells out the
# Question / SQLQuery / SQLResult / Answer layout. The label must read
# exactly "SQLResult:" (it was misspelled "SSQLResult:"), and the "Answer:"
# slot is included so the model is shown where the final answer goes.
_DEFAULT_TEMPLATE = '''Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.\n Unless the user specifies in the question a specific number of examples to obtain, query for at most 5 results using the LIMIT clause as per SQLite. You can order the results to return the most informative data in the database.\nNever query for all columns from a table. You must query only the columns that are needed to answer the question. Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.\nPay attention to use date('now') function to get the current date, if the question involves \"today\".
Use the following format:

Question: Question here
SQLQuery: SQL Query to run
SQLResult: Result of the SQLQuery
Answer: Final answer here

Only use the following tables:
{table_info}

If someone asks for the art table, they really mean the artworks table.

Only single quotation marks, not double quotation marks in the SQL statement (SQLQuery). Never use " in SQL statement (SQLQuery).

Question: {input}'''
PROMPT = PromptTemplate(
    input_variables=["input", "table_info", "dialect"], template=_DEFAULT_TEMPLATE
)

In [None]:
# Bundle the database connection and the LLM into the toolkit that the SQL
# agent is built from.
toolkit = SQLDatabaseToolkit(db=db, llm=llm_hfp)

In [None]:
# Create the SQL agent on top of the local LLM and the toolkit. It is
# configured as a zero-shot ReAct agent, with verbose output and with
# handle_parsing_errors switched on.
agent_executor = create_sql_agent(
    llm=llm_hfp,
    toolkit=toolkit,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    handle_parsing_errors=True,
)

In [None]:
# Same row count on artists2 as in the connection check, run directly against
# the database (no LLM involved).
db.run("SELECT COUNT(*) FROM artists2")

# Test the agent

In [None]:
# Two phrasings of the same question: one with the plain word "artists" and
# one with the literal table name "artists2".
QUESTION_01, QUESTION_01a = (
    f"How many {subject} are?" for subject in ("artists", "artists2")
)


In [None]:
# Question intended for the SQL agent; the agent call itself is left
# commented out, so this cell only assigns the string.
question = "How many artists are?"
#agent_executor.run(question)

In [None]:
# Second agent question, also not executed.
# NOTE(review): no "reviews" table appears in custom_table_info — confirm
# whether this question is meant to probe a table that does not exist.
question = "How many reviews are?"
#agent_executor.run(question)

In [None]:
!pip install langchain_experimental

In [None]:
from langchain_experimental.sql import SQLDatabaseChain

In [None]:
# Build a SQLDatabaseChain over the same LLM and database, using the custom
# prompt, with the query checker enabled and intermediate steps not returned.
db_chain = SQLDatabaseChain.from_llm(
    llm_hfp,
    db,
    prompt=PROMPT,
    verbose=True,
    use_query_checker=True,
    return_intermediate_steps=False,
)

In [None]:
# Display the chain object to inspect how it was configured.
db_chain

In [None]:
# Ask the plain-wording question ("artists") by calling the chain directly.
db_chain(QUESTION_01)

In [None]:
# Ask the variant that names the artists2 table, again by calling the chain.
db_chain(QUESTION_01a)

In [None]:
# Ask the plain-wording question once more, this time through the chain's
# run() method instead of calling the chain object.
db_chain.run(QUESTION_01)