## This notebook converts natural-language questions into SQL queries that can be run against the Yelp dataset using a SQL agent

### Install Dependencies

In [2]:
%pip --quiet install langchain tqdm psycopg2 google-cloud-aiplatform==1.38.0 jq faiss-gpu transformers

Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  Building wheel for jq (pyproject.toml) did not run successfully.
  exit code: 1
  
  [5 lines of output]
  running bdist_wheel
  running build
  running build_ext
  Executing: ./configure CFLAGS=-fPIC --prefix=C:\Users\sawant_pra\AppData\Local\Temp\pip-install-92sxaxhm\jq_275f238697924df283a3bb7d61d0237b\_deps\build\onig-install-6.9.8
  error: [WinError 2] The system cannot find the file specified
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for jq
  error: subprocess-exited-with-error
  
  Building wheel for hnswlib (pyproject.toml) did not run successfully.
  exit code: 1
  
  [5 lines of output]
  running bdist_wheel
  running build
  running build_ext
  building 'hnswlib' extension
  error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/
  [end of

### Import libraries

In [5]:
import os
from typing import List
from urllib.parse import quote_plus

from tqdm import tqdm
from langchain.sql_database import SQLDatabase
from langchain.agents import create_sql_agent
from langchain.chat_models import ChatVertexAI
from langchain.document_loaders import JSONLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.embeddings import VertexAIEmbeddings
from langchain.prompts.chat import SystemMessagePromptTemplate
from langchain.prompts.chat import HumanMessagePromptTemplate
from langchain.prompts.chat import ChatPromptTemplate
from langchain.agents.agent_toolkits import SQLDatabaseToolkit

# NOTE(review): a relative import only resolves inside a package; in a notebook
# kernel this probably needs to be `from config import ...` - confirm.
from .config import DATABASE_NAME, USERNAME, PASSWORD

ModuleNotFoundError: No module named 'langchain'

### Connect to Database

In [4]:
HOST = "localhost"
PORT = "5432"

# Percent-encode the credentials before embedding them in the URI: a raw
# password containing characters such as '@', ':' or '/' would otherwise be
# parsed as part of the host/port section and break the connection string.
pg_uri = (
    f"postgresql+psycopg2://{quote_plus(USERNAME)}:{quote_plus(PASSWORD)}"
    f"@{HOST}:{PORT}/{DATABASE_NAME}"
)

# SQL database handle used later by the agent toolkit.
db = SQLDatabase.from_uri(pg_uri)

NameError: name 'USERNAME' is not defined

### Load the LLM

In [None]:
# Vertex AI project, region and model used for the chat LLM.
PROJECT = 'llm-study-413709'
LOCATION = 'us-central1'
MODEL_NAME = 'codechat-bison'

# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account key file
# before the chat model client is created.
SERVICE_ACCOUNT_KEY_PATH = 'llm-study-413709-40a30207144b.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = SERVICE_ACCOUNT_KEY_PATH

# temperature=0.0 is passed so the model is asked for the least random output.
llm = ChatVertexAI(
    project=PROJECT,
    location=LOCATION,
    model_name=MODEL_NAME,
    temperature=0.0,
    max_output_tokens=2048,
)

### Create custom embeddings class

In [None]:
class MyVertexAIEmbeddings(VertexAIEmbeddings, Embeddings):
    """Vertex AI embeddings that send texts to the client in small batches.

    ``embed_documents`` is overridden so that callers using the standard
    LangChain embeddings interface (for example ``FAISS.from_documents``)
    go through the batched ``embed_segments`` implementation instead of the
    inherited one.
    """
    model_name = 'textembedding-gecko'
    # Number of texts sent to the embedding client per request.
    max_batch_size = 5

    def embed_segments(self, segments: List) -> List:
        """Embed ``segments`` in batches of ``max_batch_size``.

        Args:
            segments: Texts to embed.

        Returns:
            One embedding vector (the ``values`` of each client result) per
            input text, in input order. An empty input yields an empty list.
        """
        embeddings = []
        for i in tqdm(range(0, len(segments), self.max_batch_size)):
            batch = segments[i: i+self.max_batch_size]
            embeddings.extend(self.client.get_embeddings(batch))
        return [embedding.values for embedding in embeddings]

    def embed_documents(self, texts: List) -> List:
        """Embed documents via the batched ``embed_segments`` path.

        Without this override the custom batching above is never reached by
        code that calls the interface method ``embed_documents``.

        Args:
            texts: Document texts to embed.

        Returns:
            One embedding vector per text, in input order.
        """
        return self.embed_segments(texts)

    def embed_query(self, query: str) -> List:
        """Embed a single query string and return its vector."""
        embeddings = self.client.get_embeddings([query])
        return embeddings[0].values

embeddings = MyVertexAIEmbeddings()

### Prepare schema documents

In [None]:
# Load the table-level schema descriptions, one document per JSON line.
tables_document = JSONLoader(
    file_path='./schemas/tables.jsonl',
    jq_schema='.',
    text_content=False,
    json_lines=True,
).load()

# Load the column-level schema descriptions the same way.
columns_document = JSONLoader(
    file_path='./schemas/columns.jsonl',
    jq_schema='.',
    text_content=False,
    json_lines=True,
).load()

### Helper method for retrieving matched tables from vector store

In [None]:
# Holds (documents, embeddings, index) for the most recently built table index
# so repeated queries do not re-embed every table document.
_tables_index_cache = None


def get_matched_tables(query: str, k: int = 5) -> List:
    """Return the table schema documents most relevant to ``query``.

    The FAISS index over ``tables_document`` is built on first use and reused
    afterwards. It is rebuilt only when ``tables_document`` or ``embeddings``
    has been rebound to a different object (e.g. the loading cell was re-run).

    Args:
        query: Natural-language question to match against table descriptions.
        k: Number of documents to return (defaults to the original value, 5).

    Returns:
        The documents returned by the MMR retriever for ``query``.
    """
    global _tables_index_cache

    cached = _tables_index_cache
    if cached is None or cached[0] is not tables_document or cached[1] is not embeddings:
        # Named table_index (not `db`) to avoid confusion with the SQL database handle.
        table_index = FAISS.from_documents(documents=tables_document, embedding=embeddings)
        _tables_index_cache = (tables_document, embeddings, table_index)
    else:
        table_index = cached[2]

    retriever = table_index.as_retriever(search_type='mmr', search_kwargs={'k': k, 'lambda_mult': 1})
    return retriever.get_relevant_documents(query=query)

### Helper method for retrieving matched columns from vector store

In [None]:
# Holds (documents, embeddings, index) for the most recently built column index
# so repeated queries do not re-embed every column document.
_columns_index_cache = None


def get_matched_columns(query: str, k: int = 20) -> List:
    """Return the column schema documents most similar to ``query``.

    The FAISS index over ``columns_document`` is built on first use and reused
    afterwards. It is rebuilt only when ``columns_document`` or ``embeddings``
    has been rebound to a different object (e.g. the loading cell was re-run).

    Args:
        query: Natural-language question to match against column descriptions.
        k: Number of documents to return (defaults to the original value, 20).

    Returns:
        The documents returned by the similarity retriever for ``query``.
    """
    global _columns_index_cache

    cached = _columns_index_cache
    if cached is None or cached[0] is not columns_document or cached[1] is not embeddings:
        # Named column_index (not `db`) to avoid confusion with the SQL database handle.
        column_index = FAISS.from_documents(documents=columns_document, embedding=embeddings)
        _columns_index_cache = (columns_document, embeddings, column_index)
    else:
        column_index = cached[2]

    search_kwargs = {
        'k': k
    }
    retriever = column_index.as_retriever(search_type='similarity', search_kwargs=search_kwargs)
    return retriever.get_relevant_documents(query=query)

### Construct the prompt

In [None]:
# System message: fixes the assistant's role for the whole conversation.
messages = []
template = "You are a SQL master expert capable of writing complex SQL query in Postgres."
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
messages.append(system_message_prompt)

# Human message: carries the user question and the retrieved schema snippets.
human_template = """Given the following inputs:
USER_QUERY:
--
{query}
--
MATCHED_SCHEMA: 
--
{matched_schema}
--
Please construct a SQL query using the MATCHED_SCHEMA and the USER_QUERY provided above. 

IMPORTANT: Use ONLY the column names (column_name) mentioned in MATCHED_SCHEMA. DO NOT USE any other column names outside of this. 
IMPORTANT: Associate column_name mentioned in MATCHED_SCHEMA only to the table_name specified under MATCHED_SCHEMA.
NOTE: Use SQL 'AS' statement to assign a new name temporarily to a table column or even a table wherever needed. 
"""
human_message = HumanMessagePromptTemplate.from_template(human_template)
messages.append(human_message)

chat_prompt = ChatPromptTemplate.from_messages(messages)

user_question = 'Which user from Boston checked in into a SPA and then went to an Italian restaurant?'
matched_columns = get_matched_columns(user_question)

# Put only the documents' text into the prompt, one schema entry per line.
# Formatting the Document list directly would embed its Python repr
# (including loader metadata) in the text sent to the model.
matched_schema = "\n".join(document.page_content for document in matched_columns)

request = chat_prompt.format_prompt(query=user_question,
    matched_schema=matched_schema).to_messages()

### Build the agent

In [None]:
# Toolkit exposing the SQL database to the agent.
toolkit = SQLDatabaseToolkit(db=db, llm=llm)

# `request` is a list of already-formatted chat messages, not a prompt
# template, so it cannot be supplied as the agent's `prompt`. Flatten it into
# the text to hand to the agent at run time instead, e.g.
# agent_executor.invoke({'input': agent_input}).
agent_input = "\n\n".join(message.content for message in request)

# Executor options such as return_intermediate_steps are forwarded through
# agent_executor_kwargs rather than passed as loose keyword arguments.
agent_executor = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    top_k=10,
    agent_executor_kwargs={'return_intermediate_steps': True},
    verbose=True,
)