# BigQuery QA

In [None]:
! pip install langchain chromadb sqlalchemy sqlalchemy-bigquery google-cloud-bigquery

In [None]:
import pandas as pd

from sqlalchemy import *
from sqlalchemy.engine import create_engine
from sqlalchemy.schema import *

from langchain.agents import create_sql_agent
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.sql_database import SQLDatabase
from langchain.agents import AgentExecutor
from langchain.llms import VertexAI

from google.cloud import bigquery
from google.cloud import aiplatform

### Set Variables

In [None]:
# BigQuery project / dataset / table identifiers (placeholders to edit).
PROJECT_ID = "your-project-id"
DATASET = "IBM_ATTRITION"
TABLE = "ATTRITION_TABLE"

# Location of the service-account JSON key file (placeholder to edit).
service_account_secret_key = "/path/to/your/secret/key.json"

# Connection URL: bigquery://<project>/<dataset>?credentials_path=<key file>
sqlalchemy_url = "bigquery://{}/{}?credentials_path={}".format(
    PROJECT_ID,
    DATASET,
    service_account_secret_key,
)

### Load Data

In [None]:
# Read the local CSV into a DataFrame; the pandas cells further down use it
# to cross-check the agent's answers.
df = pd.read_csv(filepath_or_buffer="IBM.csv")
df

### LLM: Google PaLM & Vertex AI

In [None]:
# Initialise the Vertex AI SDK for the configured project, then create the
# LangChain LLM wrapper around the text-bison@001 model.
aiplatform.init(project=PROJECT_ID)

llm = VertexAI(model_name="text-bison@001")

In [None]:
# Smoke test: send one trivial question straight to the LLM (no agent, no
# database) and display the reply.
question = "What day comes after Friday?"
llm(question)

In [None]:
# Connect SQLAlchemy, LangChain, and Vertex AI.

# Wrap the BigQuery dataset addressed by `sqlalchemy_url` for LangChain.
db = SQLDatabase.from_uri(sqlalchemy_url)

# The sampling temperature is a setting of the LLM itself. create_sql_agent
# has no `temperature` parameter (extra keyword arguments are forwarded to the
# agent object, not to the model), so it is set here rather than on the agent.
llm = VertexAI(model_name='text-bison@001', temperature=0.0)

# The toolkit supplies the SQL tools the agent may call.
toolkit = SQLDatabaseToolkit(db=db, llm=llm)

# top_k fills the {top_k} placeholder of the agent prefix (maximum number of
# rows the agent is told to request); verbose=True echoes the reasoning trace.
agent_executor = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True,
    top_k=1000,
)

In [None]:
# Display the text of the agent's `prefix` prompt. {dialect} and {top_k} are
# template placeholders and are printed unfilled.
print(
    'You are an agent designed to interact with a SQL database.\n'
    'Given an input question, create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.\n'
    'Unless the user specifies a specific number of examples they wish to obtain, always limit your query to at most {top_k} results.\n'
    'You can order the results by a relevant column to return the most interesting examples in the database.\n'
    'Never query for all the columns from a specific table, only ask for the relevant columns given the question.\n'
    'You have access to tools for interacting with the database.\n'
    'Only use the below tools. Only use the information returned by the below tools to construct your final answer.\n'
    'You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again.\n'
    '\n'
    'DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the database.\n'
    '\n'
    'If the question does not seem related to the database, just return "I don\'t know" as the answer.\n'
)

In [None]:
# Display the text of the agent's format instructions. {tool_names} is a
# template placeholder and is printed unfilled.
print(
    'Use the following format:\n'
    '\n'
    'Question: the input question you must answer\n'
    'Thought: you should always think about what to do\n'
    'Action: the action to take, should be one of [{tool_names}]\n'
    'Action Input: the input to the action\n'
    'Observation: the result of the action\n'
    '... (this Thought/Action/Action Input/Observation can repeat N times)\n'
    'Thought: I now know the final answer\n'
    'Final Answer: the final answer to the original input question'
)

### How many different Departments are there?

In [None]:
# Ask the SQL agent for the number of distinct departments.
agent_executor.run(
    "How many different Departments are there?"
)

In [None]:
# pandas cross-check: number of distinct (non-null) Department values.
df.loc[:, "Department"].nunique()

### What's the average monthly income for the Sales department?

In [None]:
# Ask the SQL agent for the mean monthly income within Sales.
agent_executor.run(
    "What's the average monthly income for the Sales department?"
)

In [None]:
# pandas cross-check: mean MonthlyIncome for every department, rounded to
# two decimals (the Sales row is the one to compare with the agent's answer).
(
    df[["Department", "MonthlyIncome"]]
    .groupby("Department")
    .mean()
    .round(2)
    .reset_index()
)

### Which department has the highest number of attrition and how far on average is the distance from home for those who attrited from this department?

In [None]:
# Two-part question: which department has the most attrition, and the mean
# distance from home of the employees who left it. The word "attrited" is
# spelled correctly here so the model is not handed a garbled term.
agent_executor.run(
    "Which department has the highest number of attrition and how far on average "
    "is the distance from home for those who attrited from this department?"
)

In [None]:
# pandas cross-check: number of non-null DistanceFromHome values for each
# (Department, Attrition) pair.
(
    df[["Attrition", "Department", "DistanceFromHome"]]
    .groupby(["Department", "Attrition"])
    .count()
    .reset_index()
)

In [None]:
# pandas cross-check: mean DistanceFromHome for each (Department, Attrition)
# pair, rounded to two decimals.
(
    df[["Attrition", "Department", "DistanceFromHome"]]
    .groupby(["Department", "Attrition"])
    .mean()
    .round(2)
    .reset_index()
)

### What is the percentage of employees who live more than 5 miles from home and who have attrited from the Sales department?

In [None]:
# Ask the SQL agent for the share of far-from-home leavers in Sales.
agent_executor.run(
    "What is the percentage of employees who live more than 5 miles from home "
    "and who have attrited from the Sales department?"
)

In [None]:
# pandas cross-check: among Sales employees, the fraction whose
# DistanceFromHome is greater than 5 (attrition is not considered here).
in_sales = df["Department"] == "Sales"
lives_far = df["DistanceFromHome"] > 5

# value_counts().sum() totals the non-null DistanceFromHome entries.
far_sales_total = df[in_sales & lives_far]["DistanceFromHome"].value_counts().sum()
sales_total = df[in_sales]["DistanceFromHome"].value_counts().sum()

far_sales_total / sales_total

In [None]:
# pandas cross-check: Attrition value counts for Sales employees whose
# DistanceFromHome is greater than 5.
df.loc[
    (df["Department"] == "Sales") & (df["DistanceFromHome"] > 5),
    "Attrition",
].value_counts()

In [None]:

# pandas cross-check: among Sales employees with Attrition == 'Yes', the
# fraction whose DistanceFromHome is greater than 5.
sales_leavers = (df["Department"] == "Sales") & (df["Attrition"] == "Yes")
far_from_home = df["DistanceFromHome"] > 5

# value_counts().sum() totals the non-null DistanceFromHome entries.
far_leavers_total = df[sales_leavers & far_from_home]["DistanceFromHome"].value_counts().sum()
leavers_total = df[sales_leavers]["DistanceFromHome"].value_counts().sum()

far_leavers_total / leavers_total

### What department has the most Attrition?

In [None]:
# Ask the SQL agent which department has the most attrition.
agent_executor.run(
    "What department has the most Attrition?"
)

In [None]:
# pandas cross-check: rows per (Department, Attrition) pair. count() yields
# one non-null count per column; the Age column's count is kept and shown
# under the name "Count".
(
    df.groupby(["Department", "Attrition"])
    .count()
    .reset_index()
    .rename(columns={"Age": "Count"})
    [["Department", "Attrition", "Count"]]
)

### Prompt Engineering

In [None]:
# Column-by-column data dictionary that is appended to the agent prefix.
# Each entry starts with an explicit "\n" escape and ends with the physical
# line break, so entries are separated by a blank line in the final prompt.
# Spelling fixed ("boolean", "companies") because this text is read by the LLM.
data_dictionary = '''\n
\nAge = The age of each employee in years
\nAttrition = Did the employee quit the company? (A boolean datatype of Yes or No)
\nDepartment = The department the employee most recently works in
\nDistanceFromHome = The distance from home the employee must travel to their work station in miles
\nEducation = The maximum education level attained by the employee (1=High School, 2=Some College, 3=Bachelors Degree, 4=Masters Degree, 5=Doctorate Degree)
\nEducationField = The field of study the employee completed their degree in
\nEnvironmentSatisfaction = How satisfied the employee is with their environment (1=very bad, 2=bad, 3=neutral, 4=good, 5=very good)
\nJobSatisfaction = How satisfied the employee is with their job (1=very bad, 2=bad, 3=neutral, 4=good, 5=very good)
\nMaritalStatus = If the employee is married or single
\nMonthlyIncome = The net monthly income of the employee in US dollars
\nNumCompaniesWorked = The number of companies the employee has worked at previously
\nWorkLifeBalance = How the employee rates their own work life balance (1=very bad, 2=bad, 3=neutral, 4=good, 5=very good)
\nYearsAtCompany = How many years the employee has been employed at this company
'''

In [None]:
# Free-text notes about the table that are appended to the agent prefix.
# The text opens with two newlines; every sentence is then wrapped in a
# leading and a trailing newline, which leaves a blank line between sentences.
data_description = "\n\n" + "".join(
    f"\n{sentence}\n"
    for sentence in (
        "This is a table of employee characteristics. ",
        "Each row is a unique employee who is either currently employed or who was previously employed. ",
        "The Attrition column describes if the employee has attrited (Yes) or is still employed (No).  ",
        "Please use units in the final answer if applicable.",
        "Since the Attrition column is boolean datatype, before trying to aggregate it you'll need to convert it to an integer Int64 datatype.",
    )
)

In [None]:
# Base agent instructions. This is a plain (non-f) string, so {dialect} and
# {top_k} stay as literal placeholders -- presumably filled in later by
# create_sql_agent; confirm against the library.
prefix = (
    'You are an agent designed to interact with a SQL database.\n'
    'Given an input question, create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.\n'
    'Unless the user specifies a specific number of examples they wish to obtain, always limit your query to at most {top_k} results.\n'
    'You can order the results by a relevant column to return the most interesting examples in the database.\n'
    'Never query for all the columns from a specific table, only ask for the relevant columns given the question.\n'
    'You have access to tools for interacting with the database.\n'
    'Only use the below tools. Only use the information returned by the below tools to construct your final answer.\n'
    'You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again.\n'
    '\n'
    'DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the database.\n'
    '\n'
    'If the question does not seem related to the database, just return "I don\'t know" as the answer.\n'
)

# Extend the instructions with the table-specific context built above.
prefix += f'\n\nHere is the data dictionary for this table {data_dictionary}'
prefix += f'\n\nAnd here is a description of the table {data_description}'

In [None]:
# Rebuild the agent with the custom prefix.
#
# create_sql_agent has no `temperature` parameter (extra keyword arguments
# are forwarded to the agent object, not to the model), so the intended
# sampling temperature of 0.5 is set on a dedicated LLM instance instead.
tuned_llm = VertexAI(model_name='text-bison@001', temperature=0.5)

# The toolkit is rebuilt on the same database so its tools use this LLM too.
tuned_toolkit = SQLDatabaseToolkit(db=db, llm=tuned_llm)

agent_executor = create_sql_agent(
    llm=tuned_llm,
    toolkit=tuned_toolkit,
    verbose=True,
    top_k=1000,
    prefix=prefix,
)

In [None]:
# Ask the prefix-augmented agent which education level has the most attrition.
agent_executor.run(
    "What level of education has the highest attrition?"
)