In [1]:
import pinecone
from google.cloud import bigquery
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from pydantic import BaseModel
from dotenv import load_dotenv
import os

# Populate the process environment from the .env file
load_dotenv()

# Look up the credentials; each value is None when its variable is not set
api_key = os.environ.get("OPENAI_KEY")
pinecone_key = os.environ.get("PINECONE_API_KEY")
pinecone_env = os.environ.get("PINECONE_ENV")

  from tqdm.autonotebook import tqdm


In [2]:
# Embedding client, authenticated with the OpenAI key read from the environment
embeddings = OpenAIEmbeddings(openai_api_key=api_key)

In [3]:
from langchain.document_loaders import BigQueryLoader
# PROJECT = "wagon-bootcamp-377120"
# DATASET = "g_adventures_dataset"
# TABLE = "one_month"

# BASE_QUERY = f"""
#     SELECT *
#     FROM {PROJECT}.{DATASET}.{TABLE}
#     WHERE tour_name = 'Highlights of Jordan'
#     """

# loader = BigQueryLoader(BASE_QUERY)

# docs = loader.load()

In [4]:
# Location of the BigQuery table this notebook reads: project, dataset and table IDs
PROJECT = "wagon-bootcamp-377120"
DATASET = "g_adventures_dataset"
TABLE = "one_month"

In [5]:
class UserTravelDetails(BaseModel):
    """Search criteria describing the trip a user is looking for.

    Every field is required (none declares a default).
    """
    # Destination country, e.g. "Morocco"
    country: str
    # Upper and lower price bounds (currency is not specified by this model)
    max_budget: float
    min_budget: float
    # Departure window as date strings; the notebook's example uses "YYYY-MM-DD"
    departing_after: str
    departing_before: str
    # Upper and lower bounds on trip length — presumably in days; TODO confirm
    max_duration: int
    min_duration: int

In [6]:
# Example usage: a Morocco search for departures between 2024-04-01 and
# 2024-04-30, with a budget of 0-1000 and a duration of 5-10.
user_travel_details = UserTravelDetails(
    country="Morocco",
    max_budget=1000,
    min_budget=0,
    departing_after="2024-04-01",
    departing_before="2024-04-30",
    max_duration=10,
    min_duration=5
)

In [7]:
def list_all_column_names():
    """Return the column names of the configured BigQuery table.

    Fetches the table identified by ``PROJECT``, ``DATASET`` and ``TABLE`` and
    reads the field names from its schema, in schema order.

    Returns:
        list[str]: One name per schema field.
    """
    client = bigquery.Client(project=PROJECT)

    # get_table accepts a fully-qualified "project.dataset.table" ID directly,
    # which avoids the deprecated Client.dataset() helper.
    table = client.get_table(f"{PROJECT}.{DATASET}.{TABLE}")

    return [field.name for field in table.schema]

In [8]:
# Tour names to query; generate_sql_query filters on the first entry only
found_itineraries = ['Morocco Kasbahs & Desert']

In [9]:
def _sql_string_literal(value):
    """Render ``value`` as a single-quoted BigQuery string literal."""
    # Backslash is the escape character inside BigQuery string literals, so it
    # has to be doubled before the quote itself is escaped.
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def _sql_identifier(name):
    """Render ``name`` as a backtick-quoted BigQuery identifier."""
    escaped = str(name).replace("\\", "\\\\").replace("`", "\\`")
    return f"`{escaped}`"


def generate_sql_query(user_travel_details, found_itineraries):
    """Build the BigQuery SQL that summarises the departures of one tour.

    The inner query selects every table column plus a derived ``cost``: the
    smallest strictly positive value among the columns whose name contains
    "Adult" (names that also contain "Promotion Description" are skipped).
    The outer query filters the rows and aggregates them per
    ``tour_name`` / ``itinerary_name``.

    Args:
        user_travel_details: Object exposing ``max_budget``,
            ``departing_after`` and ``departing_before``. A falsy value
            disables the corresponding filter. Its other attributes
            (``country``, ``min_budget``, ``max_duration``, ``min_duration``)
            are not read by this function.
        found_itineraries: Sequence of tour names; only the first one is used.

    Returns:
        str: The complete SQL statement.

    Raises:
        IndexError: If ``found_itineraries`` is empty.
        ValueError: If the table has no matching "Adult" column, since
            ``LEAST()`` without arguments is not valid SQL.
    """
    # Read the tour name first so an empty list fails before any BigQuery call.
    tour_name = found_itineraries[0]

    # Rows without a positive price get this value so LEAST() ignores them.
    no_price_sentinel = 1000000

    all_columns = list_all_column_names()
    price_columns = [
        column
        for column in all_columns
        if "Adult" in column and "Promotion Description" not in column
    ]
    if not price_columns:
        raise ValueError(
            'No "Adult" price column found in the table; cannot compute cost'
        )

    # Quote every identifier so names with unusual characters stay valid SQL.
    select_list = ",\n            ".join(
        _sql_identifier(column) for column in all_columns
    )

    price_cases = []
    for column in price_columns:
        quoted = _sql_identifier(column)
        price_cases.append(
            f"CASE WHEN {quoted} > 0 THEN {quoted} ELSE {no_price_sentinel} END"
        )
    price_cases_str = ",\n                ".join(price_cases)

    # The values below are inlined into the SQL text, so strings are escaped
    # and the budget is forced to a number instead of being pasted verbatim.
    filters = []
    if user_travel_details.max_budget:
        filters.append(f"cost <= {float(user_travel_details.max_budget)}")

    if user_travel_details.departing_after:
        filters.append(
            f"start_date >= {_sql_string_literal(user_travel_details.departing_after)}"
        )

    if user_travel_details.departing_before:
        filters.append(
            f"start_date <= {_sql_string_literal(user_travel_details.departing_before)}"
        )

    filters.append(f"tour_name = {_sql_string_literal(tour_name)}")
    where_clause = "\n        AND ".join(filters)

    # Backticks are needed around the table path because the project ID may
    # contain characters such as hyphens.
    table_id = f"`{PROJECT}.{DATASET}.{TABLE}`"

    return f"""
    SELECT
        MAX(tour_operator) AS tour_operator,
        tour_name,
        itinerary_name,
        MAX(visited_countries) AS visited_countries,
        MAX(currency) AS currency,
        ARRAY_AGG(DISTINCT cost) AS Costs,
        MAX(duration) AS duration,
        ARRAY_AGG(CAST(start_date AS STRING) ORDER BY start_date) AS start_dates,
        MAX(Travel_Style) AS Travel_Style,
        MAX(Service_Level) AS Service_Level,
        MAX(Physical_Grading) AS Physical_Grading,
        MAX(Merchandising) AS Merchandising,
        MAX(Trip_Type) AS Trip_Type,
        MAX(itinerary) AS itinerary,
        MAX(url) AS url
    FROM (
        SELECT
            {select_list},
            LEAST(
                {price_cases_str}
            ) AS cost
        FROM {table_id}
    ) AS subquery
    WHERE {where_clause}
    GROUP BY
        tour_name, itinerary_name;"""

In [10]:
# Build the SQL for the example search and load its results through BigQueryLoader.
# The SQL gets its own name so it is not confused with the natural-language
# question that is stored in `query` further down the notebook.
sql_query = generate_sql_query(user_travel_details, found_itineraries)

loader = BigQueryLoader(sql_query)

docs = loader.load()

In [11]:
# Split the loaded documents into chunks (chunk_size=500, chunk_overlap=50) before embedding
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(docs)

In [12]:
# Configure the Pinecone client with the key and environment read from the environment variables
pinecone.init(api_key=pinecone_key,
              environment=pinecone_env)

In [13]:
# Create the index only when it is missing, so re-running the notebook does not
# fail on an already-existing "chatbot" index. Vectors written by earlier runs
# stay in the index until it is deleted.
# dimension=1536 has to equal the embedding vector length — presumably that of
# the OpenAI embeddings used in this notebook; confirm if the model changes.
if "chatbot" not in pinecone.list_indexes():
    pinecone.create_index("chatbot", dimension=1536)

In [14]:
# Embed the chunks and store them in the "chatbot" index; docsearch is the resulting vector store
docsearch = Pinecone.from_documents(texts, embeddings, index_name="chatbot")

In [15]:
# Chat model that writes the answers (gpt-3.5-turbo, temperature 0.0)
llm = ChatOpenAI(
    openai_api_key=api_key,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

# Retrieval QA chain: uses docsearch as the retriever and the "stuff" chain type
# to hand the retrieved chunks to the LLM
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever()
)

In [16]:
import time
# Pause for 20 seconds before querying — presumably to give Pinecone time to
# make the freshly written vectors searchable; TODO confirm this is still needed
time.sleep(20)

In [17]:
# Natural-language question to ask the QA chain
query = "what are the departure dates?"

In [20]:
# Run the retrieval QA chain on the question and print the model's reply
answer = qa.run(query)
print(answer)

The departure dates for the Morocco Kasbahs & Desert tour with G Adventures are as follows:

- 2024-04-02
- 2024-04-04
- 2024-04-06
- 2024-04-11
- 2024-04-13
- 2024-04-16
- 2024-04-20
- 2024-04-21

Please note that these dates are subject to availability and may change. It is always best to check with G Adventures or visit their website for the most up-to-date information.


In [19]:
# Cleanup: uncomment the next line to delete the "chatbot" index when finished.
# pinecone.delete_index("chatbot")