In [None]:
!pip install snowflake-connector-python langchain openai pinecone-client python-dotenv langchain-openai langchain-pinecone

In [None]:
# Import required libraries
from dotenv import load_dotenv
load_dotenv()
import os
import pandas as pd
import snowflake.connector
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

In [3]:
# Open the Snowflake session used by the query cells below.
# Only the password is read from the environment; the remaining settings are fixed.
PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD")

snowflake_settings = {
    "user": "BOA",
    "password": PASSWORD,
    "account": "URB63596",
    "warehouse": "ANIMAL_TASK_WH",
    "database": "TRAVEL_GENIE",
    "schema": "TRANSFORMED_DATA_TRANSFORMED",
}
connection = snowflake.connector.connect(**snowflake_settings)

In [None]:
# Open a cursor on the Snowflake connection and pull every row of RESTAURANTS
cur = connection.cursor()
cur.execute("SELECT * FROM RESTAURANTS")
result = cur.fetchall()

# The first field of each cursor description entry is the column name
restaurant_columns = [description[0] for description in cur.description]
df_restaurants = pd.DataFrame(result, columns=restaurant_columns)

# Keep only the first row seen for each RESTAURANT_ID
df_restaurants_cleaned = df_restaurants.drop_duplicates(subset='RESTAURANT_ID')

# Report the frame dimensions before and after de-duplication
for message in (
    f"Original shape: {df_restaurants.shape}",
    f"Shape after removing duplicates: {df_restaurants_cleaned.shape}",
):
    print(message)

In [None]:
# Pull every row of RESTAURANT_REVIEWS through the cursor opened earlier
cur.execute("SELECT * FROM RESTAURANT_REVIEWS")
result = cur.fetchall()

# The first field of each cursor description entry is the column name
review_columns = [description[0] for description in cur.description]
df_restaurant_reviews = pd.DataFrame(result, columns=review_columns)

# Keep only the first row seen for each REVIEW_ID
df_reviews_cleaned = df_restaurant_reviews.drop_duplicates(subset='REVIEW_ID')

# Report the frame dimensions before and after de-duplication
for message in (
    f"Original reviews shape: {df_restaurant_reviews.shape}",
    f"Shape after removing duplicates: {df_reviews_cleaned.shape}",
):
    print(message)

In [None]:
# Cast the join key to string on both sides for consistent matching.
# .assign() builds new frames instead of writing a column into the frames
# returned by drop_duplicates(); that in-place write triggers pandas'
# SettingWithCopyWarning whenever duplicates were actually removed. Rebinding
# the names also keeps this cell safe to re-run.
df_restaurants_cleaned = df_restaurants_cleaned.assign(
    RESTAURANT_ID=lambda frame: frame['RESTAURANT_ID'].astype(str)
)
df_reviews_cleaned = df_reviews_cleaned.assign(
    RESTAURANT_ID=lambda frame: frame['RESTAURANT_ID'].astype(str)
)

# Inner-join restaurants (left) with reviews (right) on RESTAURANT_ID.
# Columns present in both frames come back suffixed: _x = restaurants, _y = reviews.
df_merged = pd.merge(
    df_restaurants_cleaned,
    df_reviews_cleaned,
    on='RESTAURANT_ID',
    how='inner',
)

# Print the shape of the merged dataset
print(f"Merged dataset shape: {df_merged.shape}")

In [None]:
# Compare CITY from both sides of the merge (_x = restaurants, _y = reviews)
df_merged.loc[:, ["CITY_x", "CITY_y"]].head()

In [None]:
# Compare RESTAURANT_NAME from both sides of the merge (_x = restaurants, _y = reviews)
df_merged.loc[:, ["RESTAURANT_NAME_x", "RESTAURANT_NAME_y"]].head()

In [None]:
# Compare LONGITUDE from both sides of the merge (_x = restaurants, _y = reviews)
df_merged.loc[:, ["LONGITUDE_x", "LONGITUDE_y"]].head()

In [None]:
# Display the CUISINES column of the merged frame
df_merged.loc[:, "CUISINES"]

In [None]:
# Compare STATE from both sides of the merge (_x = restaurants, _y = reviews)
df_merged.loc[:, ["STATE_x", "STATE_y"]].head()

In [None]:
# Compare RATING from both sides of the merge (_x = restaurants, _y = reviews)
df_merged.loc[:, ["RATING_x", "RATING_y"]].head()

In [None]:
# Display the RESTAURANT_TIER column of the merged frame
df_merged.loc[:, 'RESTAURANT_TIER']

In [None]:
# List every column of the merged frame, including the _x/_y suffixed duplicates
df_merged.columns

In [7]:
# Connect to Pinecone with the API key taken from the environment
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

# Name of the index that will hold the restaurant vectors
index_name = "exp1"

# Names of the indexes that already exist for this API key
existing_indexes = [info["name"] for info in pc.list_indexes()]

# Create the index only when it is missing. It stores 3072-dimensional vectors
# compared by cosine similarity, so the embedding model used for upserts must
# produce vectors of exactly that size.
index_missing = index_name not in existing_indexes
if index_missing:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=3072,
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used by later cells to write to the index
index = pc.Index(index_name)

In [None]:
# Re-display the merged frame's columns as a reference for the embedding cell below
df_merged.columns

In [None]:
# Initialize OpenAI embeddings.
# The "exp1" index is created with dimension=3072, which is the output size of
# text-embedding-3-large. text-embedding-ada-002 returns 1536-dimensional
# vectors, which Pinecone rejects on upsert into a 3072-dimensional index.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Create vector store
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

# Rows embedded and upserted per request. Batching replaces one embedding call
# and one upsert call per row; 32 keeps each upsert request small even with
# 3072-dimensional vectors plus review text in the metadata.
UPSERT_BATCH_SIZE = 32

# Pinecone metadata key -> df_merged column it is read from
METADATA_COLUMNS = {
    "RESTAURANT_NAME": "RESTAURANT_NAME_x",
    "CITY": "CITY_y",
    "STATE": "STATE_x",
    "RATING": "RATING_x",
    "PRICE_CATEGORY": "PRICE_CATEGORY",
    "CUISINES": "CUISINES",
    "LATITUDE": "LATITUDE_x",
    "LONGITUDE": "LONGITUDE_x",
    "RESTAURANT_TIER": "RESTAURANT_TIER",
    "REVIEW_DATE": "REVIEW_DATE",
    "REVIEW_TEXT": "REVIEW_TEXT",
    "REVIEW_TITLE": "REVIEW_TITLE",
    "TRIP_TYPE": "TRIP_TYPE",
    "DIETARY_RESTRICTIONS": "DIETARY_RESTRICTIONS",
}


def _to_metadata_value(value):
    """Coerce one cell to a type Pinecone accepts as metadata.

    Pinecone metadata values must be strings, numbers, booleans or lists of
    strings. Returns None when the cell is missing so the caller can drop the key.
    """
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    if value is None or pd.isna(value):
        return None
    if hasattr(value, "isoformat"):
        # date / datetime / pandas Timestamp -> ISO 8601 string
        return value.isoformat()
    if hasattr(value, "item"):
        # numpy scalar -> plain Python scalar
        value = value.item()
    if isinstance(value, (str, bool, int, float)):
        return value
    try:
        # e.g. decimal.Decimal, which is not JSON-serializable as-is
        return float(value)
    except (TypeError, ValueError):
        return str(value)


def _build_metadata(row):
    """Build the metadata dict for one merged row, omitting missing values."""
    metadata = {}
    for key, column in METADATA_COLUMNS.items():
        cleaned = _to_metadata_value(row[column])
        # Null metadata values are not supported by Pinecone; leave the key out
        if cleaned is not None:
            metadata[key] = cleaned
    return metadata


# Walk the merged DataFrame in batches: embed each batch, then upsert it
total_rows = len(df_merged)
for batch_number, start in enumerate(range(0, total_rows, UPSERT_BATCH_SIZE)):
    batch = df_merged.iloc[start:start + UPSERT_BATCH_SIZE]

    ids, texts, metadatas = [], [], []
    for i, row in batch.iterrows():
        # Vector id: restaurant id plus the row's index label in df_merged
        ids.append(f"{row['RESTAURANT_ID']}_{i}")
        # Combine relevant text fields - adjust these based on your columns
        texts.append(f"{row['RESTAURANT_NAME_x']} {row['CUISINES']} {row['REVIEW_TEXT']}")
        metadatas.append(_build_metadata(row))

    # One embedding request for the whole batch
    vectors = embeddings.embed_documents(texts)
    if len(vectors) != len(texts):
        raise RuntimeError(
            f"Expected {len(texts)} embeddings but received {len(vectors)}"
        )

    # One upsert request for the whole batch
    index.upsert(
        vectors=[
            {"id": vector_id, "values": values, "metadata": metadata}
            for vector_id, values, metadata in zip(ids, vectors, metadatas)
        ],
        namespace="restaurants"
    )

    # Print progress every 10 batches and once more at the end
    processed = start + len(batch)
    if batch_number % 10 == 0 or processed == total_rows:
        print(f"Processed {processed} records")