In [None]:
# Import the Elasticsearch client, the OpenAI client, and supporting libraries

import json
import os
from pprint import pprint

import openai
import pandas as pd
from elasticsearch import Elasticsearch
from openai import OpenAI

# Connect to the Elasticsearch node at http://localhost:9200
es = Elasticsearch(['http://localhost:9200'])

# Initialize OpenAI client with API key.
# The key is read from the OPENAI_API_KEY environment variable so the secret
# does not have to be pasted into (and saved with) the notebook. The original
# placeholder is kept as the fallback, so the cell behaves exactly as before
# when the variable is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR-OPEN-AI-KEY-HERE"))


# Define model and embedding function


EMBEDDING_MODEL = "text-embedding-ada-002"


def get_embedding(text, model=EMBEDDING_MODEL):
    """Return the embedding of ``text`` produced by the OpenAI embeddings endpoint.

    Parameters
    ----------
    text : str
        Text to embed. Newline characters are replaced with spaces before the
        request is sent.
    model : str, optional
        Name of the embedding model. Defaults to ``EMBEDDING_MODEL`` so the
        constant is the single place where the default model is chosen.

    Returns
    -------
    The ``embedding`` attribute of the first item in the response's ``data``
    (presumably a list of floats - confirm against the OpenAI client docs).
    """
    single_line_text = text.replace("\n", " ")
    # The input is sent as a one-element list, so the first item of the
    # response data is the embedding for this text.
    embedding_response = client.embeddings.create(input=[single_line_text], model=model)
    return embedding_response.data[0].embedding



In [None]:
# Pretty print function for businesses reviewed

def pretty_response_business(response):
    """Print a labelled summary of every hit in a business search response.

    For each entry of ``response['hits']['hits']`` the hit's ``_id`` and
    ``_score`` are printed, followed by these ``_source`` fields: name,
    closed, review_count, categories, rating, coordinates, transactions,
    location and phone. Each hit is printed as one block that starts with a
    blank line. Nothing is returned.
    """
    # (printed label, key inside the hit's '_source') in output order
    source_fields = (
        ("Title", "name"),
        ("Closed", "closed"),
        ("Review Count", "review_count"),
        ("Categories", "categories"),
        ("Rating", "rating"),
        ("Coordinates", "coordinates"),
        ("Transactions", "transactions"),
        ("Location", "location"),
        ("Phone", "phone"),
    )

    for doc in response['hits']['hits']:
        lines = [f"ID: {doc['_id']}", f"Score: {doc['_score']}"]
        source = doc['_source']
        lines.extend(f"{label}: {source[key]}" for label, key in source_fields)
        # Leading newline separates consecutive hits in the output
        print("\n" + "\n".join(lines))


# Pretty print function for reviews per business

def pretty_response_reviews(response):
    """Print the reviews found in a search response, grouped per business.

    Every entry of ``response['hits']['hits']`` must carry ``business_id``,
    ``biz_url`` and ``reviews`` in its ``_source``. ``reviews`` may be a list
    of review dictionaries, a single review dictionary, or empty. Reviews are
    grouped by the (business_id, biz_url) pair; a business with no reviews
    still gets its header printed. Each review dictionary must provide the
    keys ``id``, ``url``, ``text``, ``rating``, ``time_created`` and ``user``.
    Nothing is returned.
    """
    # (business_id, biz_url) -> list of review dictionaries
    business_reviews = {}

    for hit in response['hits']['hits']:
        source = hit['_source']
        business_id = source['business_id']
        biz_url = source['biz_url']
        reviews = source['reviews']

        # Register the business even when it has no reviews, so its header is printed
        collected = business_reviews.setdefault((business_id, biz_url), [])

        if reviews:
            if isinstance(reviews, list):
                collected.extend(reviews)
            else:
                # Anything that is not a list is treated as one review dictionary
                collected.append(reviews)

    # Print the results, one block per business
    for (business_id, biz_url), reviews in business_reviews.items():
        parts = [f"\nBusiness ID: {business_id}\nBiz URL: {biz_url}"]

        for review_info in reviews:
            # Every review line carries the same two-space indent
            # (the rating line previously had none).
            parts.append(
                f"\n\n  Review ID: {review_info['id']}\n  Review URL: {review_info['url']}\n"
                f"  Review Text: {review_info['text']}\n  Review Rating: {review_info['rating']}\n"
                f"  Review Time Created: {review_info['time_created']}\n"
                f"  User Details: {review_info['user']}\n"
            )

        print("".join(parts))


In [None]:
# Semantic (kNN) search against the chicago_yelp_bussinesses_reviewed index

# Free-text question to search for
question = 'Bob’s Pizza'

# Embed the question with the model named by EMBEDDING_MODEL
question_embedding = get_embedding(question, model=EMBEDDING_MODEL)

# kNN search on the business vector field: k=10 neighbours, 100 candidates
response = es.search(
    index="chicago_yelp_bussinesses_reviewed",
    knn=dict(
        field="chicago_yelp_businesses_vector",
        query_vector=question_embedding,
        k=10,
        num_candidates=100,
    ),
)

pretty_response_business(response)

In [None]:
# Semantic (kNN) search against the chicago_yelp_reviews_per_business index

# Free-text question to search for
question = 'Pizza'

# Embed the question with the model named by EMBEDDING_MODEL
question_embedding = get_embedding(question, model=EMBEDDING_MODEL)

# kNN search on the review content vector field: k=10 neighbours, 100 candidates
response = es.search(
    index="chicago_yelp_reviews_per_business",
    knn=dict(
        field="reviews.content_vector",
        query_vector=question_embedding,
        k=10,
        num_candidates=100,
    ),
)

pretty_response_reviews(response)


In [None]:
# Composite search - semantic (kNN) search combined with scalar search

# Question used to find candidate businesses
business_question = 'pizza'
business_question_embedding = get_embedding(business_question, model=EMBEDDING_MODEL)

# kNN search on the chicago_yelp_bussinesses_reviewed index
business_response = es.search(
    index="chicago_yelp_bussinesses_reviewed",
    knn=dict(
        field="chicago_yelp_businesses_vector",
        query_vector=business_question_embedding,
        k=10,
        num_candidates=100,
    ),
)


# Map business ID -> business name for every hit, then list the IDs
business_info = dict(
    (found['_source']['business_id'], found['_source']['name'])
    for found in business_response['hits']['hits']
)
business_ids_to_search = list(business_info)


# Define review question
review_question = 'chicago style deep dish pizza'

question_embedding = get_embedding(review_question, model=EMBEDDING_MODEL)

# Search the chicago_yelp_reviews_per_business index, combining a nested range
# query (a review with rating > 4 created on/after 2023-01-01) with a kNN search
# on the review content vectors.
# `query` and `knn` are passed as keyword arguments, like the other es.search
# calls in this notebook, instead of being wrapped in `body`.
# NOTE(review): per the Elasticsearch kNN docs, a top-level `query` and `knn`
# are combined as a hybrid search (a document may match either one). If the
# rating/date limits must also restrict the kNN hits, they presumably belong in
# the knn `filter` option - confirm the intended behaviour.
reviews_response = es.search(
    index="chicago_yelp_reviews_per_business",
    query={
        "bool": {
            "must": [
                {
                    "nested": {
                        "path": "reviews",
                        "query": {
                            "bool": {
                                "must": [
                                    {"range": {"reviews.rating": {"gt": 4}}},
                                    {"range": {"reviews.time_created": {"gte": "2023-01-01 00:00:00"}}}
                                ]
                            }
                        }
                    }
                }
            ]
        }
    },
    knn={
        "field": "reviews.content_vector",
        "query_vector": question_embedding,
        "k": 5,
        "num_candidates": 100
    },
)

# Extract business IDs from the reviews response
review_business_ids = [hit['_source']['business_id'] for hit in reviews_response['hits']['hits']]


# Find common business IDs between business_ids_to_search and review_business_ids
common_business_ids = set(business_ids_to_search) & set(review_business_ids)


# Print one review, along with the business name, for every review hit whose
# business was also returned by the business search
for hit in reviews_response['hits']['hits']:
    current_business_id = hit['_source']['business_id']
    if current_business_id not in common_business_ids:
        continue

    # 'reviews' may be a list of review dictionaries, a single review
    # dictionary, or empty (pretty_response_reviews handles the same cases).
    # Normalise to a list and skip hits with nothing to print instead of
    # failing on reviews[0].
    hit_reviews = hit['_source']['reviews']
    if not hit_reviews:
        continue
    if not isinstance(hit_reviews, list):
        hit_reviews = [hit_reviews]

    # NOTE(review): this is the first review stored on the document, which is
    # not necessarily the review that matched the query - confirm this is intended.
    first_review = hit_reviews[0]

    business_name = business_info.get(current_business_id, "Business Name Not Available")
    print(f"\nBusiness ID: {current_business_id}")
    print(f"Business Name: {business_name}")
    print(f"Review Text: {first_review['text']}")
    print(f"Rating: {first_review['rating']}")
    print(f"Review Time Created: {first_review['time_created']}")
    print("\n")

## Sanity Test:

Execute the following commands from a command window/terminal:

**To verify that the indices were created:**
- curl -XGET "http://localhost:9200/_cat/indices?v"

**To check the structure of each index:**
- curl -X GET "localhost:9200/chicago_yelp_bussinesses_reviewed?pretty"
- curl -X GET "localhost:9200/chicago_yelp_reviews_per_business?pretty"

**To check the stats of each index:**

- curl -X GET "localhost:9200/chicago_yelp_bussinesses_reviewed/_stats?pretty"
- curl -X GET "localhost:9200/chicago_yelp_reviews_per_business/_stats?pretty"


## How to delete the indices?
**To delete the indices you created:**
- curl -XDELETE "http://localhost:9200/chicago_yelp_bussinesses_reviewed?pretty"
- curl -XDELETE "http://localhost:9200/chicago_yelp_reviews_per_business?pretty"


