<div class="alert alert-block alert-success">
    
    
### <center> Yelp Businesses/Reviews</center>
### <center> ELASTICSEARCH - OPEN AI</center>



    
<br>
    <br>
    
    
</div>

In [None]:
#install elastic search 8.12.0 package 

!conda install -y conda-forge::elasticsearch

# install packages

!python3 -m pip install -qU openai==1.7.2 pandas yelpapi pyarrow

from elasticsearch import Elasticsearch, helpers
from yelpapi import YelpAPI
from pprint import pprint

import pandas as pd
import openai
import json



In [None]:
# Zip codes for the Chicago downtown area and neighborhoods.
# A complete list of Chicago zip codes by neighborhood is available at:
# (https://www.seechicagorealestate.com/chicago-zip-codes-by-neighborhood.php)

import os

import requests

# Read the Yelp key from the YELP_API_KEY environment variable when it is set so
# the real key is never saved in the notebook; otherwise fall back to the placeholder.
yelp_api = YelpAPI(os.environ.get('YELP_API_KEY', 'YOUR-YELP-API-KEY-HERE'))

chicago_downtown_zipcodes = [60601, 60602, 60603, 60604, 60605, 60606, 60607, 60608, 60610, 60611, 60612, 60616]

page_size = 15       # businesses requested per search call
pages_per_zip = 2    # number of result pages requested for every zip code

business_records = []                   # raw business dicts, one per unique business
list__business_reviews_documents = []   # the same businesses wrapped as index documents
seen_business_ids = set()

for zip_code in chicago_downtown_zipcodes:
    for page in range(pages_per_zip):
        response = yelp_api.search_query(categories='Restaurants+Entertainment+Nightlife',
                                         location=zip_code,
                                         sort_by='rating', limit=page_size, offset=page * page_size)

        for business in response.get('businesses', []):
            # The same business may be returned by more than one zip code/page query;
            # keep the first occurrence only so it is not fetched and embedded twice.
            if business['id'] in seen_business_ids:
                continue
            seen_business_ids.add(business['id'])

            business_records.append(business)
            list__business_reviews_documents.append({
                "_index": "chicago_yelp_reviews",
                # NOTE(review): mapping types no longer exist in Elasticsearch 8; in the
                # visible cells this list is only printed and iterated, never bulk-indexed.
                "_type": "yelp_review",
                "_id": business['id'],
                "_source": business
            })

# Build the DataFrame once instead of re-concatenating it on every request
df_business = pd.DataFrame(business_records)


In [None]:
# Show the total and a small sample only; printing every document floods the cell output
print(f"{len(list__business_reviews_documents)} business documents collected")
pprint(list__business_reviews_documents[:3])

In [None]:
# Replace missing values (NaN) with a real None so they are sent to Elasticsearch
# as JSON null. Filling with the string "None" would put text into numeric columns,
# which typed mappings such as "double" cannot parse.
# The cast to object lets numeric columns hold None instead of converting it back to NaN.
df_business = df_business.astype(object).where(df_business.notna(), None)

In [None]:
# Preview the last rows of the businesses DataFrame
df_business.tail()

In [None]:
# Fetching reviews of the businesses

import os

# Read the Yelp key from the YELP_API_KEY environment variable when it is set so
# the real key is never saved in the notebook; otherwise fall back to the placeholder.
yelp_api_key = os.environ.get('YELP_API_KEY', 'YOUR-YELP-API-KEY-HERE')

# Set Yelp API headers
headers = {
    "Authorization": f"Bearer {yelp_api_key}"
}

df_reviews = pd.DataFrame()
reviews = []
fetched_business_ids = set()

for businesses in list__business_reviews_documents:

    # Get the business ID
    business_id = businesses['_id']
    business_url = businesses['_source']['url']

    # A business can be listed more than once; request its reviews only once
    if business_id in fetched_business_ids:
        continue
    fetched_business_ids.add(business_id)

    print('\n')
    print('\033[1m Reviews for ' + businesses['_source']['name'] + ": \033[0m")
    print('\n')

    # Build the URL for Yelp Business Reviews API
    url = "https://api.yelp.com/v3/businesses/" + business_id + "/reviews"
    req = requests.get(url, headers=headers, timeout=30)

    # An unsuccessful response (bad key, rate limit, unknown business, ...) is not
    # expected to contain a 'reviews' list; report it and continue with the next
    # business instead of aborting the whole loop with a KeyError.
    if not req.ok:
        print(f"Skipping {business_id}: HTTP {req.status_code}")
        continue
    parsed = req.json()

    # Sorting with newest reviews
    sort_reviews = sorted(parsed.get('reviews', []), key=lambda x: x['time_created'], reverse=True)

    # Tag every review with the business it belongs to
    for review in sort_reviews:
        review['business_id'] = business_id
        review['biz_url'] = business_url

    pprint(sort_reviews)
    # Append the reviews to the list
    reviews.extend(sort_reviews)



In [None]:
# Convert the list of reviews to a DataFrame
# (each entry of `reviews` becomes one row of df_reviews)
df_reviews = pd.DataFrame(reviews)

In [None]:
# Preview the last rows of the reviews DataFrame
df_reviews.tail()

<div class="alert alert-info">
    
### Important 

Create an OpenAI API key at https://platform.openai.com/api-keys.
Please note: you will need to create an OpenAI account first.
     
</br>

    
</div>

<hr style="border:5px solid orange"> </hr>

</br> 

In [None]:
# Create OpenAI embeddings for the business names and categories

import os
import time

from openai import OpenAI
from tqdm import tqdm

# Initialize the OpenAI client. The key is taken from the OPENAI_API_KEY environment
# variable when it is set, so it does not have to be saved in the notebook;
# otherwise the placeholder below is used.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR-OPEN-AI-KEY-HERE"))

# Collects the embedding vectors generated for the businesses
business_name_categories_embeddings = []

# Function to obtain embeddings from OpenAI API
def embed(texts, model='text-embedding-ada-002'):
    """Return one embedding vector per input text.

    Args:
        texts: The texts to embed; they are sent in a single API request.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A list with the embedding of every item in the API response, in the
        order the response lists them. An empty input returns an empty list
        without calling the API.
    """
    if not texts:
        # Nothing to embed, so skip the request entirely
        return []

    # Make a request to OpenAI API to get embeddings
    response = client.embeddings.create(
        input=texts,
        model=model
    )
    # Extract embeddings from the API response
    return [result.embedding for result in response.data]

# Maximum number of texts sent to the embeddings endpoint in one request
batch_size = 2000

# Build the text to embed for every business: name + category titles + category aliases
business_texts = []
for i in range(0, len(df_business)):
    business = df_business.iloc[i]

    # Newlines become spaces so the words around a line break are not glued together
    name = str(business['name']).replace("\n", " ")

    # 'categories' is read as a list of dicts with 'title' and 'alias' keys; a row
    # without such a list (e.g. a filled-in missing value) contributes no category text
    categories = business['categories']
    if not isinstance(categories, (list, tuple)):
        categories = []

    # Join category titles and aliases into single strings
    categories_title = ' '.join(category['title'] for category in categories)
    categories_alias = ' '.join(category['alias'] for category in categories)

    # Merge 'name', 'category titles', and 'category aliases'
    business_texts.append(f"{name} {categories_title} {categories_alias}")

# Embed batch by batch; start from an empty list so re-running the loop cannot
# append a second copy of the vectors
business_name_categories_embeddings = []
for batch_start in tqdm(range(0, len(business_texts), batch_size)):
    if batch_start > 0:
        # Pause between consecutive requests only, never after the final batch
        print("Waiting for 1 minute before the next batch...")
        time.sleep(60)

    print("Embedding batch...")
    embeddings_batch = embed(business_texts[batch_start:batch_start + batch_size])
    business_name_categories_embeddings.extend(embeddings_batch)

# The vectors are attached to df_business by position, so the counts must match
assert len(business_name_categories_embeddings) == len(df_business), "expected one embedding per business"


In [None]:
# Store the generated embeddings in the chicago_yelp_businesses_vector column
# (the list is assigned by position, one entry per row of df_business)

df_business["chicago_yelp_businesses_vector"] = business_name_categories_embeddings


In [None]:
# Preview the last rows, now including the chicago_yelp_businesses_vector column
df_business.tail()

In [None]:
# Create OpenAI embeddings for the review texts

# Collects the embedding vectors generated for the reviews
business_reviews_embeddings = []

# Function to obtain embeddings from OpenAI API
def embed(texts, model='text-embedding-ada-002'):
    """Return one embedding vector per input text.

    Args:
        texts: The texts to embed; they are sent in a single API request.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A list with the embedding of every item in the API response, in the
        order the response lists them. An empty input returns an empty list
        without calling the API.
    """
    if not texts:
        # Nothing to embed, so skip the request entirely
        return []

    # Make a request to OpenAI API to get embeddings
    response = client.embeddings.create(
        input=texts,
        model=model
    )
    # Extract embeddings from the API response
    return [result.embedding for result in response.data]

# Maximum number of review texts sent to the embeddings endpoint in one request
batch_size = 300

# Collect the text of every review, in DataFrame row order
review_texts = []
for i in range(0, len(df_reviews)):
    # Newlines become spaces so the words around a line break are not glued together
    review = str(df_reviews.iloc[i]['text']).replace("\n", " ")
    review_texts.append(review)

# Embed batch by batch; start from an empty list so re-running the loop cannot
# append a second copy of the vectors
business_reviews_embeddings = []
for batch_start in tqdm(range(0, len(review_texts), batch_size)):
    if batch_start > 0:
        # Pause between consecutive requests only, never after the final batch
        print("Waiting for 1 minute before the next batch...")
        time.sleep(60)

    print("Embedding batch...")
    embeddings_batch = embed(review_texts[batch_start:batch_start + batch_size])  # the review 'text' is what gets embedded
    business_reviews_embeddings.extend(embeddings_batch)

# The vectors are attached to df_reviews by position, so the counts must match
assert len(business_reviews_embeddings) == len(df_reviews), "expected one embedding per review"





In [None]:
# Store the generated review embeddings in the content_vector column
# (the list is assigned by position, one entry per row of df_reviews)

df_reviews["content_vector"] = business_reviews_embeddings


In [None]:
# Preview the last rows, now including the content_vector column
df_reviews.tail()

In [None]:
# Set up the Elasticsearch client

from elasticsearch import Elasticsearch

# Node(s) the client connects to
es_hosts = ['http://localhost:9200']
es = Elasticsearch(es_hosts)

# Connection test: the value returned by ping() is displayed as the cell output
es.ping()

<div class="alert alert-info">
    
### Important Note

The `id` attribute declared in the businesses index mapping (`chicago_yelp_bussinesses_reviewed`) is called `business_id` in the reviews index mapping (`chicago_yelp_reviews_per_business`).

     
</br>

    
</div>

<hr style="border:5px solid orange"> </hr>

</br> 

In [None]:
# Index mapping for chicago_yelp_bussinesses_reviewed

index_mapping = {
    "properties": {
        # Embedding of the business name and categories; "dims" has to equal the
        # length of the vectors stored in df_business["chicago_yelp_businesses_vector"]
        "chicago_yelp_businesses_vector": {
            "type": "dense_vector",
            "dims": 1536,
            "index": True,
            "similarity": "cosine"
        },
        # NOTE(review): the bulk-indexing cell stores this value under "business_id",
        # which is not declared here and is therefore mapped dynamically.
        "id": {"type": "text"},
        "alias": {"type": "text"},
        "name": {"type": "text"},
        "image_url": {"type": "text"},
        "closed": {"type": "boolean"},
        "url": {"type": "keyword"},
        "review_count": {"type": "long"},
        "categories": {
            "type": "nested",
            "properties": {
                "alias": {"type": "keyword"}
            }
        },
        # Ratings may be fractional (e.g. 4.5); an integer type would truncate them when indexed
        "rating": {"type": "float"},
        # The documents carry separate numeric "latitude"/"longitude" values. A geo_point
        # field expects a complete point (e.g. lat/lon pair), not a single number, so the
        # two components are declared as plain numbers under their correctly spelled names.
        "coordinates": {
            "properties": {
                "latitude": {"type": "double"},
                "longitude": {"type": "double"}
            }
        },
        "transactions": {"type": "keyword"},
        "price": {"type": "keyword"},
        "location": {
            "properties": {
                "address1": {"type": "text"},
                "address2": {"type": "text"},
                "address3": {"type": "text"},
                "city": {"type": "text"},
                "zip_code": {"type": "keyword"},
                "country": {"type": "keyword"},
                "state": {"type": "keyword"},
                "display_address": {"type": "text"}
            }
        },
        "phone": {"type": "keyword"},
        "display_phone": {"type": "keyword"},
        "distance": {"type": "double"},
    }
}

# Drop any index left over from a previous run so the mapping below always applies
if es.indices.exists(index="chicago_yelp_bussinesses_reviewed"):
    es.indices.delete(index="chicago_yelp_bussinesses_reviewed")

es.indices.create(index="chicago_yelp_bussinesses_reviewed", body={"mappings": index_mapping})

In [None]:
# Bulk indexing for chicago_yelp_bussinesses_reviewed

def dataframe_to_bulk_actions(df_business):
    """Yield one bulk-index action per row of ``df_business``.

    Each action targets the 'chicago_yelp_bussinesses_reviewed' index, uses the
    row's 'id' as the document ``_id`` and copies the business columns, plus the
    'chicago_yelp_businesses_vector' embedding, into ``_source``.
    """
    # (document field, DataFrame column) pairs, in the order they appear in _source
    source_columns = (
        ("business_id", "id"),
        ("alias", "alias"),
        ("name", "name"),
        ("image_url", "image_url"),
        ("closed", "is_closed"),
        ("url", "url"),
        ("review_count", "review_count"),
        ("categories", "categories"),
        ("rating", "rating"),
        ("coordinates", "coordinates"),
        ("transactions", "transactions"),
        ("location", "location"),
        ("phone", "phone"),
        ("display_phone", "display_phone"),
        ("distance", "distance"),
        ("price", "price"),
        ("chicago_yelp_businesses_vector", "chicago_yelp_businesses_vector"),
    )

    for _, business in df_business.iterrows():
        document = {field: business[column] for field, column in source_columns}
        yield {
            "_index": 'chicago_yelp_bussinesses_reviewed',
            "_id": business['id'],
            "_source": document,
        }


start = 0
end = len(df_business)
batch_size = 500

# Running totals over all batches (also defined when the DataFrame is empty)
success = 0
failed = []

for batch_start in range(start, end, batch_size):
    batch_end = min(batch_start + batch_size, end)
    # Build and send the actions for the rows of the current batch only
    batch_dataframe = df_business.iloc[batch_start:batch_end]
    actions = list(dataframe_to_bulk_actions(batch_dataframe))

    batch_success, batch_failed = helpers.bulk(es, actions)
    success += batch_success
    failed.extend(batch_failed)

print(f"Inserted {success} records into Elasticsearch. Failed records: {failed}")
    

In [None]:
# Mapping definition and (re)creation of the chicago_yelp_reviews_per_business index

# Mapping of the 'user' sub-object of a review
review_user_mapping = {
    "properties": {
        "id": {"type": "keyword"},
        "profile_url": {"type": "keyword"},
        "image_url": {"type": "text"},
        "name": {"type": "text"},
    }
}

# Fields of a single entry of the nested 'reviews' list, including its content_vector
review_properties = {
    "content_vector": {
        "type": "dense_vector",
        "dims": 1536,
        "index": "true",
        "similarity": "cosine",
    },
    "id": {"type": "text"},
    "url": {"type": "keyword"},
    "text": {"type": "text"},
    "rating": {"type": "integer"},
    "time_created": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
    "user": review_user_mapping,
}

index_mapping_reviews = {
    "properties": {
        "business_id": {"type": "keyword"},
        "biz_url": {"type": "keyword"},
        "reviews": {"type": "nested", "properties": review_properties},
    }
}

reviews_index = "chicago_yelp_reviews_per_business"

# Remove an index that already exists, then create it with the mapping above
if es.indices.exists(index=reviews_index):
    es.indices.delete(index=reviews_index)

es.indices.create(index=reviews_index, body={"mappings": index_mapping_reviews})

In [None]:
# Bulk indexing for chicago_yelp_reviews_per_business

def dataframe_to_bulk_actions(df_reviews):
    """Yield one bulk-index action per row of ``df_reviews``.

    Each action targets the 'chicago_yelp_reviews_per_business' index and uses
    the review's 'id' as the document ``_id``. The ``_source`` holds the
    'business_id', the 'biz_url' and a one-element 'reviews' list with the
    review fields and its 'content_vector'.
    """
    # Columns copied unchanged into the single entry of the 'reviews' list
    review_columns = ('id', 'url', 'text', 'rating', 'time_created', 'user', 'content_vector')

    for _, review_row in df_reviews.iterrows():
        review_entry = {column: review_row[column] for column in review_columns}
        yield {
            "_index": 'chicago_yelp_reviews_per_business',
            "_id": review_row['id'],
            "_source": {
                "business_id": review_row['business_id'],
                "biz_url": review_row['biz_url'],
                "reviews": [review_entry],
            },
        }


start = 0
end = len(df_reviews)
batch_size = 100

# Running totals over all batches (also defined when the DataFrame is empty)
success = 0
failed = []

for batch_start in range(start, end, batch_size):
    batch_end = min(batch_start + batch_size, end)
    # Build and send the actions for the rows of the current batch only
    batch_dataframe = df_reviews.iloc[batch_start:batch_end]
    actions = list(dataframe_to_bulk_actions(batch_dataframe))

    batch_success, batch_failed = helpers.bulk(es, actions)
    success += batch_success
    failed.extend(batch_failed)

print(f"Inserted {success} records into Elasticsearch. Failed records: {failed}")


## Sanity Test:

Execute the following commands from a command window/terminal:

**To verify the index got created:**
- curl -XGET "http://localhost:9200/_cat/indices?v"

**To check the structure of index:**
- curl -X GET "localhost:9200/chicago_yelp_bussinesses_reviewed?pretty"
- curl -X GET "localhost:9200/chicago_yelp_reviews_per_business?pretty"

**To check the stats of the index:**

- curl -X GET "localhost:9200/chicago_yelp_bussinesses_reviewed/_stats?pretty"
- curl -X GET "localhost:9200/chicago_yelp_reviews_per_business/_stats?pretty"


## How to delete the index?
**To delete the indices you created:**
- curl -XDELETE "http://localhost:9200/chicago_yelp_bussinesses_reviewed?pretty"
- curl -XDELETE "http://localhost:9200/chicago_yelp_reviews_per_business?pretty"


