In [4]:
import pandas as pd
import requests
import csv

# load data

In [5]:
df=pd.read_csv('../data/clean_metadata_district_of_columbia.csv')

In [6]:
df=df.dropna(subset=['name'])

In [7]:
df['relative_results'] = df['relative_results'].fillna('unknown')

In [8]:
df.drop(columns =['latitude','longitude'],inplace=True)

In [9]:
df.head()

Unnamed: 0,name,address,gmap_id,description,category,avg_rating,num_of_reviews,price,hours,MISC,relative_results,url,geometry
0,Cascade Café,"Cascade Café, 599 Constitution Ave. NW, Washin...",0x89b7b7851b06ef6b:0x5f356b1eb1da27,Cafeteria-style dining at the National Gallery...,['American restaurant'],2.6,28,unknown,"[['Thursday', '11AM–3PM'], ['Friday', '11AM–3P...","{'Service options': ['Takeout', 'Dine-in', 'De...","['0x89b7b79ad5a69a43:0xce2fab5ae44aaf7f', '0x8...",https://www.google.com/maps/place//data=!4m2!3...,POINT (-77.0199082 38.8920767)
1,Joseph's Barbershop,"Joseph's Barbershop, 2624B Georgia Ave NW, Was...",0x89b7b797548dfcfd:0xe3a4b60261c60313,unknown,['Barber shop'],4.3,8,unknown,"[['Thursday', '10AM–7PM'], ['Friday', '10AM–7P...",unknown,"['0x89b7c81aec442c2f:0x2df3cce722072454', '0x8...",https://www.google.com/maps/place//data=!4m2!3...,POINT (-77.0228857 38.9249134)
2,Valero,"Valero, 1301 Bladensburg Rd NE, Washington, DC...",0x89b7b86fa9c15391:0x895562701e8dee87,unknown,['Gas station'],3.7,27,unknown,"[['Wednesday', 'Open 24 hours'], ['Thursday', ...",unknown,"['0x89b7b86edb3e8003:0xc1dde6954521707f', '0x8...",https://www.google.com/maps/place//data=!4m2!3...,POINT (-76.9783473 38.9074996)
3,Reason,"Reason, 1747 Connecticut Ave NW, Washington, D...",0x89b7b7cf68179fbb:0xf4199083bb564611,unknown,['Publisher'],4.5,8,unknown,unknown,unknown,"['0x89b7b7b9bc91a76f:0xdd1f96ed82811da1', '0x8...",https://www.google.com/maps/place//data=!4m2!3...,POINT (-77.0455958 38.913843)
4,American Israel Public Affairs Committee,"American Israel Public Affairs Committee, 251 ...",0x89b7b78bbdd7a14d:0x206c693f16b596d6,unknown,"['Non-profit organization', 'Public services',...",3.0,16,unknown,unknown,{'Accessibility': ['Wheelchair accessible entr...,"['0x89b7b78dd7b61e37:0xc375c4848062ecea', '0x8...",https://www.google.com/maps/place//data=!4m2!3...,POINT (-77.01465759999999 38.9004676)


In [14]:
df.to_csv("clean_metadata_washingtondc_businesses.csv")

In [10]:
documents=df.to_dict(orient='records')

## Index data

In [11]:
from sentence_transformers import SentenceTransformer
model_name='multi-qa-MiniLM-L6-cos-v1'
model=SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [13]:
pip install elasticsearch

Note: you may need to restart the kernel to use updated packages.


In [14]:
from elasticsearch import Elasticsearch

es_client=Elasticsearch('http://localhost:9200')

index_settings={
        "settings":{
        "number_of_shards":1,
        "number_of_replicas":0
        },
        "mappings":{
            "properties":{
            "gmap_id" : {"type":"keyword"},
            "name" : {"type":"text"},
            "address" : {"type":"text"},
            "description" : {"type":"text"},
            "category" : {"type":"keyword"},
            "avg_rating" : {"type":"float"},
            "num_of_reviews" : {"type":"integer"},
            "price": {"type":"text"},
            "hours": {"type":"text"},
            "MISC":{"type":"keyword"},
            "relative_results":{"type":"keyword"},            
            "url":{"type":"keyword"},
            "geometry":{"type":"geo_point"},
            "text_vector":{
                "type":"dense_vector",
                "dims":384,
                "index":True,
                "similarity":"cosine"
                },
            }
        }
    }

index_name="businesses"
es_client.indices.delete(index=index_name,ignore_unavailable=True)
es_client.indices.create(index=index_name,body=index_settings)             
            



ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'businesses'})

In [15]:
from tqdm.auto import tqdm
for doc in tqdm(documents):
    try:
        description=doc['description']
        MISC=doc['MISC']    
        doc['text_vector']=model.encode(description+''+'MISC')

        # Convert geometry to the proper format
        lon, lat = map(float, doc['geometry'].replace('POINT (', '').replace(')', '').split())
        doc['geometry'] = {
            "lat": lat,
            "lon": lon
        }

        es_client.index(index=index_name,document=doc)
    except Exception as e:
        print(f"Error indexing document {doc}: {e}")

  0%|          | 0/11059 [00:00<?, ?it/s]

## Query processing

In [47]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [51]:
!pip uninstall numpy -y

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4


In [52]:
!pip install numpy==1.26.4 spacy

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.4


## Retrieval

In [32]:
import spacy
nlp=spacy.load("en_core_web_sm")

def process_query(query):
    doc=nlp(query)

    location=None
    category=None
    attributes=[]
    min_rating=None

    for ent in doc.ents:
        if ent.label_ == "GPE":
            location=ent.text
        if ent.label_ == "ORG":
            category=ent.text
    for token in doc:
        if token.pos_ == "ADJ":
            attributes.append(token.text)
        elif token.text == "rating" and token.head.pos_ == "NUM":
            min_rating=float(token.head.text)
    return {
        "location" : location,
        "category" : category,
        "attributes" : attributes,
        "min_rating" : min_rating
    }

In [66]:
query='which restaurants offer good ambience , fresh air,vegan italian options , wheelchair accessbility and a 4.5 ratings in GPE(-76.9783473 38.9074996) ?'
processed_query=process_query(query)
print(processed_query)

{'location': None, 'category': None, 'attributes': ['good', 'fresh', 'italian'], 'min_rating': None}


In [64]:
def search(processed_query):
    # Elasticsearch query
    es_query = {
        "query": {
            "bool": {
                "should": [
                       {"match": {"description": " ".join(processed_query["attributes"])}}
                ],
                "filter": {
                    "geo_distance": {
                        "distance": "5km",
                        "location": processed_query["location"]
                    }
                }
            }
        }
    }
    es_results = es_client.search(index=index_name, body=es_query)

    query_embedding = encoder.encode(" ".join(processed_query["attributes"]))
    vector_results = index.query(query_embedding.tolist(), top_k=10)
    combined_results = combine_results(es_results, vector_results)
    return combined_results

 
def combine_results(es_results, vector_results):
    # Implement logic to combine and rank results
    # This is a placeholder implementation
    return es_results['hits']['hits']

In [65]:
search_results = search(processed_query)
print(search_results)

BadRequestError: BadRequestError(400, 'x_content_parse_exception', 'unsupported symbol [o] in geohash [connecticut]')

In [21]:
def elastic_search_knn(field,vector):
    knn={
    "field":field,
    "query_vector":vector,
    "k":5,
    "num_candidates" : 11000,
    
   
}

    search_query={
        "knn":knn,
        "_source":["gmap_id","name","address","description","category","avg_rating","price","hours","MISC"]
        }
    es_results=es_client.search(
        index=index_name,
        body=search_query
        )
    result_docs=[]
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])
    return result_docs
    
    

In [28]:
def question_text_vector_knn(q):
    question = q['question']
    v_q = model.encode(question)

    return elastic_search_knn('question_text_vector', v_q)

In [36]:
question_text_vector_knn(dict(
    question='which restaurants offer vegan options and wheelchair access and have a rating above 4 stars in washington dc area ',
    course='machine-learning-zoomcamp'
))

BadRequestError: BadRequestError(400, 'x_content_parse_exception', 'Failed to build [knn] after last required field arrived')

## RAG