## DATA read

In [12]:
import gzip
import json
from tqdm import tqdm
# `path` must point to a gzip-compressed JSON-lines file (one JSON object per line), e.g. 'datasets/indiana/meta-Indiana.json.gz'
def read_one_line(path, target_gmap_id='ChIJ0ZGKJp3Qa4gRZb4q4ZCv0v0'):
    """Return the first record in a gzipped JSON-lines file with a matching gmap_id.

    Args:
        path: Path to a gzip file that holds one JSON object per line.
        target_gmap_id: Value compared against each record's 'gmap_id' key.
            NOTE(review): the default does not look like the
            '0x...:0x...' ids used elsewhere in this notebook - confirm
            it is intentional.

    Returns:
        The first matching record (a dict), or the string
        "gmap_id not found." when no record matches.
    """
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        # Lazy pipeline: lines are parsed one at a time and scanning stops
        # at the first hit, so the rest of the file is never decoded.
        parsed = map(json.loads, handle)
        hit = next(
            (record for record in parsed if record.get('gmap_id') == target_gmap_id),
            None,
        )
    if hit is None:
        return "gmap_id not found."
    return hit

# Example usage: look up a single business record in the metadata file by its gmap_id.
path = 'datasets/indiana/meta-Indiana.json.gz'
target_gmap_id = '0x886b5b257d972ebb:0x29a436ce40733c04'
# Despite the name, first_line is the first record whose gmap_id matches
# (or the "gmap_id not found." string), not the first line of the file.
first_line = read_one_line(path, target_gmap_id)
print(first_line)

{'name': 'Sawmill Apartments', 'address': 'Sawmill Apartments, 3721 Lickridge Ln N Dr, Indianapolis, IN 46237', 'gmap_id': '0x886b5b257d972ebb:0x29a436ce40733c04', 'description': None, 'latitude': 39.7025668, 'longitude': -86.1041592, 'category': ['Apartment building'], 'avg_rating': 1.8, 'num_of_reviews': 36, 'price': None, 'hours': [['Thursday', '9AM–5PM'], ['Friday', '9AM–5PM'], ['Saturday', 'Closed'], ['Sunday', 'Closed'], ['Monday', '9AM–5PM'], ['Tuesday', '9AM–5PM'], ['Wednesday', '9AM–5PM']], 'MISC': {'Accessibility': ['Wheelchair accessible entrance']}, 'state': 'Open ⋅ Closes 5PM', 'relative_results': ['0x886b44b8804e5c91:0xc225b9a948999644', '0x886b5b32af64829b:0x1f7323081ef9dec', '0x886b5b31dfc591e3:0xf142a8c9eb079948', '0x886b5cf7d6555ac9:0x3c2918446aa09c7b', '0x886b5b20499a3fe7:0xdabc6214dd85f769'], 'url': 'https://www.google.com/maps/place//data=!4m2!3m1!1s0x886b5b257d972ebb:0x29a436ce40733c04?authuser=-1&hl=en&gl=us'}


In [10]:
import gzip
import json

def get_info_by_gmap_id(path, target_gmap_id):
    """Scan a gzipped JSON-lines file for the first record with the given gmap_id.

    Args:
        path: Path to a gzip file that holds one JSON object per line.
        target_gmap_id: Value compared against each record's 'gmap_id' key.

    Returns:
        The first matching record (a dict), or the string
        "gmap_id not found." when the file contains no match.
    """
    not_found = "gmap_id not found."
    with gzip.open(path, 'rt', encoding='utf-8') as stream:
        for raw in stream:
            record = json.loads(raw)
            if record.get('gmap_id') != target_gmap_id:
                continue
            # First hit wins; later records with the same id are never read.
            return record
    return not_found

# Example usage: fetch the first review in the reviews file for the given gmap_id.
path = 'datasets/indiana/review-Indiana_10.json.gz'
target_gmap_id = '0x886b5b257d972ebb:0x29a436ce40733c04'
# Only the first matching review is returned, even if the file holds several for this id.
info = get_info_by_gmap_id(path, target_gmap_id)
print(info)

{'user_id': '108987166009783498172', 'name': 'Ashley Clark', 'time': 1480971694081, 'rating': 1, 'text': "We have lived here 4 years now. Every summer, the air has to be fixed. Every winter, the heater has to be fixed. The pool is always closed and dirty. Bug issues that we have to pay for. Our water heater has caught fire before in which I had to wake my kids up to get them out to wait for fire dept. Rent goes up every year but the service of maintenance stays the same crappy service every year. There is always a leak with the water in the tub or sink. They'll fix it when they get to it and then a few months later, same problem arises. They'll charge you for everything! Can't wait to move!!! For as much as we pay for this 2 bedroom box, we could get a house. That is exactly what we'll be doing when our lease is up!", 'pics': None, 'resp': None, 'gmap_id': '0x886b5b257d972ebb:0x29a436ce40733c04'}


In [3]:
import gzip
import json
from tqdm import tqdm
from collections import defaultdict

def total_images_with_limit(path, max_images_per_gmap=50):
    """Count review pictures in a gzipped JSON-lines file, capped per gmap_id.

    Args:
        path: Path to a gzip file that holds one JSON review object per line.
        max_images_per_gmap: Upper bound on the pictures counted for any
            single gmap_id.

    Returns:
        The sum over all gmap_ids of min(pictures seen, max_images_per_gmap).
    """
    capped_counts = defaultdict(int)  # gmap_id -> pictures counted so far (never above the cap)

    with gzip.open(path, 'rt', encoding='utf-8') as stream:
        for raw in tqdm(stream):
            record = json.loads(raw)
            place_id = record.get('gmap_id')
            pics = record.get('pics')
            # Skip reviews with no usable id or no pictures attached.
            if not (place_id and pics):
                continue
            capped_counts[place_id] = min(
                capped_counts[place_id] + len(pics),
                max_images_per_gmap,
            )

    return sum(capped_counts.values())

# Example usage: count pictures in the reviews file, using the default cap of 50 per gmap_id.
path = 'datasets/indiana/review-Indiana_10.json.gz'
total_images = total_images_with_limit(path)
print(f"Total Image Count (with max 50 images per gmap_id): {total_images}")

7638803it [00:26, 287288.40it/s]

Total Image Count (with max 50 images per gmap_id): 304971





In [6]:
# NOTE(review): `matched_reviews` is not defined in any cell visible in this
# notebook, so this cell presumably relies on leftover kernel state and would
# raise NameError on a fresh "Restart & Run All" - define it above or drop the cell.
for review in matched_reviews:
    print(review)

## LlamaIndex vector index

In [39]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
)
from llama_index.vector_stores.chroma import ChromaVectorStore

from llama_index.core import Document
import gzip
import json
from tqdm import tqdm
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Embedding model used for the vector index, loaded by its Hugging Face model name.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Register it on LlamaIndex's global Settings (presumably makes it the default
# embedding model for components that are not given one explicitly - confirm).
Settings.embed_model= embed_model


def read_lines_as_json(path):
    """Parse every line of a gzipped JSON-lines file.

    Args:
        path: Path to a gzip file that holds one JSON document per line.

    Returns:
        A list with the decoded Python object for each line, in file order.
    """
    with gzip.open(path, 'rt', encoding='utf-8') as stream:
        # tqdm only wraps the iteration to show progress; each line is decoded as-is.
        return [json.loads(raw) for raw in tqdm(stream)]

# Build the retrieval corpus from the business metadata file.
path = 'datasets/indiana/meta-Indiana.json.gz'
# NOTE: despite the name, this holds parsed dicts (one per line), not strings;
# the name is kept because it is a notebook-level variable.
lines_as_strings = read_lines_as_json(path)

documents = []        # one Document per business that has an address or a name
gmap_id_to_data = {}  # gmap_id -> full metadata record, for lookups after retrieval
for t in lines_as_strings:
    if 'gmap_id' not in t:
        continue
    gmap_id_to_data[t['gmap_id']] = t
    # Prefer the address; fall back to the name. `.get` avoids a KeyError on
    # records that lack an 'address' key entirely.
    address_or_name = t.get('address') or t.get('name', '')
    if address_or_name:
        documents.append(Document(text=address_or_name,
                                  metadata={'businessId': t['gmap_id']}))

100391it [00:02, 40127.93it/s]


Chroma Vector DB

In [40]:
import chromadb
from IPython.display import Markdown, display
# create client and a new collection
chroma_client = chromadb.EphemeralClient()
try:
    chroma_collection = chroma_client.create_collection("quickstart")
except Exception:
    # Presumably hit when the cell is re-run and "quickstart" already exists:
    # drop it and start from an empty collection. `except Exception` (rather
    # than a bare `except:`) lets KeyboardInterrupt/SystemExit propagate.
    chroma_client.delete_collection("quickstart")
    chroma_collection = chroma_client.create_collection("quickstart")


# set up ChromaVectorStore and load in data
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, embed_model=embed_model
)

# Query Data
# NOTE: `query_engine` is a retriever (index.as_retriever()); `retrieve` returns
# scored nodes, not a synthesized answer. The name is kept because later cells use it.
query_engine = index.as_retriever()
response = query_engine.retrieve("Majestic security")
display(Markdown(f"{response}"))

[NodeWithScore(node=TextNode(id_='9591bf74-6472-4229-a45f-2848e22f497f', embedding=None, metadata={'businessId': '0x8811ea0bdb857eb5:0x214be6860173cce3'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f5d348a1-73b5-449d-b042-866386c24010', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'businessId': '0x8811ea0bdb857eb5:0x214be6860173cce3'}, hash='1f48bc50a184edd1522a594aa8dfed7591bf47caec663c06965849670e0a0053')}, text='Best Security Fence', mimetype='text/plain', start_char_idx=0, end_char_idx=19, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.5484047419569787), NodeWithScore(node=TextNode(id_='21a8e6ff-f564-4b41-bd89-4c3d5700cfb0', embedding=None, metadata={'businessId': '0x886b28d1f9347d89:0x4aac48254d2ae1a'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='4e3a3c5f-a2d6-499d-a727-f4bde4bdf9ab', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'businessId': '0x886b28d1f9347d89:0x4aac48254d2ae1a'}, hash='6f6aa81325214aee4e4f5e45660be01e733d7a2afcd67532de61015dab2d8055')}, text='A-1 Superior Lock', mimetype='text/plain', start_char_idx=0, end_char_idx=17, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.5382219241890683)]

In [41]:
# Retrieve the closest address/name matches, then print each hit next to its
# full metadata record (looked up through the businessId stored on the node).
response = query_engine.retrieve("where is Majestic Security")
for r in response:
    print(r.text)
    print(gmap_id_to_data[r.metadata['businessId']])

Majestic Security, 3128 Lexington Park Dr, Elkhart, IN 46514
{'name': 'Majestic Security', 'address': 'Majestic Security, 3128 Lexington Park Dr, Elkhart, IN 46514', 'gmap_id': '0x8816c4b2fb8fb6a1:0x80451636e10ca83f', 'description': None, 'latitude': 41.6899261, 'longitude': -86.02416989999999, 'category': ['Security guard service', 'Business to business service', 'Public safety office', 'Security service', 'Training centre', 'Training school', 'Transportation escort service'], 'avg_rating': 4.3, 'num_of_reviews': 48, 'price': None, 'hours': [['Thursday', '9AM–5PM'], ['Friday', '9AM–5PM'], ['Saturday', 'Closed'], ['Sunday', 'Closed'], ['Monday', '9AM–5PM'], ['Tuesday', '9AM–5PM'], ['Wednesday', '9AM–5PM']], 'MISC': None, 'state': 'Open ⋅ Closes 5PM', 'relative_results': ['0x8816e8092cc37eff:0xa138075153591bc7', '0x8816ce61cc404e23:0x71a5e9e0898036a4', '0x8816e9eb8afbc539:0x7d7ee677df3fafa3', '0x8816cd46eed45c35:0x7d80db2d3b489fc3', '0x8816ebe5c65cf3ad:0xe8dede77091f4ecf'], 'url': 'http

In [42]:
import json

# Same retrieval as above, but pretty-print each metadata record as indented JSON.
response = query_engine.retrieve("where is Majestic Security")
for r in response:
    print(r.text)
    # ensure_ascii=False keeps characters such as the en dash in "9AM–5PM"
    # readable instead of emitting \uXXXX escapes.
    formatted_json = json.dumps(
        gmap_id_to_data[r.metadata['businessId']], indent=4, ensure_ascii=False
    )
    print(formatted_json)

Majestic Security, 3128 Lexington Park Dr, Elkhart, IN 46514
{
    "name": "Majestic Security",
    "address": "Majestic Security, 3128 Lexington Park Dr, Elkhart, IN 46514",
    "gmap_id": "0x8816c4b2fb8fb6a1:0x80451636e10ca83f",
    "description": null,
    "latitude": 41.6899261,
    "longitude": -86.02416989999999,
    "category": [
        "Security guard service",
        "Business to business service",
        "Public safety office",
        "Security service",
        "Training centre",
        "Training school",
        "Transportation escort service"
    ],
    "avg_rating": 4.3,
    "num_of_reviews": 48,
    "price": null,
    "hours": [
        [
            "Thursday",
            "9AM\u20135PM"
        ],
        [
            "Friday",
            "9AM\u20135PM"
        ],
        [
            "Saturday",
            "Closed"
        ],
        [
            "Sunday",
            "Closed"
        ],
        [
            "Monday",
            "9AM\u20135PM"
        ],
 

In [46]:
from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage
# Chat with the "llama3" model through Ollama; requests time out after 120 seconds.
llm = Ollama(model="llama3", request_timeout=120.0)

messages = [
    # System prompt reworded: the original was ungrammatical and mixed up who helps whom.
    ChatMessage(
        role="system", content="You are a helpful assistant that helps users find information about businesses."
    ),
    ChatMessage(role="user", content="what is pizza hut famous for?"),
]
resp = llm.chat(messages)
print(resp)

assistant: A delicious question!

Pizza Hut, also known as Pizza Hut, is a popular international fast-food restaurant chain that specializes in pizzas and other Italian-American cuisine. Here's what they're famous for:

1. **Pan Pizza**: Their signature dish, the Pan Pizza, is a staple. It's made with a thick crust, topped with melted cheese, sauce, and various toppings.
2. **Stuffed Crust**: A unique feature that sets them apart from other pizza chains. The crust is hollowed out and filled with seasonings or cheeses, adding an extra layer of flavor to the pizza.
3. **WingStreet**: Pizza Hut's wing menu offers a variety of flavors, including classic buffalo, honey mustard, and more. Their wings are known for being crispy on the outside and juicy on the inside.
4. **Delivery**: Pizza Hut is one of the pioneers in the pizza delivery market. They have a vast network of restaurants that offer delivery services, making it convenient for customers to enjoy their food from the comfort of thei

## RAG baseline

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import numpy as np

class LocationRAGModel:
    """Retrieval-augmented answering over business metadata and user reviews.

    Workflow: `load_data` fills the in-memory stores, `search_businesses`
    ranks businesses against a query by embedding similarity,
    `format_prompt` renders the hits as text, and `generate_answer` feeds
    that prompt to the causal language model.

    NOTE: `load_data` uses the `gzip` and `json` modules, which this
    notebook imports in earlier cells rather than in the cell defining
    this class.
    """

    def __init__(self):
        self.initialize_models()
        self.business_data = {}  # gmap_id -> business metadata record (dict)
        self.reviews_data = {}   # gmap_id -> list of review records (dicts)

    def initialize_models(self):
        """Load the tokenizer, the causal LM and the sentence embedder from local model paths."""
        # Initialize language model
        self.model_name = "models/meta-llama/Meta-Llama-3-8B-Instruct"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.llm = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto", torch_dtype=torch.float16)

        # Initialize sentence transformer (GPU when available, otherwise CPU)
        self.sentence_model = SentenceTransformer(
            "models/sentence-transformers/all-MiniLM-L6-v2",
            device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
        )

    def load_data(self, business_path, reviews_path):
        """Load business metadata and reviews from gzipped JSON-lines files.

        Args:
            business_path: gzip file with one business record per line.
            reviews_path: gzip file with one review record per line.

        Records without a 'gmap_id' are skipped, since they cannot be keyed.
        A later business record with the same gmap_id replaces the earlier
        one; reviews are appended per gmap_id in file order.
        """
        # Load business metadata
        with gzip.open(business_path, 'rt', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                gmap_id = data.get('gmap_id')
                if gmap_id is None:
                    continue
                self.business_data[gmap_id] = data

        # Load user reviews
        with gzip.open(reviews_path, 'rt', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                gmap_id = data.get('gmap_id')
                if gmap_id is None:
                    continue
                self.reviews_data.setdefault(gmap_id, []).append(data)

    def calculate_embeddings(self, sentences):
        """Encode `sentences` with the sentence model, requesting normalized embeddings."""
        return self.sentence_model.encode(
            sentences=sentences,
            normalize_embeddings=True,
            batch_size=32
        )

    @staticmethod
    def _business_text(business):
        """Build the text embedded for one business: 'name - first category - description'.

        Missing or empty parts are left out, so a record whose description
        is None is not embedded with the literal word "None".
        """
        categories = business.get('category') or []
        parts = [
            business.get('name'),
            categories[0] if categories else None,
            business.get('description'),
        ]
        return " - ".join(str(part) for part in parts if part)

    def search_businesses(self, query, top_k=5):
        """Return up to `top_k` business records most similar to `query`.

        Args:
            query: Free-text search string.
            top_k: Maximum number of results; values <= 0 yield an empty list.

        Returns:
            Business records ordered from most to least similar. Empty when
            no business data has been loaded.
        """
        # Materialize the records once; they are indexed repeatedly below.
        businesses = list(self.business_data.values())
        if top_k <= 0 or not businesses:
            return []

        query_embedding = self.calculate_embeddings([query])[0]

        # NOTE: every business is re-embedded on each call; cache these
        # embeddings if many queries are run against the same data.
        business_texts = [self._business_text(b) for b in businesses]
        business_embeddings = self.calculate_embeddings(business_texts)

        # Dot product as the similarity score (embeddings are requested normalized).
        similarities = np.dot(business_embeddings, query_embedding)
        # Reverse first, then slice: unlike `[-top_k:]`, this cannot select
        # the whole array by accident.
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [businesses[i] for i in top_indices]

    def format_prompt(self, query, relevant_businesses):
        """Render the query and the retrieved businesses as a text prompt.

        Each business contributes its name, categories ("Unknown" when none
        are recorded), rating, address and, when available, the first 100
        characters of its first review that has text.
        """
        prompt = f"Query: {query}\n\nRelevant Businesses:\n"
        for business in relevant_businesses:
            categories = business.get('category') or []
            category_text = ', '.join(categories) if categories else 'Unknown'

            prompt += f"- Name: {business['name']}\n"
            prompt += f"  Category: {category_text}\n"
            prompt += f"  Rating: {business['avg_rating']} ({business['num_of_reviews']} reviews)\n"
            prompt += f"  Address: {business['address']}\n\n"

            # Add a sample review if available. Reviews may carry no text,
            # so take the first one that does instead of blindly using index 0.
            reviews = self.reviews_data.get(business['gmap_id'], [])
            sample_text = next((r['text'] for r in reviews if r.get('text')), None)
            if sample_text:
                prompt += f"  Sample Review: {sample_text[:100]}...\n\n"

        prompt += "Based on the above information, provide a concise recommendation or answer to the query."
        return prompt

    def generate_answer(self, query):
        """Retrieve businesses for `query`, build the prompt and sample an answer from the LM.

        Returns:
            The generated continuation only (prompt tokens removed), stripped
            of surrounding whitespace. Sampling is enabled, so the output is
            not deterministic.
        """
        relevant_businesses = self.search_businesses(query)
        prompt = self.format_prompt(query, relevant_businesses)

        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.llm.device)

        with torch.no_grad():
            output = self.llm.generate(
                input_ids,
                max_new_tokens=75,
                do_sample=True,
                top_p=0.9,
                temperature=0.7
            )

        # Decode only the tokens that follow the prompt.
        answer = self.tokenizer.decode(output[0, input_ids.shape[1]:], skip_special_tokens=True)
        return answer.strip()

# Usage example
rag_model = LocationRAGModel()
# Use the same dataset locations as the cells above that were actually executed;
# the bare file names would only resolve if the files sat in the working directory.
rag_model.load_data(
    'datasets/indiana/meta-Indiana.json.gz',
    'datasets/indiana/review-Indiana_10.json.gz',
)

query = "What's a good security service in Elkhart, Indiana?"
answer = rag_model.generate_answer(query)
print(answer)