In [10]:
import os
import json
import requests
import random
import validators

from dotenv import load_dotenv
from tqdm import tqdm

In [2]:
!pip install accelerate --upgrade
!pip install -i https://pypi.org/simple/ bitsandbytes

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.30.1
    Uninstalling accelerate-0.30.1:
      Successfully uninstalled accelerate-0.30.1
Successfully installed accelerate-0.31.0
Looking in indexes: https://pypi.org/simple/
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1


#### Data crawling frontier

To create a high-recall search engine related to Tübingen, we need a targeted approach to crawling information from the web. If we were to randomly pick a document from the web, the probability of it being related to Tübingen is practically 0. Starting with a few well-known websites (e.g: facebook, etc) will not be effective, since these websites host information about the entire world. We will not 'come across' Tübingen-related information easily. 

So, our goal is to first ensure we collect only information about Tübingen, with as little effort (in terms of number of pages visited) as possible. As we saw in `retrieve_map_data.py`, the Google Maps API is restricted to only 20 results per API call, rendering it hard to use to create a frontier of all Tübingen related items. Hence, we now try to gather information about Tübingen from OpenStreetMaps. OpenStreetMaps has an API that accepts a bounding box and returns all [nodes](https://wiki.openstreetmap.org/wiki/Elements#Node) within that bounding box. 

Below, we filter out important nodes that people might want to search about.

In [247]:
# A list of node "types" we want to retrieve from open street maps.
# Reference: https://wiki.openstreetmap.org/wiki/Map_features
place_types = [
    'amenity',
    'building',
    'craft',
    'emergency',
    'healthcare',
    'historic',
    'leisure',
    #'natural',
    'office',
    'public_transport',
    'shop',
    'sport',
    'tourism'
]

# The coordinates for a bounding box around Tübingen and surrounding areas.
min_lat, min_lon, max_lat, max_lon = 48.4524, 8.921, 48.5659, 9.1394

total_nodes = 0
node_data = {}

for place_type in place_types:

#  Construct the Overpass QL query for nodes of a particular type
    overpass_url = "http://overpass-api.de/api/interpreter"
    overpass_query = f"""
[out:json];
node[{place_type}]({min_lat},{min_lon},{max_lat},{max_lon});
out body;
"""

    # Execute the query
    response = requests.post(overpass_url, data=overpass_query).json()

    node_data[place_type] = response['elements']
    total_nodes += len(response['elements'])

# Dump the node data to a JSON file.
with open('../data/places_OSM.json', 'w') as fp:
    json.dump(node_data, fp, indent=4)

print(f"Gathered {total_nodes} nodes in and around Tübingen!")
print("%20s %10s"%('Category', '# Nodes'))
for place_type in place_types:
    print("%20s %10s"%(place_type, len(node_data[place_type])))


Gathered 11902 nodes in and around Tübingen!

            Category    # Nodes

             amenity       5726

            building         52

               craft        159

           emergency       1428

          healthcare        292

            historic        375

             leisure        257

              office        363

    public_transport       1261

                shop        828

               sport         80

             tourism       1081


#### Node metadata

We see that some nodes already have useful metadata that can be crawled or looked up. We segregate such nodes from nodes that require further processing.

In [7]:
# Load the node data from the JSON file.
with open('../data/places_OSM.json', 'r') as fp:
    node_data = json.load(fp)

all_nodes = sum(node_data.values(), [])   # Put all types of nodes into one list

useful_tags = [
    'name',
    'url',
    'contact:website',
    'website',
    'operator',
]

nodes_with_info = [x for x in all_nodes if len( set(useful_tags) & set(x['tags']) ) != 0]
nodes_without_info = [x for x in all_nodes if len( set(useful_tags) & set(x['tags']) ) == 0]

print(len(nodes_with_info))

5254


#### Difficult nodes

We see that some nodes do not have any useful information, but might be searched by a user. Let us use an LLM to generate keywords for such nodes.

In [72]:
random.choice(nodes_without_info)

{'type': 'node',
 'id': 9492154953,
 'lat': 48.5173896,
 'lon': 9.0616433,
 'tags': {'bicycle': 'yes',
  'colour:back': 'white',
  'colour:text': 'green',
  'direction': '65',
  'direction_northwest': 'Tü-Altstadt 0.8km;Tübingen Hbf 0.4km',
  'direction_southeast': 'Reutlingen 15km;Tü-Südstadt 1.1km',
  'guidepost:type': 'table',
  'information': 'guidepost',
  'network': 'lcn',
  'tourism': 'information'}}

We write a prompt providing an LLM with the node details, and asking it to generate some keywords if possible.

In [5]:
def generate_prompt_from_node(node):
    """
    Create a prompt that can be used to extract information about an OpenStreetMap Node. 
    
    Argument
    --------
    node: dict, a Node structure obtained from OSM.
    
    Returns
    -------
    str, the prompt asking an LLM for information about the node.
    """
    
    return f"""
I am trying to search for any searchable points in a city, Tübingen. I have retrieved the following information about a point: 

{node}


Please decide if the node is important, if yes, please give me one or two keywords I can use to Google search for this point. If not, simply return "Not Applicable". Please do not print anything else (not even explanations)! Only the keyword, or "Not Applicable". Do not create new lines. Simply output the keywords.
Response:"""

prompts = list(map(generate_prompt_from_node, nodes_without_info))

In [6]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3", device_map="cuda", load_in_4bit=True
)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", padding_side="left")

tokenizer.pad_token = tokenizer.eos_token  # Most LLMs don't have a pad token by default

model_inputs = tokenizer(
    prompts[:2], return_tensors="pt", padding=True
).to("cuda")
            
generated_ids = model.generate(max_new_tokens = 20, **model_inputs)
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
2024-06-20 21:03:31.397363: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-20 21:03:31.397490: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-20 21:03:31.515815: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
outputs[1]

'\nI am trying to search for any searchable points in a city, Tübingen. I have retrieved the following information about a point: \n\n{\'type\': \'node\', \'id\': 30933690, \'lat\': 48.5319164, \'lon\': 9.0822537, \'tags\': {\'amenity\': \'recycling\', \'check_date:recycling\': \'2023-09-07\', \'opening_hours\': \'Mo-Fr 09:00-17:00\', \'recycling:glass_bottles\': \'yes\', \'recycling_type\': \'container\'}}\n\n\nPlease decide if the node is important, if yes, please give me one or two keywords I can use to Google search for this point. If not, simply return "Not Applicable". Please do not print anything else (not even explanations)! Only the keyword, or "Not Applicable". Do not create new lines. Simply output the keywords.\nResponse:\nRecycling Center, Container Depot'

#### Keyword Generation

We now try to extract all possible keywords. Note that, we do not have any other information about these nodes, so we can simply maintain a set of all possible search keywords extracted from these nodes.

In [None]:
batch_size = 64
keywords = set()
all_outputs = []

def parse_response(response):
    """
    Parse the response of an LLM, and obtain the search keywords.
    
    Arguments
    ---------
    response: str, the LLM response
    
    Returns
    -------
    str, the search keyword (return "" in case of parse error or "Not Applicable" response)
    """
    
    response = response.split('Response:')[1]
    
    if 'Not Applicable' in response:
        return ""
    
    if 'Explanation:' in response:
        response = response.split('Explanation:')[0]
    return response

for i in tqdm(range(101, 103)):  #len(nodes_without_info)//batch_size
    # process this batch of nodes - obtain keywords, update our keyword set.
    batch_prompts = prompts[batch_size * i: batch_size * (i + 1)]
    
    model_inputs = tokenizer(
        batch_prompts, return_tensors="pt", padding=True
    ).to("cuda")
    
    generated_ids = model.generate(max_new_tokens = 20, **model_inputs)
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    all_outputs.extend(outputs)
    keywords.update(map(parse_response, outputs))

print(keywords)

In [15]:
keywords

{'',
 '\n\n"Bench, Inscription"',
 '\n\n"Bench, Tübingen"',
 '\n\n"Bench, Wooden"',
 '\n\n"Concrete bench, Tübingen"',
 '\n\n"Defibrillator location Tübingen"\n"Emergency defibr',
 '\n\n"Defibrillator location, Volksbank Tübingen"',
 '\n\n"Fountain, Tübingen"',
 '\n\n"Medieval archaeological site, Tübingen"',
 '\n\n"Metal bench, Tübingen"',
 '\n\n"Outdoor furniture, Park bench"',
 '\n\n"Outdoor seating, Wooden bench"',
 '\n\n"Park bench, Tübingen"',
 '\n\n"Park bench, Wooden bench"',
 '\n\n"Public Fountain (No Drinking Water)"',
 '\n\n"Public seating, Tübingen"',
 '\n\n"Sculpture, Vogel"',
 '\n\n"Sitzbank, Tübingen"',
 '\n\n"Statue of Nymph, Tübingen art"',
 '\n\n"Tübingen, Germany, Accessible, Emergency"',
 '\n\n"Waldtoilette Tübingen"\n"Ökologisches',
 '\n\nArtwork, Historic',
 '\n\nBicycle parking, Lockers (with fee)',
 '\n\nBicycle parking, Safe loops parking Tübingen',
 '\n\nCigarette vending machine, Tübingen',
 '\n\nCoffee Shop, Snack Bar',
 '\n\nDecorative Fountain\n\n',
 '\n\n

In [20]:
with open('../data/llm_keywords.json', 'w') as fp:
    json.dump(list(keywords), fp)

In [8]:
with open('../data/llm_keywords.json', 'r') as fp:
    keywords = json.load(fp)

print(f"Found {len(keywords)} unique keywords!")

def clean_search_keywords(query):
    """
    Clean up the query to remove newlines
    """

    query = query.strip()
    return query

keywords = list(map(clean_search_keywords, keywords))
keywords


Found 692 unique keywords!


['',
 'Monument, Memorial Plaque',
 'recycling container, glass bottles',
 'Parking Underground Entrance, Tübingen',
 '"Waldtoilette Tübingen"\n"Ökologisches',
 'Parking, Tübingen',
 'Memorial, Seifert Brunnen',
 'Outdoor Seating Area, Public Bench',
 'shrine, Tübingen memorial',
 'Park bench, Outdoor seating',
 'picnic_spot, outdoor_seating',
 'table tennis court, outdoor sports facility',
 'lounge, Cafe',
 'Outdoor Seating, Park Bench',
 'Artwork, Relief',
 'defibrillator, Tübingen city hall',
 'table tennis court Tübingen',
 'bicycle parking, stands (Tübingen)',
 'Recycling container, Glass bottles recycling',
 'Outdoor furniture, Concrete bench',
 'Outdoor seating, Bench',
 'Recycling Center, Paper Recycling',
 'vending machine, dog station',
 '"Defibrillator location, Volksbank Tübingen"',
 'Parking Entrance/Exit, Tübingen',
 'Tübingen viewpoint, Tübingen direction SE-NW',
 'Post office, Mailbox',
 'Recycling Container, Paper Recycling',
 'Parking, Public Space',
 'Recycling Cente

#### Search Keywords

We now have the following:
1. ~700 keywords generated from unnamed nodes in and around Tübingen.
2. ~6K Named nodes, some of them with URLs.

We now need to use this information to create a large frontier of URLs for our crawler to crawl from. We start by aggregating all of our search keywords into a file. We also dump all direct URLs into a file for later use.

In [16]:
all_search_keywords = set()
frontier_urls = set()

all_search_keywords.update(keywords)


for node in nodes_with_info:

    # If this node has a name, add it to the search keywords list.
    if 'name' in node['tags']:
        all_search_keywords.add(node['tags']['name'] + ' Tübingen')

    if 'operator'in node['tags']:
        all_search_keywords.add(node['tags']['operator'] + ' Tübingen')

    # If this node has any reference URLs, add it to the frontier as well.
    for value in node['tags'].values():
        if validators.url(value):
            frontier_urls.add(value)


print(f"Final count of search keywords: {len(all_search_keywords)}")
print(f"Count of initial fontier obtained from direct URLs: {len(frontier_urls)}")

Final count of search keywords: 4715
Count of initial fontier obtained from direct URLs: 1415


In [17]:
# Save all the search keywords and URLs in JSON files.

with open('../data/search_keywords.json', 'w') as fp:
    json.dump(list(all_search_keywords), fp)

with open('../data/frontier_urls_initial.json', 'w') as fp:
    json.dump(list(frontier_urls), fp)