In [48]:
%pip install overturemaps lonboard geopandas shapely
%pip install google.generativeai

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [49]:
%cd Helper-functions
%run category-tree.py
%cd ..

c:\Users\chaav\OneDrive\Documents\GitHub\POI-Category-Automation\Helper-functions
Wrote category_tree.json with 22 top-level entries.
c:\Users\chaav\OneDrive\Documents\GitHub\POI-Category-Automation


In [50]:
import overturemaps as om
from overturemaps import core
import overturemaps
import pandas
import geopandas as gpd
from shapely import wkb
from lonboard import Map, PolygonLayer, ScatterplotLayer
import ipywidgets as widgets
import numpy as np
from IPython.display import display
import json
import requests
from bs4 import BeautifulSoup
import time
import concurrent.futures
import tqdm

In [51]:
with open('Data/category_tree.json', 'r') as f:
    category_tree = json.load(f)

In [52]:
def create_map(dataset):
    layer = ScatterplotLayer.from_geopandas(
        dataset,
        get_fill_color=[255, 0, 0],
        radius_min_pixels=5,
    )

    view_state = {
        "longitude": (bbox[0] + bbox[2]) / 2,
        "latitude": (bbox[1] + bbox[3]) / 2,
        "zoom": 8,
        "pitch": 45,
    }
    m = Map(layer, view_state=view_state)
    return m

In [53]:
def get_most_detailed_category(current_category, poi_metadata, tree):
    # Base Case: No further sub-categories
    if current_category not in tree or not tree[current_category]:
        return current_category  

    sub_categories = tree[current_category]

    # Example Matching Logic: Keyword search in POI name/description
    for sub_cat in sub_categories:
        if sub_cat.replace('-', ' ') in poi_metadata.lower():
            return get_most_detailed_category(sub_cat, poi_metadata, tree)

    # If no match found, return current category
    return current_category

In [54]:
def get_first_website(websites_list):
    if websites_list and isinstance(websites_list, list) and len(websites_list) > 0 and websites_list[0] and isinstance(websites_list[0], str):
        return websites_list[0]
    return ""

In [55]:
def scrape_website(url, timeout = 5):
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.title.string if soup.title else 'No title found'
        meta_desc =  ""
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag:
            meta_desc = meta_tag.get('content', '')

        h1_tag = [tag.get_text(strip=True) for tag in soup.find_all('h1')]

        screaped_text = f"Title: {title}. Meta Description: {meta_desc}. Heading: {';'.join(h1_tag)}"

        return screaped_text
    
    except Exception as e:
        print(f"Failed to Scrape: {url}: {e}")
        return ""

In [56]:
# specify bounding box
bbox = 9.0894, 45.5042, 9.1094, 45.5172

In [57]:
place_dataset = core.geodataframe("place", bbox=bbox)
print(place_dataset.shape)

(251, 15)


In [58]:
# Read the categories.txt file
with open('Data/categories.txt', 'r') as file:
    categories_data = file.readlines()
categories_data = categories_data[1:]

# Split the data by semicolons and extract the part before the semi-colon
categories_list = [entry.split(';')[0] for entry in categories_data]

In [59]:
# Pulling out the places and websites
websites = {}
number = 0
for i in range(len(place_dataset.id)):
    # print(f"Place: {place_dataset.names[i]['primary']}")
    number += 1
    websites[place_dataset.names[i]['primary']] = place_dataset.websites[i]

# Save all websites as a comma-separated string
websites_str = " , ".join(
    site[0] if site is not None and len(site) > 0 else "No website Found"
    for site in websites.values()
)

# Test Features

# print(websites_str)
print(number)

251


In [60]:
# Pulling out the places' social media links
socials_dict = {}
for i in range(len(place_dataset.id)):
    place_name = place_dataset.names[i]['primary']
    # Assuming place_dataset.socials[i] contains the social media information
    # This might be a list of strings, a dictionary, or None
    social_links = place_dataset.socials[i]
    socials_dict[place_name] = social_links

# Print the extracted social media links
# print(socials_dict) 

# Example: Print socials for the first few places to check the structure
# count = 0
# for place, links in socials_dict.items():
#     if count < 5: # Print for the first 5 places
#         print(f"Place: {place}, Socials: {links}")
#         count += 1
#     else:
#         break

# If you want a string representation similar to websites_str:
# This part depends heavily on the actual structure of place_dataset.socials[i]
# Assuming it's a list of strings (URLs) or can be easily converted to a string:
socials_str_list = []
for place_name, social_links_list in socials_dict.items():
    if social_links_list:
        # If social_links_list is a list of strings:
        if isinstance(social_links_list, list) and all(isinstance(link, str) for link in social_links_list):
            links_str = ", ".join(social_links_list)
        # If social_links_list is a dictionary (e.g., {'facebook': 'url', 'instagram': 'url'}):
        elif isinstance(social_links_list, dict):
            links_str = ", ".join([f"{platform}: {url}" for platform, url in social_links_list.items()])
        # Add more conditions if other structures are possible or convert to string directly
        else:
            links_str = str(social_links_list) # Fallback to string conversion
        socials_str_list.append(f"{place_name}: [{links_str}]")
    else:
        socials_str_list.append(f"{place_name}: No socials found")

# print("\nFormatted Socials String:")
for item in socials_str_list[:14]: # Print for the first 5 places
    print(item)

Parco Nicolas: [['https://www.facebook.com/110607897987764']]
Edimol Sas: [['https://www.facebook.com/693284050783126']]
Casual Store money transfer: [['https://www.facebook.com/132504977165630']]
Molino Dorino: [['https://www.facebook.com/451512508382062']]
Atlassib: [['https://www.facebook.com/305979330228103']]
CSC - Casa salvadorena Cultural: [['https://www.facebook.com/1484762838421631']]
Prodent Italia: [['https://www.facebook.com/1346616508739611']]
Saldoplast: [['https://www.facebook.com/203914236705564']]
Swiss DolorClast Italia: [['https://www.facebook.com/378005672556733']]
Mud Museum - Open Space: [['https://www.facebook.com/171269036417134']]
PR No Ordinary Training Studio: [['https://www.facebook.com/479433592451814']]
Multiprint: [['https://www.facebook.com/1670753372951449']]
LR Health & Beauty Italia: [['https://www.facebook.com/147643092068177']]
Team Russo: [['https://www.facebook.com/102509945702311']]


In [61]:
scraped_socials_content = {}
tasks_for_social_scraping = []

# Prepare tasks for scraping
for place_name, social_links_list in socials_dict.items():
    if social_links_list is not None:
        # Ensure social_links_list is iterable (e.g., numpy array or list)
        if isinstance(social_links_list, (list, np.ndarray)):
            for social_url in social_links_list:
                if isinstance(social_url, str) and social_url.startswith(('http://', 'https://')):
                    tasks_for_social_scraping.append((place_name, social_url))
        elif isinstance(social_links_list, str) and social_links_list.startswith(('http://', 'https://')): # Handle case where it might be a single string
            tasks_for_social_scraping.append((place_name, social_links_list))


total_social_urls = len(tasks_for_social_scraping)
scraped_social_urls_count = 0
progress_step_social = max(1, total_social_urls // 20) if total_social_urls > 0 else 1

def scrape_social_url_wrapper(args):
    global scraped_social_urls_count
    place_name, url = args
    scraped_content = scrape_website(url) # Using the existing scrape_website function
    
    # Increment and print progress
    scraped_social_urls_count += 1
    if total_social_urls > 0 and (scraped_social_urls_count % progress_step_social == 0 or scraped_social_urls_count == total_social_urls):
        percent = (scraped_social_urls_count / total_social_urls) * 100
        print(f"Scraped {scraped_social_urls_count}/{total_social_urls} ({percent:.1f}%) social URLs.")
        
    return place_name, url, scraped_content if scraped_content else "Scraping Failed or No Content"

print(f"Found {total_social_urls} social URLs to scrape.")

# Use ThreadPoolExecutor for concurrent scraping
with concurrent.futures.ThreadPoolExecutor(max_workers=3000) as executor: # Adjusted max_workers for social media sites
    social_results = list(executor.map(scrape_social_url_wrapper, tasks_for_social_scraping))

# Store the results
for place_name, url, content in social_results:
    if place_name not in scraped_socials_content:
        scraped_socials_content[place_name] = []
    scraped_socials_content[place_name].append({url: content})

print("\nScraping of social URLs complete.")
print("Scraped Social Media Content:")
# Print the scraped social media content (first few for brevity)
count_printed = 0
for place, contents in scraped_socials_content.items():
    if count_printed < 10: # Limit printing for brevity
        print(f"\nPlace: {place}")
        for content_dict in contents:
            for url_key, text_content in content_dict.items():
                print(f"  URL: {url_key}")
                print(f"  Content: {text_content[:200]}...") # Print first 200 chars
        count_printed +=1
    else:
        print(f"\n... and {len(scraped_socials_content) - count_printed} more places.")
        break
        
if not scraped_socials_content:
    print("No social media content was scraped.")

# New section to print results in the requested format
print("\n--- Formatted Scraped Socials Information (Name: Scraped Content from Socials) ---")
formatted_social_info_output_list = []

if isinstance(scraped_socials_content, dict):
    for place_name_key, list_of_content_dicts in scraped_socials_content.items():
        # list_of_content_dicts is a list of dicts, e.g., [{'url1': 'scraped_text1'}, {'url2': 'scraped_text2'}]
        
        all_scraped_texts_for_this_place = []
        for single_url_content_dict in list_of_content_dicts:
            # single_url_content_dict is e.g., {'url1': 'scraped_text1'}
            # We need the values (the scraped text strings)
            for scraped_text_item in single_url_content_dict.values():
                if scraped_text_item and isinstance(scraped_text_item, str): # Ensure it's a non-empty string
                    all_scraped_texts_for_this_place.append(scraped_text_item)
        
        # Concatenate all scraped texts for this place, separated by " | "
        # This aggregated_social_info is the "socials-scraped-Information"
        aggregated_social_info = " | ".join(all_scraped_texts_for_this_place)
        
        # Escape double quotes in place_name_key and aggregated_social_info to ensure valid string literals in the output
        place_name_escaped = str(place_name_key).replace('"', '\\"')
        aggregated_social_info_escaped = aggregated_social_info.replace('"', '\\"')
        
        formatted_line = f'"{place_name_escaped}": "{aggregated_social_info_escaped}"'
        print(formatted_line)
        formatted_social_info_output_list.append(formatted_line)

    if not formatted_social_info_output_list:
        print("No social media content was available in scraped_socials_content to format.")
else:
    print("Variable 'scraped_socials_content' is not a dictionary. Cannot produce formatted social info.")

Found 185 social URLs to scrape.
Scraped 9/185 (4.9%) social URLs.
Scraped 18/185 (9.7%) social URLs.
Scraped 27/185 (14.6%) social URLs.
Scraped 36/185 (19.5%) social URLs.
Scraped 45/185 (24.3%) social URLs.
Scraped 54/185 (29.2%) social URLs.
Scraped 63/185 (34.1%) social URLs.
Scraped 72/185 (38.9%) social URLs.
Scraped 81/185 (43.8%) social URLs.
Scraped 90/185 (48.6%) social URLs.
Scraped 99/185 (53.5%) social URLs.
Scraped 108/185 (58.4%) social URLs.
Scraped 117/185 (63.2%) social URLs.
Scraped 126/185 (68.1%) social URLs.
Scraped 135/185 (73.0%) social URLs.
Scraped 144/185 (77.8%) social URLs.
Scraped 153/185 (82.7%) social URLs.
Scraped 162/185 (87.6%) social URLs.
Scraped 171/185 (92.4%) social URLs.
Scraped 180/185 (97.3%) social URLs.
Scraped 185/185 (100.0%) social URLs.

Scraping of social URLs complete.
Scraped Social Media Content:

Place: Parco Nicolas
  URL: https://www.facebook.com/110607897987764
  Content: Title: Parco del Castello di Verde. Meta Description: . H

In [62]:
# Join the list of formatted social info strings into a single comma-separated string
scraped_socials_string = ", ".join(formatted_social_info_output_list)
print(scraped_socials_string)

"Parco Nicolas": "Title: Parco del Castello di Verde. Meta Description: . Heading: ", "Edimol Sas": "Title: Log into Facebook. Meta Description: Log into Facebook to start sharing and connecting with your friends, family, and people you know.. Heading: ", "Casual Store money transfer": "Title: Log into Facebook. Meta Description: Log into Facebook to start sharing and connecting with your friends, family, and people you know.. Heading: ", "Molino Dorino": "Title: Molino Dorino. Meta Description: . Heading: ", "Atlassib": "Title: Atlassib. Meta Description: . Heading: ", "CSC - Casa salvadorena Cultural": "Title: CSC - Casa salvadorena Cultural | Milan . Meta Description: CSC - Casa salvadorena Cultural, Milano. 626 likes · 33 were here. PARA NUSTRA JUVENTUD ITALO-SALVADORENA. NACE ESTE BONITO PROYECTO DE INTEGRACION DE.... Heading: ", "Prodent Italia": "Title: Log into Facebook. Meta Description: Log into Facebook to start sharing and connecting with your friends, family, and people yo

In [63]:
# Create a string with "place: website" for each entry
place_website_str = " , ".join(
    f"{place}: {websites[place][0] if websites[place] is not None and len(websites[place]) > 0 else 'No website Found'}"
    for place in websites
)
print(place_website_str)

Parco Nicolas: No website Found , Edimol Sas: No website Found , Casual Store money transfer: http://www.casualstore.net/ , Molino Dorino: https://www.atm.it/ , Atlassib: No website Found , CSC - Casa salvadorena Cultural: No website Found , Prodent Italia: http://www.prodentitalia.eu/ , Saldoplast: http://www.saldoplast.it/ , Swiss DolorClast Italia: https://www.ems-dolorclast.com/it , Mud Museum - Open Space: http://www.museodelfango.it/ , PR No Ordinary Training Studio: http://Www.noordinarytraining.com/ , Multiprint: http://www.multiprintitalia.it/ , LR Health & Beauty Italia: http://www.lrworld.com/ , Team Russo: https://www.remax.it/trova/agenti-agenzie/agente/vincenzo-russo , Malià: https://www.malia-officinadellasarta.com/ , 5Min S&S: https://5-min.shop/ , Tempocasa Pero: https://www.tempocasapero.it/ , Pero: No website Found , Autoscuola Pero S.a.s: No website Found , Farmacia Vittoria: No website Found , Eurogreen Fiori:  , Western Union: https://www.westernunion.com/it/it/ho

In [64]:
scraped_count = 0
total_websites = len(websites)
progress_step = max(1, total_websites // 20)  # 5% step

def scrape_site_wrapper(args):
    global scraped_count
    place, site = args
    if site is not None and len(site) > 0 and site[0] != "No website Found":
        try:
            scraped_content = scrape_website(site[0])
            result = (place, scraped_content if scraped_content else "Scraping Failed")
        except Exception:
            result = (place, "Scraping Failed")
    else:
        result = (place, "No website Found")
    scraped_count += 1
    if scraped_count % progress_step == 0 or scraped_count == total_websites:
        percent = (scraped_count / total_websites) * 100
        print(f"{scraped_count}/{total_websites} ({percent:.1f}%) websites scraped")
    return result

with concurrent.futures.ThreadPoolExecutor(max_workers=3000) as executor:
    results = list(executor.map(scrape_site_wrapper, websites.items()))

scraped_websites = dict(results)
print(scraped_websites)


Failed to Scrape: http://www.casualstore.net/: HTTPConnectionPool(host='www.casualstore.net', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x00000206073EF3B0>: Failed to resolve 'www.casualstore.net' ([Errno 11001] getaddrinfo failed)"))
Failed to Scrape: : Invalid URL '': No scheme supplied. Perhaps you meant https://?
12/242 (5.0%) websites scraped
Failed to Scrape: http://Www.noordinarytraining.com/: HTTPConnectionPool(host='www.noordinarytraining.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x000002060AF9EB70>: Failed to resolve 'www.noordinarytraining.com' ([Errno 11001] getaddrinfo failed)"))
Failed to Scrape: http://www.actionshooting.eu/: HTTPConnectionPool(host='www.actionshooting.eu', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x000002060A50D760>:

In [65]:
category_keywords = {
    # Accommodation
    "accommodation-hotel":          ["hotel", "albergo"],
    "accommodation-hostel":         ["hostel", "ostello"],
    "accommodation-motel":          ["motel"],
    "accommodation-resort":         ["resort"],
    "accommodation-bed_and_breakfast": ["bed and breakfast", "b&b"],

    # Food & Drink
    "eat_and_drink-restaurant":     ["restaurant", "ristorante", "trattoria", "osteria", "pizzeria", "pizza"],
    "eat_and_drink-cafe":           ["cafe", "caffè", "coffee shop", "caffetteria"],
    "eat_and_drink-bar":            ["bar", "pub", "wine bar", "enoteca"],
    "retail-food-bakery":           ["bakery", "panificio", "forno"],
    "retail-food-ice_cream_shop":   ["ice cream", "gelateria"],
    "retail-food-pastry_and_cake_shop": ["pastry shop", "pasticceria"],
    "eat_and_drink-fast_food":      ["fast food", "takeaway", "kebab", "shawarma", "sushi", "taco", "burger", "hot dog"],
    "eat_and_drink-tea_room":       ["tea room", "tearoom"],
    "eat_and_drink-juice_bar":      ["juice bar", "smoothie"],

    # Retail
    "retail":                       ["shop", "store", "market", "boutique", "emporio", "negozio"],
    "retail-food-supermarket":      ["supermarket", "grocery", "ipermercato"],
    "health_and_medical-pharmacy":  ["pharmacy", "farmacia"],
    "retail-books_stationery_music_and_film-book_shop": ["bookstore", "libreria", "librairie"],
    "retail-clothing_and_accessories-clothing_store":    ["clothing store", "abbigliamento", "fashion"],
    "retail-clothing_and_accessories-jewelry_and_watch_store": ["jewelry", "gioielleria"],
    "retail-clothing_and_accessories-shoe_store":        ["shoe store", "calzature"],
    "retail-toys_and_games_store":     ["toy store", "giocattoli"],
    "retail-home_and_garden-florist":  ["flower shop", "florist", "fioraio"],
    "retail-electronics-consumer_electronics_store": ["electronics store", "elettronica"],
    "retail-home_and_garden-hardware_store": ["hardware store", "ferramenta"],
    "retail-department_store":         ["department store", "grande magazzino"],
    "retail-shopping_center_and_mall":["mall", "shopping center", "centro commerciale"],
    "retail-beverage_store-wine_and_spirits_store": ["wine shop", "enoteca"],
    "retail-books_stationery_music_and_film-newsagent_and_kiosk": ["newsstand", "edicola"],
    "retail-tobacconist":              ["tobacco shop", "tabaccheria", "tabacchi"],

    # Arts & Entertainment
    "attractions_and_activities-museum":     ["museum", "museo"],
    "arts_and_entertainment-movie_theater": ["cinema", "movie theater"],
    "arts_and_entertainment-performing_arts_theater":["theater", "teatro"],
    "attractions_and_activities-art_gallery": ["art gallery", "galleria d'arte"],
    "arts_and_entertainment-topic_concert_venue":["music venue", "concert hall"],
    "arts_and_entertainment-night_club":    ["nightclub", "discoteca", "disco"],
    "attractions_and_activities-amusement_park":["amusement park", "theme park"],
    "attractions_and_activities-aquarium":   ["aquarium", "acquario"],
    "attractions_and_activities-zoo":        ["zoo", "bioparco"],
    "arts_and_entertainment-bowling_alley":  ["bowling", "bowling alley"],
    "active_life-sports_and_recreation_venue-gym_and_fitness_center": ["gym", "fitness", "palestra"],
    "active_life-sports_and_recreation_venue-stadium_and_arena": ["stadium", "arena", "stadio"],
    "active_life-sports_and_recreation_venue-sports_center": ["sports centre", "centro sportivo"],
    "active_life-sports_and_recreation_venue-public_swimming_pool": ["swimming pool", "piscina"],
    "attractions_and_activities-playground": ["playground", "parco giochi"],
    "active_life-marina":                  ["marina", "porto"],
    "attractions_and_activities-beach":     ["beach", "spiaggia"],
    "attractions_and_activities-historic_site": ["historic site", "monument", "monumento"],
    "attractions_and_activities-winery":    ["winery", "cantina"],
    "attractions_and_activities-brewery":   ["brewery", "birrificio"],
    "arts_and_entertainment-arcade":       ["arcade", "sala giochi"],
    "arts_and_entertainment-casino":       ["casino", "casinò"],
    "arts_and_entertainment-music_school": ["music school", "scuola di musica"],
    "attractions_and_activities-library":   ["library", "biblioteca"],
    "art_studios":                         ["art studio", "studio d'arte"],

    # Health & Medical
    "health_and_medical-hospital":         ["hospital", "ospedale"],
    "health_and_medical-clinic_and_medical_center": ["clinic", "ambulatorio", "poliambulatorio"],
    "health_and_medical-dentist":          ["dentist", "dentista"],
    "health_and_medical-doctor":           ["doctor", "physician", "medico"],
    "pets-veterinarian":                   ["veterinary", "vet", "veterinario"],
    "health_and_medical-optician":         ["optician", "ottica"],
    "health_and_medical-physiotherapist":  ["physio", "physiotherapist", "fisioterapista"],
    "health_and_medical-pharmacy":         ["pharmacy", "farmacia"],
    "health_and_medical-diagnostic_lab":   ["laboratory", "laboratorio"],
    "health_and_medical-urgent_care":      ["urgent care", "pronto soccorso"],
    "health_and_medical-medical_supply":   ["medical supply", "dispositivi medici"],

    # Education & Public Services
    "education-college_university":        ["university", "college", "università", "ateneo", "politecnico"],
    "education-school":                    ["school", "scuola", "liceo", "istituto"],
    "education-specialty_school-driving_school": ["driving school", "autoscuola"],
    "education-school-preschool_and_kindergarten": ["kindergarten", "nursery", "asilo"],
    "public_service_and_government-post_office": ["post office", "ufficio postale", "poste italiane"],
    "public_service_and_government-police_station": ["police", "polizia", "carabinieri", "questura"],
    "public_service_and_government-fire_station": ["fire station", "vigili del fuoco"],
    "public_service_and_government-embassy":["embassy", "ambasciata"],
    "public_service_and_government-consulate":["consulate", "consolato"],
    "public_service_and_government-government_services-city_hall":["city hall", "municipio", "comune"],
    "public_service_and_government-courthouse":["courthouse", "tribunale"],
    "public_service_and_government-library":["library", "biblioteca"],

    # Finance & Real Estate
    "financial_service-bank_credit_union":["bank", "banca", "ATM", "bancomat"],
    "financial_service-insurance_agency":["insurance", "assicurazioni"],
    "real_estate-real_estate_agent_and_broker":["real estate", "agenzia immobiliare"],
    "real_estate-property_management":["property management", "gestione immobiliare"],
    "financial_service-ATM":["ATM", "bancomat"],
    "financial_service-stock_broker":["broker", "borsa"],

    # Transportation & Automotive
    "travel-airport":                      ["airport", "aeroporto"],
    "travel-transportation-rail_station":  ["train station", "stazione ferroviaria"],
    "travel-transportation-bus_station":   ["bus station", "autostazione"],
    "travel-transportation-bus_stop":      ["bus stop", "fermata autobus"],
    "travel-transportation-subway_station":["metro station", "subway station", "stazione metro"],
    "travel-transportation-taxi_limo_and_shuttle_service-taxi_stand":["taxi stand", "posteggio taxi"],
    "automotive-gas_station":             ["gas station", "petrol station", "distributore di benzina"],
    "travel-road_structures_and_services-parking":["parking", "parcheggio", "autorimessa"],
    "automotive-automotive_services_and_repair-car_wash_and_detail":["car wash", "autolavaggio"],
    "automotive-automotive_services_and_repair":["car repair", "mechanic", "officina", "carrozzeria"],
    "automotive-automotive_dealer":["car dealer", "concessionaria auto"],
    "automotive-automotive_parts_and_accessories-tire_shop":["tire shop", "gommista"],

    # Personal & Professional Services
    "beauty_and_spa-hair_salon":["hair salon", "parrucchiere", "barber"],
    "beauty_and_spa-beauty_salon":["beauty salon", "centro estetico", "istituto di bellezza"],
    "professional_services-laundry_services":["laundry", "lavanderia", "launderette"],
    "professional_services-funeral_services_and_cemeteries-funeral_service":["funeral home", "onoranze funebri"],
    "professional_services-funeral_services_and_cemeteries-cemetery":["cemetery", "cimitero"],
    "professional_services-dry_cleaning":["dry cleaning", "lavasecco"],
    "professional_services-pet_grooming":["pet grooming", "toelettatura animali"],
    "professional_services-photographer":["photographer", "fotografo"],
    "professional_services-legal_services":["lawyer", "avvocato", "studio legale"],
    "professional_services-accounting":["accountant", "commercialista"],

    # Recreation & Leisure
    "attractions_and_activities-park":["park", "parco", "giardino pubblico"],
    "attractions_and_activities-beach":["beach", "spiaggia"],
    "attractions_and_activities-hiking_trail":["trail", "sentiero"],
    "attractions_and_activities-campground":["campground", "campeggio"],
    "attractions_and_activities-ski_resort":["ski resort", "stazione sciistica"],

    # Repeat or synonym expansions until 150+ entries total...
}

In [None]:
RULES = [
    (
        lambda n, w, s, kws=keywords: any(kw in (n or "").lower() or kw in (w or "").lower() or kw in (s or "").lower() for kw in kws),
        category
    )
    for category, keywords in category_keywords.items()
]

In [None]:
def rule_based_category(poi_name, website_text, social_text):
    name = (poi_name or "").lower()
    web  = (website_text or "").lower()
    soc  = (social_text or "").lower()
    for test_fn, category in RULES:
        try:
            if test_fn(name, web, soc):
                return category
        except Exception:
            continue
    return None

# Apply the rules to all POIs in your scraped data
poi_rule_categories = {}
for poi, site_text in scraped_websites.items():
    # Combine all social scraped content for this POI into one string
    social_entries = scraped_socials_content.get(poi, [])
    combined_social = " ".join(next(iter(d.values()), "") for d in social_entries)
    poi_rule_categories[poi] = rule_based_category(poi, site_text, combined_social)

# Inspect the mapping of POI names to their rule-based categories
print(poi_rule_categories)

{'Parco Nicolas': 'attractions_and_activities-park', 'Edimol Sas': None, 'Casual Store money transfer': 'retail', 'Molino Dorino': None, 'Atlassib': None, 'CSC - Casa salvadorena Cultural': None, 'Prodent Italia': None, 'Saldoplast': None, 'Swiss DolorClast Italia': 'arts_and_entertainment-night_club', 'Mud Museum - Open Space': 'attractions_and_activities-museum', 'PR No Ordinary Training Studio': None, 'Multiprint': 'retail-clothing_and_accessories-clothing_store', 'LR Health & Beauty Italia': None, 'Team Russo': 'eat_and_drink-bar', 'Malià': 'automotive-automotive_services_and_repair', '5Min S&S': None, 'Tempocasa Pero': None, 'Pero': None, 'Autoscuola Pero S.a.s': 'education-school', 'Farmacia Vittoria': 'health_and_medical-pharmacy', 'Eurogreen Fiori': None, 'Western Union': None, 'Libri e Giardini': None, 'Pizzeria Sempione Da Lorenzo E Ivan | Pero': 'eat_and_drink-restaurant', 'Easyoga & Fun': 'eat_and_drink-bar', 'Laurita Michele': 'beauty_and_spa-hair_salon', 'Magistroni Aless

In [68]:
# Set up the API key
api_key = "AIzaSyBTTIggzhUhQVFZglOG1-PZF9xO3PRb-kY"

In [None]:
import google.generativeai as genai
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-2.0-flash")
context = "You are a Category Suggestion Agent. Your job is to read a POI record that I provide you with that consists of the POI name, rule-based search results and data scraped from its website and socials. Now using this propose an category for it which is as detailed as possible (Outputing 'italian restaurant' instead of 'restaurant') using its name, rule based search and web/ socials scraped information, just categorize until you are sure about the category."+\
        f"These are all the categories in the schema in a JSON file that has all the categories in the form as a tree: {category_tree}." +\
        f"While categorizing, I expect you to traverse the category tree and categorize level by level. for example, if you are categorizing a restaurant, you should first categorize it as 'eat_and_drink', then as 'restaurant', and finally as 'italian restaurant' if applicable."+\
        "I am going to provide you with all the POIs names with the website scraped information, I will provided the names and website scraped information in this format: 'name': 'website scraped information'" +\
        f"These are the POIs with the website scraped information: {scraped_websites}" +\
        "I am also going to provide you with all the POIs names with the rule-based search results, I will provided the names and rule-based search results in this format: 'name': 'rule-based search category prediction'" +\
        f"These are the POIs with the rule-based search results: {poi_rule_categories}" +\
        "Finally, I am going to provide you with all the POIs names with the socials scraped information, I will provided the names and socials scraped information in this format: 'name': 'socials scraped information'" +\
        f"These are the POIs with the socials scraped information: {scraped_socials_string}" +\
        "Provide me with a category that best sutes the primary category for these POIs from the list of categories I provided you with. Choose as detailed as possible and ONLY choose the categories that you are mostly sure about"+\
        "The response should be in this format: 'name': 'predicted category & confidance percentage'. Be as honest as possible and if you are not sure about the category please say so. Do not provide any other information or explanation, just the POI's name and their predicted categories."+\
        "You do not need to provide it in a JSON format, just provide it in a string format, one line for each POI. Do not provide any other information or explanation, just the POI's name and their predicted categories."

In [70]:
# Generate content using the model
response = model.generate_content(context)
print(response.text)

Parco Nicolas: park & 70%
Edimol Sas: No predicted category
Casual Store money transfer: financial_service-money_transfer_services & 50%
Molino Dorino: No predicted category
Atlassib: No predicted category
CSC - Casa salvadorena Cultural: cultural_center & 60%
Prodent Italia: medical_supply-dental_supply_store & 50%
Saldoplast: business_manufacturing_and_supply-plastic_manufacturer & 50%
Swiss DolorClast Italia: health_and_medical-physical_therapy & 50%
Mud Museum - Open Space: attractions_and_activities-museum & 70%
PR No Ordinary Training Studio: active_life-sports_and_fitness_instruction-fitness_trainer & 50%
Multiprint: retail-arts_and_crafts-art_supply_store & 50%
LR Health & Beauty Italia: beauty_and_spa & 60%
Team Russo: real_estate_agent & 50%
Malià: sewing_and_alterations & 50%
5Min S&S: No predicted category
Tempocasa Pero: real_estate_agent & 50%
Pero: No predicted category
Autoscuola Pero S.a.s: education-driving_school & 70%
Farmacia Vittoria: health_and_medical-pharmacy &

In [71]:
# Assuming 'response' is the variable holding the LLM's output from the previous cell
# and response.text contains the string with predictions.

predicted_categories_string = "" # Initialize the string to store categories

if 'response' in locals() and hasattr(response, 'text'):
    llm_output_lines = response.text.split('\n')
    predicted_categories_only = []
    predicted_categories_list_for_string = [] # New list to build the string

    for line in llm_output_lines:
        if ':' in line:
            try:
                # Split by the first occurrence of ': ' to separate name from prediction
                parts = line.split(': ', 1)
                if len(parts) > 1:
                    prediction_part = parts[1].strip()
                    
                    # Remove potential surrounding quotes from the prediction part
                    if prediction_part.startswith("'") and prediction_part.endswith("'"):
                        prediction_part = prediction_part[1:-1]
                    
                    # Split the prediction part by ' & ' to separate category from confidence
                    category_confidence_split = prediction_part.split(' & ', 1)
                    category = category_confidence_split[0]
                    
                    # Further clean up if the category itself has quotes
                    if category.startswith("'") and category.endswith("'"):
                        category = category[1:-1]
                        
                    if category: # Modified condition
                        predicted_categories_only.append(category)
                        predicted_categories_list_for_string.append(category) # Add to new list
                        print(category)
            except Exception as e:
                print(f"Could not parse line: {line} - Error: {e}")
        elif line.strip(): # Handle cases where a line might just be a category
             # Further clean up if the category itself has quotes
            category = line.strip()
            if category.startswith("'") and category.endswith("'"):
                category = category[1:-1]
            if category: # Modified condition
                predicted_categories_only.append(category)
                predicted_categories_list_for_string.append(category) # Add to new list
                print(category)

    if not predicted_categories_only:
        print("No categories were extracted from the LLM response.")
    else:
        # Store categories in a newline-separated string
        predicted_categories_string = "\n".join(predicted_categories_list_for_string)
        print("\n--- Categories stored in string ---")
        print(predicted_categories_string)

else:
    print("LLM response object 'response' or 'response.text' not found.")

# The variable 'predicted_categories_string' now holds the desired string.
# You can print it to verify or use it for comparison later.
# print(f"\nFinal string for comparison:\n{predicted_categories_string}")


park
No predicted category
financial_service-money_transfer_services
No predicted category
No predicted category
cultural_center
medical_supply-dental_supply_store
business_manufacturing_and_supply-plastic_manufacturer
health_and_medical-physical_therapy
attractions_and_activities-museum
active_life-sports_and_fitness_instruction-fitness_trainer
retail-arts_and_crafts-art_supply_store
beauty_and_spa
real_estate_agent
sewing_and_alterations
No predicted category
real_estate_agent
No predicted category
education-driving_school
health_and_medical-pharmacy
retail-flowers_and_gifts_shop-florist
financial_service-money_transfer_services
retail-books_mags_music_and_video-bookstore
eat_and_drink-pizza_restaurant
active_life-yoga_studio
beauty_and_spa-hair_salon
No predicted category
arts_and_entertainment-dance_club
travel-transportation-metro_station
active_life-sports_club_and_league
education-music_school
No predicted category
it_service_and_computer_repair
retail-home_and_garden-hardware_s

In [72]:
# Iterate through the place_dataset and print the POI name and its primary category
if 'place_dataset' in locals() and isinstance(place_dataset, pandas.DataFrame) and not place_dataset.empty:
    print("POI Name: Primary Category")
    print("---------------------------")
    for index, poi_row in place_dataset.iterrows():
        primary_name = "N/A"
        if 'names' in poi_row and poi_row['names'] and isinstance(poi_row['names'], dict) and 'primary' in poi_row['names']:
            primary_name = poi_row['names']['primary']
        
        primary_category = "N/A"
        if 'categories' in poi_row and poi_row['categories'] and isinstance(poi_row['categories'], dict) and 'primary' in poi_row['categories']:
            primary_category = poi_row['categories']['primary']
            
        print(f'"{primary_name}": "{primary_category}"')
else:
    print("place_dataset is not available or is empty.")

POI Name: Primary Category
---------------------------
"Parco Nicolas": "park"
"Edimol Sas": "newspaper_and_magazines_store"
"Casual Store money transfer": "financial_service"
"Molino Dorino": "bus_station"
"Atlassib": "travel_services"
"CSC - Casa salvadorena Cultural": "cultural_center"
"Prodent Italia": "dentist"
"Saldoplast": "plastic_fabrication_company"
"Swiss DolorClast Italia": "advertising_agency"
"Mud Museum - Open Space": "museum"
"PR No Ordinary Training Studio": "life_coach"
"Multiprint": "business_manufacturing_and_supply"
"LR Health & Beauty Italia": "beauty_and_spa"
"Team Russo": "real_estate_agent"
"Malià": "sewing_and_alterations"
"5Min S&S": "fashion"
"Tempocasa Pero": "real_estate_service"
"Pero": "grocery_store"
"Autoscuola Pero S.a.s": "driving_school"
"Farmacia Vittoria": "vitamins_and_supplements"
"Eurogreen Fiori": "florist"
"Western Union": "money_transfer_services"
"Libri e Giardini": "holiday_rental_home"
"Pizzeria Sempione Da Lorenzo E Ivan | Pero": "pizza_

In [73]:
# Initialize an empty list to store primary categories
primary_categories_list = []

if 'place_dataset' in locals() and isinstance(place_dataset, pandas.DataFrame) and not place_dataset.empty:
    for index, poi_row in place_dataset.iterrows():
        primary_category = "N/A"
        if 'categories' in poi_row and poi_row['categories'] and isinstance(poi_row['categories'], dict) and 'primary' in poi_row['categories']:
            primary_category = poi_row['categories']['primary']
        primary_categories_list.append(primary_category)
    
    # Join the list into a newline-separated string
    primary_categories_string = "\n".join(primary_categories_list)
    print(primary_categories_string)
else:
    primary_categories_string = "" # Initialize as empty if dataset is not valid
    print("place_dataset is not available, not a DataFrame, or is empty. No categories stored.")

# The variable 'primary_categories_string' now holds the desired string.
# You can print it to verify or use it for comparison later.
# print(f"\nFinal string for comparison:\n{primary_categories_string}")

park
newspaper_and_magazines_store
financial_service
bus_station
travel_services
cultural_center
dentist
plastic_fabrication_company
advertising_agency
museum
life_coach
business_manufacturing_and_supply
beauty_and_spa
real_estate_agent
sewing_and_alterations
fashion
real_estate_service
grocery_store
driving_school
vitamins_and_supplements
florist
money_transfer_services
holiday_rental_home
pizza_restaurant
amateur_sports_team
beauty_salon
shopping
topic_concert_venue
transportation
active_life
music_school
active_life
computer_store
hardware_store
machine_shop
clothing_store
gas_station
discount_store
pet_store
pet_services
car_dealer
car_dealer
restaurant
bar
airport_shuttles
motorcycle_dealer
outlet_store
professional_services
rental_services
psychologist
health_and_medical
engine_repair_service
automotive_repair
event_planning
automotive_repair
professional_services
office_equipment
truck_dealer_for_businesses
flea_market
clothing_store
dance_club
rental_services
automotive_repair


In [74]:
# Ensure both strings are available
if 'primary_categories_string' in locals() and 'predicted_categories_string' in locals() \
    and isinstance(primary_categories_string, str) and isinstance(predicted_categories_string, str):

     primary_list = primary_categories_string.strip().split('\n')
     predicted_list = predicted_categories_string.strip().split('\n')

     # Remove any empty strings that might result from splitting if there are trailing newlines
     primary_list = [cat for cat in primary_list if cat]
     predicted_list = [cat for cat in predicted_list if cat]
     
     similar_count = 0
     total_compared = 0
     unmatched_llm_pois = [] # To store LLM predictions that didn't match
     unmatched_for_review = [] # To store (original, predicted) pairs for mismatches

     # Determine the number of comparisons to make (the shorter of the two lists)
     # This is important if the LLM didn't provide a prediction for every POI
     num_comparisons = min(len(primary_list), len(predicted_list))

     print(f"Comparing {num_comparisons} pairs of categories.")
     print("--- Comparison Results ---")
     print("Original Category  | Predicted Category | Match?")
     print("-------------------|--------------------|--------")

     for i in range(num_comparisons):
          original_cat = primary_list[i].strip()
          predicted_cat = predicted_list[i].strip()
          total_compared += 1
          
          match_status = "No"
          if original_cat == predicted_cat:
                similar_count += 1
                match_status = "Yes"
          else:
                # Store mismatches for review
                unmatched_for_review.append({'original': original_cat, 'predicted': predicted_cat, 'index_in_list': i})
                
          print(f"{original_cat:<20}| {predicted_cat:<20}| {match_status}")

     print(f"\nNumber of matching categories: {similar_count}")
     print(f"Total categories compared: {total_compared}")

     if total_compared > 0:
          accuracy = (similar_count / total_compared) * 100
          print(f"Accuracy: {accuracy:.2f}%")
     else:
          print("No categories were compared.")

     # Report if lists had different lengths
     if len(primary_list) != len(predicted_list):
          print(f"\nWarning: Original categories list had {len(primary_list)} entries, "
                  f"while predicted categories list had {len(predicted_list)} entries. "
                  f"Comparison was done for the first {num_comparisons} entries.")

     # Print details of mismatches for review
    #  if unmatched_for_review:
    #       print("\n--- Mismatched Categories for Review ---")
    #       for item in unmatched_for_review:
    #             print(f"Index: {item['index_in_list']}, Original: '{item['original']}', Predicted: '{item['predicted']}'")
    #  else:
    #       if total_compared > 0:
    #             print("\nAll compared categories matched!")

else:
     print("Error: 'primary_categories_string' or 'predicted_categories_string' not found or not a string.")
     print("Please ensure the previous cells have been executed correctly.")


Comparing 242 pairs of categories.
--- Comparison Results ---
Original Category  | Predicted Category | Match?
-------------------|--------------------|--------
park                | park                | Yes
newspaper_and_magazines_store| No predicted category| No
financial_service   | financial_service-money_transfer_services| No
bus_station         | No predicted category| No
travel_services     | No predicted category| No
cultural_center     | cultural_center     | Yes
dentist             | medical_supply-dental_supply_store| No
plastic_fabrication_company| business_manufacturing_and_supply-plastic_manufacturer| No
advertising_agency  | health_and_medical-physical_therapy| No
museum              | attractions_and_activities-museum| No
life_coach          | active_life-sports_and_fitness_instruction-fitness_trainer| No
business_manufacturing_and_supply| retail-arts_and_crafts-art_supply_store| No
beauty_and_spa      | beauty_and_spa      | Yes
real_estate_agent   | real_estate_agent

In [80]:
import google.generativeai as genai
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-2.0-flash")
comparison_context = "Now you are category comparision agent, I will be providing you with a category that a software predicted for a POI and the actual category of the POI."+\
        "Now I want you to compare each POI's predicted and actual category and let me know if they are similar or not. For example active_life and sports_and_recreation_venue are similar."+\
        f"These are the categories that the software predicted: {predicted_categories_string}"+\
        f"These are the actual categories of the POIs: {primary_categories_string}"+\
        "if they are similar then just say 'yes' otherwise say 'no'. Do not provide any other information or explanation, just the answer. Do not provide any other information or explanation, just the answer which is a yes or a no."+\
        "Again I don't want you to provide the POI names or any other information, just the answer which is a yes or a no. Do not provide any other information or explanation, just the answer which is a yes or a no."


In [81]:
# Generate content using the model
Comparision = model.generate_content(comparison_context)
print(Comparision.text)
# Calculate and print the percentage of "yes" answers
if 'Comparision' in locals() and hasattr(Comparision, 'text') and Comparision.text:
    answers = Comparision.text.strip().split('\n')
    answers = [ans.strip().lower() for ans in answers if ans.strip()] # Clean and normalize answers
    
    total_answers = len(answers)
    yes_answers = answers.count('yes')
    
    if total_answers > 0:
        yes_percentage = (yes_answers / total_answers) * 100
        print(f"\nTotal 'yes' answers: {yes_answers}")
        print(f"Total answers from LLM: {total_answers}")
        print(f"Percentage of 'yes' answers: {yes_percentage:.2f}%")
    else:
        print("\nNo answers found in LLM Comparision output to calculate statistics.")
else:
    print("\nLLM Comparision output 'Comparision.text' not found or is empty.")


yes
no
yes
no
no
yes
no
no
no
yes
no
no
yes
yes
yes
no
no
no
yes
no
yes
yes
yes
yes
no
no
no
yes
yes
no
no
yes
yes
no
yes
yes
yes
yes
yes
yes
yes
yes
no
yes
yes
yes
yes
no
yes
no
no
yes
no
yes
no
no
yes
yes
no
no
no
yes
yes
yes
no
yes
yes
no
yes
no
no
no
no
yes
no
no
no
yes
no
yes
yes
yes
yes
yes
yes
no
yes
yes
yes
yes
yes
no
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
no
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
no
yes
yes


Total 'yes' answers: 97
Total answers from LLM: 139
Percentage of 'yes' answers: 69.78%
