In [None]:
%pip install overturemaps lonboard geopandas shapely dspy
%pip install google.generativeai


In [None]:
%cd Helper-functions
%run category-tree.py
%cd ..

In [None]:
import overturemaps as om
from overturemaps import core
import overturemaps
import pandas
import geopandas as gpd
from shapely import wkb
from lonboard import Map, PolygonLayer, ScatterplotLayer
import ipywidgets as widgets
import numpy as np
from IPython.display import display
import json
import requests
from bs4 import BeautifulSoup
import time
import concurrent.futures
import tqdm
import dspy
import os

In [None]:
# Load the category hierarchy used by the classification helpers below.
# NOTE(review): presumably this file is produced by Helper-functions/category-tree.py — confirm.
# JSON text is UTF-8, so decode it explicitly instead of relying on the platform's
# default encoding (which differs between operating systems/locales).
with open('Data/category_tree.json', 'r', encoding='utf-8') as f:
    category_tree = json.load(f)

In [None]:
def create_map(dataset, view_bbox=None):
    """Build a lonboard ``Map`` that draws ``dataset`` as red scatter points.

    Parameters
    ----------
    dataset
        GeoDataFrame handed to ``ScatterplotLayer.from_geopandas``.
    view_bbox : sequence of 4 numbers, optional
        Bounding box used to centre the initial view. Indices 0 and 2 are
        longitudes, indices 1 and 3 are latitudes. When omitted, the
        notebook-level ``bbox`` variable is used, which keeps the original
        behaviour for existing callers.

    Returns
    -------
    Map
        Map containing the single scatter layer, centred on the midpoint of
        the bounding box.
    """
    # Resolve the box at call time so the notebook-level ``bbox`` may be defined
    # in a later cell than this function, exactly as before.
    bounds = bbox if view_bbox is None else view_bbox

    layer = ScatterplotLayer.from_geopandas(
        dataset,
        get_fill_color=[255, 0, 0],
        radius_min_pixels=5,
    )

    view_state = {
        "longitude": (bounds[0] + bounds[2]) / 2,
        "latitude": (bounds[1] + bounds[3]) / 2,
        "zoom": 8,
        "pitch": 45,
    }
    return Map(layer, view_state=view_state)

In [None]:
def get_most_detailed_category(current_category, poi_metadata, tree):
    """Walk down ``tree`` to the deepest category mentioned in ``poi_metadata``.

    Parameters
    ----------
    current_category : str
        Category to start from.
    poi_metadata : str or None
        Free text describing the POI (e.g. name/description). ``None`` or an
        empty string means there is nothing to match against.
    tree : mapping
        Maps a category to an iterable of its sub-category names.

    Returns
    -------
    str
        The most specific category reached. ``current_category`` itself is
        returned when it has no sub-categories or none of them match.

    Notes
    -----
    Matching is a case-insensitive substring search of the sub-category name
    (with ``-`` replaced by a space) inside the metadata. The first matching
    sub-category, in iteration order, is followed.
    """
    # Base case: no further sub-categories.
    if current_category not in tree or not tree[current_category]:
        return current_category

    # Nothing to search in: stay at the current level instead of failing on None.
    if not poi_metadata:
        return current_category

    # Lowercase once; it does not change between sub-categories.
    haystack = poi_metadata.lower()

    for sub_cat in tree[current_category]:
        # Lowercase the needle as well, otherwise mixed-case names never match.
        needle = sub_cat.replace('-', ' ').lower()
        # An empty needle is a substring of everything; never treat it as a match.
        if needle and needle in haystack:
            return get_most_detailed_category(sub_cat, poi_metadata, tree)

    # If no match found, return current category.
    return current_category

In [None]:
def get_first_website(websites_list):
    """Return the first website URL of a POI, or ``""`` when there is none.

    Parameters
    ----------
    websites_list : list, tuple, numpy.ndarray or None
        The POI's websites. Anything else (including a bare string) is treated
        as "no website".

    Returns
    -------
    str
        The first element when it is a non-empty string, otherwise ``""``.
    """
    # Check the type explicitly rather than relying on truthiness: ``bool()`` of a
    # NumPy array with more than one element raises ValueError.
    if not isinstance(websites_list, (list, tuple, np.ndarray)):
        return ""
    # A 0-d array has no length and cannot be indexed with [0].
    if isinstance(websites_list, np.ndarray) and websites_list.ndim == 0:
        return ""
    if len(websites_list) == 0:
        return ""

    first = websites_list[0]
    if isinstance(first, str) and first:
        return first
    return ""

In [None]:
def scrape_website(url, timeout = 5):
    """Fetch ``url`` and summarise its title, meta description and <h1> headings.

    Parameters
    ----------
    url : str
        Address to request with ``requests.get``.
    timeout : int or float, optional
        Timeout in seconds passed to ``requests.get`` (default 5).

    Returns
    -------
    str
        ``"Title: .... Meta Description: .... Heading: ..."`` on success, with
        multiple <h1> texts joined by ``;``. On any failure (network error,
        non-2xx status, parsing problem) a message is printed and ``""`` is
        returned; the function never raises.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
        soup = BeautifulSoup(response.text, 'html.parser')

        # ``Tag.string`` is None when <title> is empty or contains nested markup,
        # which used to yield "Title: None". get_text() handles both cases.
        title = soup.title.get_text(strip=True) if soup.title else ''
        if not title:
            title = 'No title found'

        meta_tag = soup.find('meta', attrs={'name': 'description'})
        meta_desc = meta_tag.get('content', '') if meta_tag else ""

        h1_texts = [tag.get_text(strip=True) for tag in soup.find_all('h1')]

        scraped_text = f"Title: {title}. Meta Description: {meta_desc}. Heading: {';'.join(h1_texts)}"

        return scraped_text

    except Exception as e:
        # Deliberately best-effort: one unreachable site must not abort a bulk scrape.
        print(f"Failed to Scrape: {url}: {e}")
        return ""

In [None]:
# Bounding box of the study area. Positions 0 and 2 are longitudes and
# positions 1 and 3 are latitudes (that is how create_map reads it).
bbox = (9.0894, 45.4942, 9.2594, 45.5542)

In [None]:
# Load the Overture "place" features restricted to bbox via overturemaps' core helper.
# NOTE(review): presumably this downloads from the Overture release and needs network access — confirm.
place_dataset = core.geodataframe("place", bbox=bbox)
# Quick size check: (number of rows, number of columns).
print(place_dataset.shape)

In [None]:
# Read the categories.txt file. The first line is skipped
# (presumably a header row — confirm against the data file).
with open('Data/categories.txt', 'r', encoding='utf-8') as file:
    categories_data = file.readlines()[1:]

# Keep the part before the first semicolon of every line. Whitespace is
# stripped and blank lines are skipped, so a trailing empty line or a line
# without a semicolon cannot produce '' or newline-terminated entries.
categories_list = [
    category
    for category in (entry.split(';')[0].strip() for entry in categories_data)
    if category
]

In [None]:
# Pulling out the places and websites.
# The two columns are iterated together by position. The previous
# `names[i]` / `websites[i]` lookups were label-based and only worked while the
# frame's index was exactly 0..n-1.
# Note: the dict is keyed by primary name, so places sharing a name overwrite
# each other (unchanged from the original behaviour).
websites = {}
number = 0
for place_names, place_sites in zip(place_dataset.names, place_dataset.websites):
    number += 1
    websites[place_names['primary']] = place_sites

# Save all websites as a " , "-separated string (first website per place).
websites_str = " , ".join(
    site[0] if site is not None and len(site) > 0 else "No website Found"
    for site in websites.values()
)

# Number of rows processed.
print(number)

In [None]:
# Pulling out the places' social media links.
# Columns are iterated by position (no dependence on a 0..n-1 index). The value
# stored per place is whatever the `socials` column holds: possibly None, a
# list / NumPy array of URLs, or another structure.
socials_dict = {}
for place_names, social_links in zip(place_dataset.names, place_dataset.socials):
    socials_dict[place_names['primary']] = social_links

# Human-readable "place: [links]" strings, one per place.
socials_str_list = []
for place_name, social_links_list in socials_dict.items():
    # Decide "empty" via None / len(): `if array:` raises ValueError for a NumPy
    # array holding more than one element.
    has_no_links = social_links_list is None or (
        hasattr(social_links_list, '__len__') and len(social_links_list) == 0
    )
    if has_no_links:
        socials_str_list.append(f"{place_name}: No socials found")
        continue

    # Sequence of URL strings (list, tuple or NumPy array).
    if isinstance(social_links_list, (list, tuple, np.ndarray)) and all(
        isinstance(link, str) for link in social_links_list
    ):
        links_str = ", ".join(social_links_list)
    # Dictionary such as {'facebook': 'url', 'instagram': 'url'}.
    elif isinstance(social_links_list, dict):
        links_str = ", ".join(f"{platform}: {url}" for platform, url in social_links_list.items())
    # Any other structure: fall back to its string representation.
    else:
        links_str = str(social_links_list)
    socials_str_list.append(f"{place_name}: [{links_str}]")

In [None]:
# Upper bound on concurrent scraper threads. The pool previously allowed 3000
# workers, which can exhaust OS threads/sockets and floods remote hosts; a few
# dozen is plenty for I/O-bound requests with a short timeout.
MAX_SOCIAL_SCRAPE_WORKERS = 32

scraped_socials_content = {}
tasks_for_social_scraping = []

# Prepare tasks for scraping: one (place_name, url) pair per http(s) social link.
for place_name, social_links_list in socials_dict.items():
    if social_links_list is None:
        continue
    if isinstance(social_links_list, str):
        # Handle the case where the entry is a single URL string.
        candidate_urls = [social_links_list]
    elif isinstance(social_links_list, (list, np.ndarray)):
        candidate_urls = social_links_list
    else:
        candidate_urls = []
    for social_url in candidate_urls:
        if isinstance(social_url, str) and social_url.startswith(('http://', 'https://')):
            tasks_for_social_scraping.append((place_name, social_url))

total_social_urls = len(tasks_for_social_scraping)
scraped_social_urls_count = 0
progress_step_social = max(1, total_social_urls // 20) if total_social_urls > 0 else 1


def scrape_social_url_wrapper(args):
    """Scrape one social URL.

    Parameters
    ----------
    args : tuple
        ``(place_name, url)``.

    Returns
    -------
    tuple
        ``(place_name, url, content)`` where ``content`` is the text returned by
        ``scrape_website`` or ``"Scraping Failed or No Content"`` when that text
        is empty.
    """
    place_name, url = args
    scraped_content = scrape_website(url)  # Using the existing scrape_website function
    return place_name, url, scraped_content if scraped_content else "Scraping Failed or No Content"


print(f"Found {total_social_urls} social URLs to scrape.")

# Use ThreadPoolExecutor for concurrent scraping. Progress is counted here, in
# the main thread, while results are consumed; worker threads no longer mutate
# a shared global counter without synchronisation.
social_results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_SOCIAL_SCRAPE_WORKERS) as executor:
    for social_result in executor.map(scrape_social_url_wrapper, tasks_for_social_scraping):
        social_results.append(social_result)
        scraped_social_urls_count += 1
        if scraped_social_urls_count % progress_step_social == 0 or scraped_social_urls_count == total_social_urls:
            percent = (scraped_social_urls_count / total_social_urls) * 100
            print(f"Scraped {scraped_social_urls_count}/{total_social_urls} ({percent:.1f}%) social URLs.")

# Store the results: place name -> list of {url: content} dicts.
for place_name, url, content in social_results:
    scraped_socials_content.setdefault(place_name, []).append({url: content})

print("\nScraping of social URLs complete.")
print("Scraped Social Media Content:")
# Print the scraped social media content (first few places only, for brevity).
count_printed = 0
for place, contents in scraped_socials_content.items():
    if count_printed < 10:  # Limit printing for brevity
        print(f"\nPlace: {place}")
        for content_dict in contents:
            for url_key, text_content in content_dict.items():
                print(f"  URL: {url_key}")
                print(f"  Content: {text_content[:200]}...")  # Print first 200 chars
        count_printed += 1
    else:
        print(f"\n... and {len(scraped_socials_content) - count_printed} more places.")
        break

if not scraped_socials_content:
    print("No social media content was scraped.")

# Print results as '"name": "scraped content from socials"' lines.
print("\n--- Formatted Scraped Socials Information (Name: Scraped Content from Socials) ---")
formatted_social_info_output_list = []

for place_name_key, list_of_content_dicts in scraped_socials_content.items():
    # list_of_content_dicts looks like [{'url1': 'scraped_text1'}, {'url2': 'scraped_text2'}];
    # collect the non-empty scraped text strings.
    all_scraped_texts_for_this_place = [
        scraped_text_item
        for single_url_content_dict in list_of_content_dicts
        for scraped_text_item in single_url_content_dict.values()
        if scraped_text_item and isinstance(scraped_text_item, str)
    ]

    # Concatenate all scraped texts for this place, separated by " | ".
    aggregated_social_info = " | ".join(all_scraped_texts_for_this_place)

    # Escape double quotes so each line stays a valid quoted pair.
    place_name_escaped = str(place_name_key).replace('"', '\\"')
    aggregated_social_info_escaped = aggregated_social_info.replace('"', '\\"')

    formatted_line = f'"{place_name_escaped}": "{aggregated_social_info_escaped}"'
    print(formatted_line)
    formatted_social_info_output_list.append(formatted_line)

if not formatted_social_info_output_list:
    print("No social media content was available in scraped_socials_content to format.")

In [None]:
# Join the list of formatted social info strings into a single comma-separated string.
# Each element of formatted_social_info_output_list is one '"name": "text"' line.
scraped_socials_string = ", ".join(formatted_social_info_output_list)
# Prints the text for every place at once, so this cell's output can be large.
print(scraped_socials_string)

In [None]:
def _first_site_or_placeholder(site):
    """Return the first website of ``site``, or 'No website Found' if it is None or empty."""
    if site is not None and len(site) > 0:
        return site[0]
    return 'No website Found'


# One "place: website" entry per place, joined with " , ".
place_website_str = " , ".join(
    f"{place}: {_first_site_or_placeholder(site)}"
    for place, site in websites.items()
)

In [None]:
# Upper bound on concurrent scraper threads. The pool previously allowed 3000
# workers, which can exhaust OS threads/sockets; a few dozen is plenty for
# I/O-bound requests with a short timeout.
MAX_WEBSITE_SCRAPE_WORKERS = 32

scraped_count = 0
total_websites = len(websites)
progress_step = max(1, total_websites // 20)  # 5% step


def scrape_site_wrapper(args):
    """Scrape the first website of one place.

    Parameters
    ----------
    args : tuple
        ``(place, site)`` where ``site`` is the place's websites entry
        (None, or an indexable collection of URLs).

    Returns
    -------
    tuple
        ``(place, text)``. ``text`` is the scraped summary,
        ``"Scraping Failed"`` when scraping raised or returned nothing, or
        ``"No website Found"`` when the place has no usable website.
    """
    place, site = args
    if site is None or len(site) == 0 or site[0] == "No website Found":
        return place, "No website Found"
    try:
        scraped_content = scrape_website(site[0])
    except Exception:
        return place, "Scraping Failed"
    return place, scraped_content if scraped_content else "Scraping Failed"


# Progress is counted in the main thread while results are consumed, so worker
# threads no longer mutate a shared global counter without synchronisation.
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WEBSITE_SCRAPE_WORKERS) as executor:
    for site_result in executor.map(scrape_site_wrapper, websites.items()):
        results.append(site_result)
        scraped_count += 1
        if scraped_count % progress_step == 0 or scraped_count == total_websites:
            percent = (scraped_count / total_websites) * 100
            print(f"{scraped_count}/{total_websites} ({percent:.1f}%) websites scraped")

# place name -> scraped text (or one of the placeholder strings above).
scraped_websites = dict(results)
print(scraped_websites)


In [None]:
RULES = [
    # Lodging -> accommodation
    (lambda name, websites, social_content: ("hotel" in name.lower() or "albergo" in name.lower()) or \
                                           (social_content and ("hotel" in social_content.lower() or "albergo" in social_content.lower())), "accommodation-hotel"),
    (lambda name, websites, social_content: ("motel" in name.lower()) or \
                                           (social_content and "motel" in social_content.lower()), "accommodation-motel"),
    (lambda name, websites, social_content: ("hostel" in name.lower() or "ostello" in name.lower()) or \
                                           (social_content and ("hostel" in social_content.lower() or "ostello" in social_content.lower())), "accommodation-hostel"),
    (lambda name, websites, social_content: ("resort" in name.lower()) or \
                                           (social_content and "resort" in social_content.lower()), "accommodation-resort"),
    (lambda name, websites, social_content: ("bed and breakfast" in name.lower() or "b&b" in name.lower()) or \
                                           (social_content and ("bed and breakfast" in social_content.lower() or "b&b" in social_content.lower())), "accommodation-bed_and_breakfast"),

    # Food and Drink -> eat_and_drink or retail-food
    (lambda name, websites, social_content: ("pizzeria" in name.lower()) or \
                                           (social_content and "pizzeria" in social_content.lower()), "eat_and_drink-restaurant"),
    (lambda name, websites, social_content: ("sushi" in name.lower()) or \
                                           (social_content and "sushi" in social_content.lower()), "eat_and_drink-restaurant"),
    (lambda name, websites, social_content: ("kebab" in name.lower()) or \
                                           (social_content and "kebab" in social_content.lower()), "eat_and_drink-restaurant"),
    (lambda name, websites, social_content: ("trattoria" in name.lower()) or \
                                           (social_content and "trattoria" in social_content.lower()), "eat_and_drink-restaurant"),
    (lambda name, websites, social_content: ("osteria" in name.lower()) or \
                                           (social_content and "osteria" in social_content.lower()), "eat_and_drink-restaurant"),
    (lambda name, websites, social_content: ("restaurant" in name.lower() or "ristorante" in name.lower()) or \
                                           (social_content and ("restaurant" in social_content.lower() or "ristorante" in social_content.lower())), "eat_and_drink-restaurant"),
    (lambda name, websites, social_content: ("cafe" in name.lower() or "caffè" in name.lower() or "coffee shop" in name.lower() or "caffetteria" in name.lower()) or \
                                           (social_content and ("cafe" in social_content.lower() or "caffè" in social_content.lower() or "coffee shop" in social_content.lower() or "caffetteria" in social_content.lower())), "eat_and_drink-cafe"),
    (lambda name, websites, social_content: ("bar" in name.lower() or "pub" in name.lower()) or \
                                           (social_content and ("bar" in social_content.lower() or "pub" in social_content.lower())), "eat_and_drink-bar"),
    (lambda name, websites, social_content: ("bakery" in name.lower() or "panificio" in name.lower() or "forno" in name.lower()) or \
                                           (social_content and ("bakery" in social_content.lower() or "panificio" in social_content.lower() or "forno" in social_content.lower())), "retail-food-bakery"),
    (lambda name, websites, social_content: ("ice cream" in name.lower() or "gelateria" in name.lower()) or \
                                           (social_content and ("ice cream" in social_content.lower() or "gelateria" in social_content.lower())), "retail-food-ice_cream_shop"),
    (lambda name, websites, social_content: ("pastry shop" in name.lower() or "pasticceria" in name.lower()) or \
                                           (social_content and ("pastry shop" in social_content.lower() or "pasticceria" in social_content.lower())), "retail-food-pastry_and_cake_shop"),
    (lambda name, websites, social_content: ("wine bar" in name.lower() or ("enoteca" in name.lower() and "shop" not in name.lower())) or \
                                           (social_content and ("wine bar" in social_content.lower() or ("enoteca" in social_content.lower() and "shop" not in social_content.lower()))), "eat_and_drink-bar-wine_bar"),

    # Retail/Shop -> retail
    (lambda name, websites, social_content: ("supermarket" in name.lower() or "grocery" in name.lower() or "ipermercato" in name.lower()) or \
                                           (social_content and ("supermarket" in social_content.lower() or "grocery" in social_content.lower() or "ipermercato" in social_content.lower())), "retail-food-supermarket_and_hypermarket"),
    (lambda name, websites, social_content: ("bookstore" in name.lower() or "librairie" in name.lower() or "libreria" in name.lower()) or \
                                           (social_content and ("bookstore" in social_content.lower() or "librairie" in social_content.lower() or "libreria" in social_content.lower())), "retail-books_stationery_music_and_film-book_shop"),
    (lambda name, websites, social_content: ("clothing store" in name.lower() or "fashion" in name.lower() or "abbigliamento" in name.lower()) or \
                                           (social_content and ("clothing store" in social_content.lower() or "fashion" in social_content.lower() or "abbigliamento" in social_content.lower())), "retail-clothing_and_accessories-clothing_store"),
    (lambda name, websites, social_content: ("pharmacy" in name.lower() or "farmacia" in name.lower()) or \
                                           (social_content and ("pharmacy" in social_content.lower() or "farmacia" in social_content.lower())), "health_and_medical-pharmacy"), # Also health
    (lambda name, websites, social_content: ("electronics store" in name.lower() or "elettronica" in name.lower()) or \
                                           (social_content and ("electronics store" in social_content.lower() or "elettronica" in social_content.lower())), "retail-electronics-consumer_electronics_store"),
    (lambda name, websites, social_content: ("hardware store" in name.lower() or "ferramenta" in name.lower()) or \
                                           (social_content and ("hardware store" in social_content.lower() or "ferramenta" in social_content.lower())), "retail-home_and_garden-hardware_store"),
    (lambda name, websites, social_content: ("department store" in name.lower() or "grande magazzino" in name.lower()) or \
                                           (social_content and ("department store" in social_content.lower() or "grande magazzino" in social_content.lower())), "retail-department_store"),
    (lambda name, websites, social_content: ("mall" in name.lower() or "shopping center" in name.lower() or "centro commerciale" in name.lower()) or \
                                           (social_content and ("mall" in social_content.lower() or "shopping center" in social_content.lower() or "centro commerciale" in social_content.lower())), "retail-shopping_center_and_mall"),
    (lambda name, websites, social_content: ("flower shop" in name.lower() or "florist" in name.lower() or "fioraio" in name.lower()) or \
                                           (social_content and ("flower shop" in social_content.lower() or "florist" in social_content.lower() or "fioraio" in social_content.lower())), "retail-home_and_garden-florist"),
    (lambda name, websites, social_content: ("jewelry" in name.lower() or "gioielleria" in name.lower()) or \
                                           (social_content and ("jewelry" in social_content.lower() or "gioielleria" in social_content.lower())), "retail-clothing_and_accessories-jewelry_and_watch_store"),
    (lambda name, websites, social_content: ("shoe store" in name.lower() or "calzature" in name.lower()) or \
                                           (social_content and ("shoe store" in social_content.lower() or "calzature" in social_content.lower())), "retail-clothing_and_accessories-shoe_store"),
    (lambda name, websites, social_content: ("toy store" in name.lower() or "giocattoli" in name.lower()) or \
                                           (social_content and ("toy store" in social_content.lower() or "giocattoli" in social_content.lower())), "retail-toys_and_games_store"),
    (lambda name, websites, social_content: ("optician" in name.lower() or "ottica" in name.lower()) or \
                                           (social_content and ("optician" in social_content.lower() or "ottica" in social_content.lower())), "health_and_medical-optician"), # Also health
    (lambda name, websites, social_content: ("butcher" in name.lower() or "macelleria" in name.lower()) or \
                                           (social_content and ("butcher" in social_content.lower() or "macelleria" in social_content.lower())), "retail-food-butcher_shop"),
    (lambda name, websites, social_content: ("wine shop" in name.lower() or "enoteca" in name.lower()) or \
                                           (social_content and ("wine shop" in social_content.lower() or "enoteca" in social_content.lower())), "retail-beverage_store-wine_and_spirits_store"),
    (lambda name, websites, social_content: ("newsstand" in name.lower() or "edicola" in name.lower()) or \
                                           (social_content and ("newsstand" in social_content.lower() or "edicola" in social_content.lower())), "retail-books_stationery_music_and_film-newsagent_and_kiosk"),
    (lambda name, websites, social_content: ("tobacco shop" in name.lower() or "tabaccheria" in name.lower() or "tabacchi" in name.lower()) or \
                                           (social_content and ("tobacco shop" in social_content.lower() or "tabaccheria" in social_content.lower() or "tabacchi" in social_content.lower())), "retail-tobacconist"),
    (lambda name, websites, social_content: ("shop" in name.lower() or "store" in name.lower() or "market" in name.lower() or "boutique" in name.lower() or "emporio" in name.lower() or "negozio" in name.lower()) or \
                                           (social_content and ("shop" in social_content.lower() or "store" in social_content.lower() or "market" in social_content.lower() or "boutique" in social_content.lower() or "emporio" in social_content.lower() or "negozio" in social_content.lower())), "retail"),

    # Arts & Entertainment -> arts_and_entertainment or attractions_and_activities
    (lambda name, websites, social_content: ("gallery" in name.lower() or "galleria d'arte" in name.lower()) or \
                                           (social_content and ("gallery" in social_content.lower() or "galleria d'arte" in social_content.lower())), "attractions_and_activities-art_gallery"),
    (lambda name, websites, social_content: ("museum" in name.lower() or "museo" in name.lower() or (get_first_website(websites) and get_first_website(websites).endswith(".museum"))) or \
                                           (social_content and ("museum" in social_content.lower() or "museo" in social_content.lower())), "attractions_and_activities-museum"),
    (lambda name, websites, social_content: ("cinema" in name.lower() or "movie theater" in name.lower()) or \
                                           (social_content and ("cinema" in social_content.lower() or "movie theater" in social_content.lower())), "arts_and_entertainment-movie_theater"),
    (lambda name, websites, social_content: (("theater" in name.lower() or "teatro" in name.lower()) and "movie" not in name.lower() and "cinema" not in name.lower()) or \
                                           (social_content and (("theater" in social_content.lower() or "teatro" in social_content.lower()) and "movie" not in social_content.lower() and "cinema" not in social_content.lower())), "arts_and_entertainment-performing_arts_theater"),
    (lambda name, websites, social_content: ("music venue" in name.lower() or "concert hall" in name.lower()) or \
                                           (social_content and ("music venue" in social_content.lower() or "concert hall" in social_content.lower())), "arts_and_entertainment-topic_concert_venue"),
    (lambda name, websites, social_content: ("nightclub" in name.lower() or "disco" in name.lower() or "discoteca" in name.lower()) or \
                                           (social_content and ("nightclub" in social_content.lower() or "disco" in social_content.lower() or "discoteca" in social_content.lower())), "arts_and_entertainment-night_club"),

    # Health -> health_and_medical or pets
    (lambda name, websites, social_content: ("hospital" in name.lower() or "ospedale" in name.lower()) or \
                                           (social_content and ("hospital" in social_content.lower() or "ospedale" in social_content.lower())), "health_and_medical-hospital"),
    (lambda name, websites, social_content: ("clinic" in name.lower() or "ambulatorio" in name.lower() or "poliambulatorio" in name.lower()) or \
                                           (social_content and ("clinic" in social_content.lower() or "ambulatorio" in social_content.lower() or "poliambulatorio" in social_content.lower())), "health_and_medical-clinic_and_medical_center"),
    (lambda name, websites, social_content: ("dentist" in name.lower() or "dental" in name.lower() or "dentista" in name.lower() or "studio dentistico" in name.lower()) or \
                                           (social_content and ("dentist" in social_content.lower() or "dental" in social_content.lower() or "dentista" in social_content.lower() or "studio dentistico" in social_content.lower())), "health_and_medical-dentist"),
    (lambda name, websites, social_content: ("doctor" in name.lower() or "physician" in name.lower() or "medico" in name.lower()) or \
                                           (social_content and ("doctor" in social_content.lower() or "physician" in social_content.lower() or "medico" in social_content.lower())), "health_and_medical-doctor"),
    (lambda name, websites, social_content: ("veterinary" in name.lower() or "vet" in name.lower() or "veterinario" in name.lower()) or \
                                           (social_content and ("veterinary" in social_content.lower() or "vet" in social_content.lower() or "veterinario" in social_content.lower())), "pets-veterinarian"),

    # Education -> education or public_service_and_government
    (lambda name, websites, social_content: ("university" in name.lower() or "college" in name.lower() or "università" in name.lower() or "politecnico" in name.lower() or "ateneo" in name.lower()) or \
                                           (social_content and ("university" in social_content.lower() or "college" in social_content.lower() or "università" in social_content.lower() or "politecnico" in social_content.lower() or "ateneo" in social_content.lower())), "education-college_university"),
    (lambda name, websites, social_content: ("school" in name.lower() or "scuola" in name.lower() or "istituto" in name.lower() or "liceo" in name.lower()) or \
                                           (social_content and ("school" in social_content.lower() or "scuola" in social_content.lower() or "istituto" in social_content.lower() or "liceo" in social_content.lower())), "education-school"),
    (lambda name, websites, social_content: ("library" in name.lower() or "biblioteca" in name.lower()) or \
                                           (social_content and ("library" in social_content.lower() or "biblioteca" in social_content.lower())), "public_service_and_government-community_services-library"),
    (lambda name, websites, social_content: ("kindergarten" in name.lower() or "nursery" in name.lower() or "asilo" in name.lower() or "scuola materna" in name.lower()) or \
                                           (social_content and ("kindergarten" in social_content.lower() or "nursery" in social_content.lower() or "asilo" in social_content.lower() or "scuola materna" in social_content.lower())), "education-school-preschool_and_kindergarten"),
    (lambda name, websites, social_content: ("driving school" in name.lower() or "autoscuola" in name.lower()) or \
                                           (social_content and ("driving school" in social_content.lower() or "autoscuola" in social_content.lower())), "education-specialty_school-driving_school"),

    # Financial -> financial_service
    (lambda name, websites, social_content: ("bank" in name.lower() or "banca" in name.lower()) or \
                                           (social_content and ("bank" in social_content.lower() or "banca" in social_content.lower())), "financial_service-bank_credit_union"),
    (lambda name, websites, social_content: ("atm" in name.lower() or "bancomat" in name.lower()) or \
                                           (social_content and ("atm" in social_content.lower() or "bancomat" in social_content.lower())), "financial_service-atms"),
    (lambda name, websites, social_content: ("insurance" in name.lower() or "assicurazioni" in name.lower()) or \
                                           (social_content and ("insurance" in social_content.lower() or "assicurazioni" in social_content.lower())), "financial_service-insurance_agency"),

    # Place of Worship -> religious_organization
    (lambda name, websites, social_content: ("church" in name.lower() or "chiesa" in name.lower() or "parrocchia" in name.lower() or "duomo" in name.lower() or "cattedrale" in name.lower() or "basilica" in name.lower()) or \
                                           (social_content and ("church" in social_content.lower() or "chiesa" in social_content.lower() or "parrocchia" in social_content.lower() or "duomo" in social_content.lower() or "cattedrale" in social_content.lower() or "basilica" in social_content.lower())), "religious_organization-church_cathedral"),
    (lambda name, websites, social_content: ("mosque" in name.lower() or "masjid" in name.lower() or "moschea" in name.lower()) or \
                                           (social_content and ("mosque" in social_content.lower() or "masjid" in social_content.lower() or "moschea" in social_content.lower())), "religious_organization-mosque"),
    (lambda name, websites, social_content: ("temple" in name.lower() or "tempio" in name.lower()) or \
                                           (social_content and ("temple" in social_content.lower() or "tempio" in social_content.lower())), "religious_organization-temple"),
    (lambda name, websites, social_content: ("synagogue" in name.lower() or "sinagoga" in name.lower()) or \
                                           (social_content and ("synagogue" in social_content.lower() or "sinagoga" in social_content.lower())), "religious_organization-synagogue"),

    # Transportation -> travel or automotive
    (lambda name, websites, social_content: ("airport" in name.lower() or "aeroporto" in name.lower()) or \
                                           (social_content and ("airport" in social_content.lower() or "aeroporto" in social_content.lower())), "travel-airport"),
    (lambda name, websites, social_content: ("train station" in name.lower() or "stazione ferroviaria" in name.lower()) or \
                                           (social_content and ("train station" in social_content.lower() or "stazione ferroviaria" in social_content.lower())), "travel-transportation-rail_station"),
    (lambda name, websites, social_content: ("bus station" in name.lower() or "autostazione" in name.lower()) or \
                                           (social_content and ("bus station" in social_content.lower() or "autostazione" in social_content.lower())), "travel-transportation-bus_station"),
    (lambda name, websites, social_content: ("bus stop" in name.lower() or "fermata autobus" in name.lower()) or \
                                           (social_content and ("bus stop" in social_content.lower() or "fermata autobus" in social_content.lower())), "travel-transportation-bus_stop"),
    (lambda name, websites, social_content: ("metro station" in name.lower() or "subway station" in name.lower() or "stazione metro" in name.lower()) or \
                                           (social_content and ("metro station" in social_content.lower() or "subway station" in social_content.lower() or "stazione metro" in social_content.lower())), "travel-transportation-subway_station"),
    (lambda name, websites, social_content: ("taxi stand" in name.lower() or "posteggio taxi" in name.lower()) or \
                                           (social_content and ("taxi stand" in social_content.lower() or "posteggio taxi" in social_content.lower())), "travel-transportation-taxi_limo_and_shuttle_service-taxi_stand"),
    (lambda name, websites, social_content: ("gas station" in name.lower() or "petrol station" in name.lower() or "distributore di benzina" in name.lower() or "stazione di servizio" in name.lower()) or \
                                           (social_content and ("gas station" in social_content.lower() or "petrol station" in social_content.lower() or "distributore di benzina" in social_content.lower() or "stazione di servizio" in social_content.lower())), "automotive-gas_station"),
    (lambda name, websites, social_content: ("parking" in name.lower() or "parcheggio" in name.lower() or "autorimessa" in name.lower()) or \
                                           (social_content and ("parking" in social_content.lower() or "parcheggio" in social_content.lower() or "autorimessa" in social_content.lower())), "travel-road_structures_and_services-parking"),

    # Public Service & Government -> public_service_and_government
    (lambda name, websites, social_content: ("post office" in name.lower() or "ufficio postale" in name.lower() or "poste italiane" in name.lower()) or \
                                           (social_content and ("post office" in social_content.lower() or "ufficio postale" in social_content.lower() or "poste italiane" in social_content.lower())), "public_service_and_government-post_office"),
    (lambda name, websites, social_content: ("police" in name.lower() or "polizia" in name.lower() or "carabinieri" in name.lower() or "questura" in name.lower()) or \
                                           (social_content and ("police" in social_content.lower() or "polizia" in social_content.lower() or "carabinieri" in social_content.lower() or "questura" in social_content.lower())), "public_service_and_government-law_enforcement-police_station"),
    (lambda name, websites, social_content: ("fire station" in name.lower() or "vigili del fuoco" in name.lower()) or \
                                           (social_content and ("fire station" in social_content.lower() or "vigili del fuoco" in social_content.lower())), "public_service_and_government-law_enforcement-fire_station"),
    (lambda name, websites, social_content: ("embassy" in name.lower() or "ambasciata" in name.lower()) or \
                                           (social_content and ("embassy" in social_content.lower() or "ambasciata" in social_content.lower())), "public_service_and_government-embassy"),
    (lambda name, websites, social_content: ("consulate" in name.lower() or "consolato" in name.lower()) or \
                                           (social_content and ("consulate" in social_content.lower() or "consolato" in social_content.lower())), "public_service_and_government-government_services-consulate"),
    (lambda name, websites, social_content: ("city hall" in name.lower() or "town hall" in name.lower() or "municipio" in name.lower() or "comune" in name.lower()) or \
                                           (social_content and ("city hall" in social_content.lower() or "town hall" in social_content.lower() or "municipio" in social_content.lower() or "comune" in social_content.lower())), "public_service_and_government-government_services-city_hall"),
    (lambda name, websites, social_content: ("courthouse" in name.lower() or "tribunale" in name.lower()) or \
                                           (social_content and ("courthouse" in social_content.lower() or "tribunale" in social_content.lower())), "public_service_and_government-courthouse"),

    # Recreation & Sport -> attractions_and_activities or active_life
    (lambda name, websites, social_content: ("park" in name.lower() or "parco" in name.lower() or "giardino pubblico" in name.lower() or "giardini" in name.lower()) or \
                                           (social_content and ("park" in social_content.lower() or "parco" in social_content.lower() or "giardino pubblico" in social_content.lower() or "giardini" in social_content.lower())), "attractions_and_activities-park"),
    (lambda name, websites, social_content: ("gym" in name.lower() or "fitness" in name.lower() or "palestra" in name.lower()) or \
                                           (social_content and ("gym" in social_content.lower() or "fitness" in social_content.lower() or "palestra" in social_content.lower())), "active_life-sports_and_recreation_venue-gym_and_fitness_center"),
    (lambda name, websites, social_content: ("stadium" in name.lower() or "arena" in name.lower() or "stadio" in name.lower()) or \
                                           (social_content and ("stadium" in social_content.lower() or "arena" in social_content.lower() or "stadio" in social_content.lower())), "active_life-sports_and_recreation_venue-stadium_and_arena"),
    (lambda name, websites, social_content: ("sports centre" in name.lower() or "centro sportivo" in name.lower() or "palazzetto dello sport" in name.lower()) or \
                                           (social_content and ("sports centre" in social_content.lower() or "centro sportivo" in social_content.lower() or "palazzetto dello sport" in social_content.lower())), "active_life-sports_and_recreation_venue-sports_center"),
    (lambda name, websites, social_content: ("swimming pool" in name.lower() or "piscina" in name.lower()) or \
                                           (social_content and ("swimming pool" in social_content.lower() or "piscina" in social_content.lower())), "active_life-sports_and_recreation_venue-public_swimming_pool"),
    (lambda name, websites, social_content: ("playground" in name.lower() or "parco giochi" in name.lower()) or \
                                           (social_content and ("playground" in social_content.lower() or "parco giochi" in social_content.lower())), "attractions_and_activities-playground"),

    # Automotive -> automotive
    (lambda name, websites, social_content: ("car wash" in name.lower() or "autolavaggio" in name.lower()) or \
                                           (social_content and ("car wash" in social_content.lower() or "autolavaggio" in social_content.lower())), "automotive-automotive_services_and_repair-car_wash_and_detail"),
    (lambda name, websites, social_content: ("car repair" in name.lower() or "mechanic" in name.lower() or "officina" in name.lower() or "carrozzeria" in name.lower() or "elettrauto" in name.lower()) or \
                                           (social_content and ("car repair" in social_content.lower() or "mechanic" in social_content.lower() or "officina" in social_content.lower() or "carrozzeria" in social_content.lower() or "elettrauto" in social_content.lower())), "automotive-automotive_services_and_repair"),
    (lambda name, websites, social_content: ("car dealer" in name.lower() or "concessionaria auto" in name.lower()) or \
                                           (social_content and ("car dealer" in social_content.lower() or "concessionaria auto" in social_content.lower())), "automotive-automotive_dealer"),
    (lambda name, websites, social_content: ("tire shop" in name.lower() or "gommista" in name.lower()) or \
                                           (social_content and ("tire shop" in social_content.lower() or "gommista" in social_content.lower())), "automotive-automotive_parts_and_accessories-tire_shop"),

    # Other common POIs / Services -> beauty_and_spa, professional_services, travel, real_estate
    (lambda name, websites, social_content: (("hair salon" in name.lower() or "barber" in name.lower() or "parrucchiere" in name.lower() or "salone di bellezza" in name.lower()) and "estetico" not in name.lower()) or \
                                           (social_content and (("hair salon" in social_content.lower() or "barber" in social_content.lower() or "parrucchiere" in social_content.lower() or "salone di bellezza" in social_content.lower()) and "estetico" not in social_content.lower())), "beauty_and_spa-hair_salon"),
    # Beauty salons. The third lambda parameter must be named `social_content` because the
    # body reads it; a mismatched parameter name raises NameError whenever the name test fails.
    (lambda name, websites, social_content: ("beauty salon" in name.lower() or "centro estetico" in name.lower() or "istituto di bellezza" in name.lower()) or \
                                           (social_content and ("beauty salon" in social_content.lower() or "centro estetico" in social_content.lower() or "istituto di bellezza" in social_content.lower())), "beauty_and_spa-beauty_salon"),
    (lambda name, websites, social_content: ("laundry" in name.lower() or "launderette" in name.lower() or "lavanderia" in name.lower()) or \
                                           (social_content and ("laundry" in social_content.lower() or "launderette" in social_content.lower() or "lavanderia" in social_content.lower())), "professional_services-laundry_services"),
    (lambda name, websites, social_content: ("travel agency" in name.lower() or "agenzia di viaggi" in name.lower()) or \
                                           (social_content and ("travel agency" in social_content.lower() or "agenzia di viaggi" in social_content.lower())), "travel-travel_services-travel_agency"),
    (lambda name, websites, social_content: ("real estate agency" in name.lower() or "agenzia immobiliare" in name.lower()) or \
                                           (social_content and ("real estate agency" in social_content.lower() or "agenzia immobiliare" in social_content.lower())), "real_estate-real_estate_agent_and_broker"),
    (lambda name, websites, social_content: ("funeral home" in name.lower() or "onoranze funebri" in name.lower()) or \
                                           (social_content and ("funeral home" in social_content.lower() or "onoranze funebri" in social_content.lower())), "professional_services-funeral_services_and_cemeteries-funeral_service"),
    (lambda name, websites, social_content: ("cemetery" in name.lower() or "cimitero" in name.lower()) or \
                                           (social_content and ("cemetery" in social_content.lower() or "cimitero" in social_content.lower())), "professional_services-funeral_services_and_cemeteries-cemetery"),
]

In [None]:
def rule_based_category(poi_name, poi_websites, poi_social_content, rules=None):
    """
    Categorizes a POI based on its name, websites, and social media content using predefined rules.

    Rules are tried in order and the first predicate that returns a truthy value wins,
    so more specific rules have to be listed before more general ones.

    Args:
        poi_name (str): The primary name of the POI. Non-string values are treated as "".
        poi_websites (list): A list of website URLs for the POI, can be None or empty.
            It is handed to the rule predicates unchanged.
        poi_social_content (str): Scraped content from the POI's social media pages.
            Non-string values are treated as "".
        rules (iterable, optional): (predicate, category) pairs; each predicate is called
            as predicate(name, websites, social_content). Defaults to the notebook-level
            RULES list when omitted.
    Returns:
        str or None: The predicted category string if a rule matches, otherwise None.
    """
    # The predicates call .lower() on these values, so anything that is not a string
    # (None, NaN, ...) is normalised to an empty string first.
    name_str = poi_name if isinstance(poi_name, str) else ""
    social_content_str = poi_social_content if isinstance(poi_social_content, str) else ""

    # RULES is only looked up when no explicit rule set is supplied.
    active_rules = RULES if rules is None else rules

    for test_condition, category in active_rules:
        if test_condition(name_str, poi_websites, social_content_str):
            return category
    return None

# --- Run the rule-based categorizer over every row of place_dataset ---
print("\n--- Starting Rule-Based Categorization ---")
rule_based_predictions = []      # one {'id', 'name', 'predicted_category_by_rule'} dict per POI
pois_categorized_by_rules = 0    # number of rows for which some rule returned a category
total_pois_processed = 0
poi_category_string_output = ""  # '"name": "category"' pairs joined by ", "; stays "" if nothing runs

# Everything below is skipped (with a warning) unless a non-empty DataFrame is available.
dataset_is_usable = (
    'place_dataset' in locals()
    and isinstance(place_dataset, pandas.DataFrame)
    and not place_dataset.empty
)

if not dataset_is_usable:
    print("Warning: 'place_dataset' not found, not a DataFrame, or is empty. Rule-based categorization skipped.")
else:
    # Fall back to an empty mapping so the per-POI lookup below keeps working
    # when no social content has been scraped.
    if not ('scraped_socials_content' in locals() and isinstance(scraped_socials_content, dict)):
        print("Warning: 'scraped_socials_content' not found or not a dictionary. Social content will not be used for rule-based categorization.")
        scraped_socials_content = {}

    progress = tqdm.tqdm(place_dataset.iterrows(), total=place_dataset.shape[0], desc="Categorizing POIs")
    for index, poi_row in progress:
        total_pois_processed += 1

        # Primary name: only taken from a non-empty dict that has a 'primary' key.
        primary_name = ""
        if 'names' in poi_row:
            names_field = poi_row['names']
            if names_field and isinstance(names_field, dict) and 'primary' in names_field:
                primary_name = names_field['primary']

        # Websites: a list is used as is, a numpy array is converted, anything else becomes None.
        websites_list = None
        if 'websites' in poi_row:
            raw_websites = poi_row['websites']
            if isinstance(raw_websites, np.ndarray):
                websites_list = raw_websites.tolist()
            elif isinstance(raw_websites, list):
                websites_list = raw_websites

        # Social content is looked up by primary name; the expected value is a list of
        # {url: content_string} dicts, and every string value is collected.
        social_content_for_poi_list = scraped_socials_content.get(primary_name, [])
        all_social_texts = []
        if isinstance(social_content_for_poi_list, list):
            all_social_texts = [
                text_val
                for content_item_dict in social_content_for_poi_list
                if isinstance(content_item_dict, dict)
                for text_val in content_item_dict.values()
                if isinstance(text_val, str)
            ]
        social_content_str = " ".join(all_social_texts)

        predicted_category = rule_based_category(primary_name, websites_list, social_content_str)
        pois_categorized_by_rules += 1 if predicted_category else 0

        prediction_record = {
            'id': poi_row['id'] if 'id' in poi_row else None,
            'name': primary_name,
            'predicted_category_by_rule': predicted_category,
        }
        rule_based_predictions.append(prediction_record)

    if total_pois_processed > 0:
        coverage_percentage = (pois_categorized_by_rules / total_pois_processed) * 100
        print(f"Rule-based categorization coverage: {pois_categorized_by_rules}/{total_pois_processed} POIs matched ({coverage_percentage:.2f}%).")
    else:
        print("No POIs in place_dataset to process.")

    # A DataFrame view of the predictions for inspection and later use.
    rule_based_df = pandas.DataFrame(rule_based_predictions)
    print("\nRule-based predictions (first 20 rows):")
    print(rule_based_df.head(20))

    # --- Store the predictions as one formatted string ---
    print("\n--- POI Name to Predicted Category Mappings (String Format) ---")
    prediction_output_list = []
    if rule_based_df.empty:
        print("No predictions to format into a string.")
    else:
        # str() turns a missing prediction (None) into the text "None".
        prediction_output_list = [
            '"{}": "{}"'.format(row['name'], str(row['predicted_category_by_rule']))
            for index, row in rule_based_df.iterrows()
        ]
        poi_category_string_output = ", ".join(prediction_output_list)
        print(poi_category_string_output)

print("--- Rule-Based Categorization Finished ---")

In [None]:
# Re-display the comma-separated '"name": "category"' string built by the rule-based
# categorization cell (it is the empty string when that cell skipped the dataset).
print(poi_category_string_output)

In [None]:
# Set up the API key.
# SECURITY: credentials must never be committed inside a notebook. The key is read from
# the environment instead; the literal key that used to be stored in this cell should be
# treated as leaked and revoked.
api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") or os.getenv("api_key")
if not api_key:
    print("Warning: no Gemini API key found. Set GEMINI_API_KEY (or GOOGLE_API_KEY / api_key) "
          "in the environment before running the LLM cells.")

In [None]:
import os
import google.generativeai as genai
import dspy

# The google.generativeai client and the DSPy LM must share the same key. An environment
# variable named "api_key" still takes precedence when it is set; otherwise the `api_key`
# variable from the key cell is used (previously the DSPy LM received None in that case).
genai.configure(api_key=api_key)
gemini_api_key = os.getenv("api_key") or api_key

# Instantiating and configuring the Gemini model.
# dspy.LM routes requests through LiteLLM, which expects a provider-prefixed model id
# ("gemini/<model>") for Google AI Studio API keys.
lm_gemini = dspy.LM("gemini/gemini-2.0-flash", api_key=gemini_api_key)
dspy.configure(lm=lm_gemini)
model = genai.GenerativeModel("gemini-2.0-flash")

# Prompt for the category-suggestion step. It interpolates categories_list, category_tree,
# scraped_websites, poi_category_string_output and scraped_socials_string, so the cells
# that define those names have to run first.
context = f"""
You are a Category Suggestion Agent. Your job is to read a POI record that I provide you with, consisting of the POI,
rule‐based search results, and data scraped from its website. Now, using this information, propose a category for it
which is as detailed as possible (e.g. output “italian restaurant” instead of “restaurant”) based on its name,
rule‐based search results, and web‐scraped data. Categorize only when you are sure.

These are all the categories in the schema: {categories_list}.
This is a JSON file containing the category tree: {category_tree}.

I will provide you with POI names and website‐scraped information in this format:
  'name': 'website scraped information'
Here are those POIs with website data: {scraped_websites}

I will also provide POI names and rule‐based search results in this format:
  'name': 'rule‐based search category prediction'
Here are those POIs with rule‐based results: {poi_category_string_output}

Finally, I will provide POI names with social‐media scrape results in this format:
  'name': 'socials scraped information'
Here are those POIs with social data: {scraped_socials_string}

Provide me with a category that best suits the primary category for each POI (from the list of categories above).
Choose as detailed as possible, and only select categories you are mostly sure about.

The response should be in the format:
  'name': 'predicted category & confidence percentage'

Be honest: if you are not sure, say so. Do not provide anything else—no explanations, no extra fields.
You do not need to return JSON; just output one line per POI, each as a Python‐style string.

# If you need more information to be certain, let me know.
"""

In [None]:
# -- A. Use gemini-2.5-pro to generate the executor prompt based on the original context.
# Neither dspy.LM objects nor the dspy module provide a generate() call, and the
# prompt-generator LM is not defined at this point of a top-to-bottom run, so both steps
# go through the google.generativeai client configured earlier. Its responses expose the
# `.text` attribute used below.
prompt_response = genai.GenerativeModel("gemini-2.5-pro").generate_content(context)
full_text = prompt_response.text  # combined instructions + EXECUTE_PROMPT
EXECUTE_MARKER = "EXECUTE_PROMPT:"
if EXECUTE_MARKER in full_text:
    # Keep everything after the first marker, without surrounding whitespace/quotes.
    executor_prompt = full_text.split(EXECUTE_MARKER, 1)[1].strip().strip('"')
else:
    executor_prompt = full_text  # fallback if no explicit marker

# -- B. Run the executor prompt through gemini-2.0-flash.
execution_response = genai.GenerativeModel("gemini-2.0-flash").generate_content(executor_prompt)
# The parsing cell that follows reads the LLM output from `response`.
response = execution_response
print(execution_response.text)


In [None]:
# Turn the LLM output (one "'name': 'category & confidence'" line per POI) into a list of
# category strings and store them newline-separated in `predicted_categories_string`.

def parse_llm_categories(llm_text):
    """Extract the predicted category from every line of the LLM output.

    Each non-empty line is expected to look like
    ``'name': 'category & confidence'``. The text after the first ``': '`` (or after
    the first ``':'`` when no space follows the colon) is taken as the prediction; the
    part before ``' & '`` is the category. A line without any colon is treated as a
    bare category. Surrounding single/double quotes and a trailing comma are removed.
    Blank lines, Markdown code-fence lines and empty categories are skipped.

    Args:
        llm_text (str): Raw text returned by the LLM.
    Returns:
        list[str]: Categories in the order they appear in the text.
    """
    categories = []
    for raw_line in llm_text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("```"):
            continue
        if ': ' in line:
            prediction = line.split(': ', 1)[1]
        elif ':' in line:
            prediction = line.split(':', 1)[1]
        else:
            prediction = line
        prediction = prediction.strip().rstrip(',')
        # Quotes are stripped after splitting so that both "'cat & 90%'" and
        # "'cat' & '90%'" yield the bare category.
        category = prediction.split(' & ', 1)[0].strip().strip("'\"").strip()
        if category:
            categories.append(category)
    return categories


predicted_categories_string = ""            # newline-separated categories ("" if none)
predicted_categories_only = []
predicted_categories_list_for_string = []   # kept as a separate copy for later cells

# The generation cell stores its result in `execution_response`; `response` is accepted
# as a fallback. Either a plain string or an object with a `.text` attribute works.
llm_result = globals().get('execution_response', globals().get('response'))
llm_text = llm_result if isinstance(llm_result, str) else getattr(llm_result, 'text', None)

if not isinstance(llm_text, str):
    print("LLM response ('execution_response' or 'response') with a text payload not found.")
else:
    predicted_categories_only = parse_llm_categories(llm_text)
    predicted_categories_list_for_string = list(predicted_categories_only)
    if not predicted_categories_only:
        print("No categories were extracted from the LLM response.")
    else:
        predicted_categories_string = "\n".join(predicted_categories_list_for_string)
        print("\n--- Categories stored in string ---")
        print(predicted_categories_string)

# `predicted_categories_string` now holds the extracted categories, one per line.


In [None]:
# List every POI together with the primary category already stored in the dataset.
dataset_ready = (
    'place_dataset' in locals()
    and isinstance(place_dataset, pandas.DataFrame)
    and not place_dataset.empty
)

if not dataset_ready:
    print("place_dataset is not available or is empty.")
else:
    print("POI Name: Primary Category")
    print("---------------------------")
    for index, poi_row in place_dataset.iterrows():
        # Both fields are read the same way: a non-empty dict with a 'primary' key,
        # otherwise the placeholder "N/A".
        names_field = poi_row['names'] if 'names' in poi_row else None
        if names_field and isinstance(names_field, dict) and 'primary' in names_field:
            primary_name = names_field['primary']
        else:
            primary_name = "N/A"

        categories_field = poi_row['categories'] if 'categories' in poi_row else None
        if categories_field and isinstance(categories_field, dict) and 'primary' in categories_field:
            primary_category = categories_field['primary']
        else:
            primary_category = "N/A"

        print('"{}": "{}"'.format(primary_name, primary_category))

In [None]:
# Collect each POI's existing primary category ("N/A" when it is missing), in row order,
# and keep the result both as a list and as a newline-separated string.
primary_categories_list = []

has_place_dataset = (
    'place_dataset' in locals()
    and isinstance(place_dataset, pandas.DataFrame)
    and not place_dataset.empty
)

if has_place_dataset:
    for index, poi_row in place_dataset.iterrows():
        categories_field = poi_row['categories'] if 'categories' in poi_row else None
        has_primary = categories_field and isinstance(categories_field, dict) and 'primary' in categories_field
        primary_category = categories_field['primary'] if has_primary else "N/A"
        primary_categories_list.append(primary_category)

    # One category per line, aligned with the dataset's row order.
    primary_categories_string = "\n".join(primary_categories_list)
    print(primary_categories_string)
else:
    primary_categories_string = ""  # nothing to store without a valid dataset
    print("place_dataset is not available, not a DataFrame, or is empty. No categories stored.")

# `primary_categories_string` is what the comparison cell reads.

In [None]:
# Compare original and predicted categories position by position.
# Both inputs are newline-separated strings produced by earlier cells.
comparison_inputs_ready = (
    'primary_categories_string' in locals()
    and 'predicted_categories_string' in locals()
    and isinstance(primary_categories_string, str)
    and isinstance(predicted_categories_string, str)
)

if not comparison_inputs_ready:
    print("Error: 'primary_categories_string' or 'predicted_categories_string' not found or not a string.")
    print("Please ensure the previous cells have been executed correctly.")
else:
    # Split into lines and drop empty entries (e.g. from trailing newlines).
    primary_list = [cat for cat in primary_categories_string.strip().split('\n') if cat]
    predicted_list = [cat for cat in predicted_categories_string.strip().split('\n') if cat]

    similar_count = 0
    total_compared = 0
    unmatched_llm_pois = []     # reserved for LLM predictions that did not match
    unmatched_for_review = []   # {'original', 'predicted', 'index_in_list'} per mismatch

    # Only the overlapping prefix can be compared when the LLM returned fewer
    # (or more) lines than there are POIs.
    num_comparisons = min(len(primary_list), len(predicted_list))

    print(f"Comparing {num_comparisons} pairs of categories.")
    print("--- Comparison Results ---")
    print("Original Category  | Predicted Category | Match?")
    print("-------------------|--------------------|--------")

    for i, (original_cat, predicted_cat) in enumerate(zip(primary_list, predicted_list)):
        original_cat = original_cat.strip()
        predicted_cat = predicted_cat.strip()
        total_compared += 1

        is_match = original_cat == predicted_cat
        if is_match:
            similar_count += 1
        else:
            unmatched_for_review.append(
                {'original': original_cat, 'predicted': predicted_cat, 'index_in_list': i}
            )
        match_status = "Yes" if is_match else "No"

        print(f"{original_cat:<20}| {predicted_cat:<20}| {match_status}")

    print(f"\nNumber of matching categories: {similar_count}")
    print(f"Total categories compared: {total_compared}")

    if total_compared > 0:
        accuracy = (similar_count / total_compared) * 100
        print(f"Accuracy: {accuracy:.2f}%")
    else:
        print("No categories were compared.")

    # Make a length mismatch visible, since it limits how many pairs were compared.
    if len(primary_list) != len(predicted_list):
        length_warning = (
            f"\nWarning: Original categories list had {len(primary_list)} entries, "
            f"while predicted categories list had {len(predicted_list)} entries. "
            f"Comparison was done for the first {num_comparisons} entries."
        )
        print(length_warning)

    # Mismatches are not printed here; inspect `unmatched_for_review` to review them.


In [None]:
import google.generativeai as genai
import dspy
import os

# Configure the google.generativeai client with the notebook's API key.
genai.configure(api_key=api_key)
# An environment variable named "api_key" still takes precedence when set; otherwise the
# `api_key` variable is used, so the DSPy LMs no longer receive None as their key.
gemini_api_key = os.getenv("api_key") or api_key

# Instantiate two Gemini LMs via DSPy:
#  - gemini-2.5-pro as the prompt generator
#  - gemini-2.0-flash as the executor
# dspy.LM routes requests through LiteLLM, which expects the provider-prefixed model id
# ("gemini/<model>") for Google AI Studio API keys.
llm_prompt = dspy.LM(
    "gemini/gemini-2.5-pro", api_key=gemini_api_key
)
llm_exec = dspy.LM(
    "gemini/gemini-2.0-flash", api_key=gemini_api_key
)

# Configure DSPy to use the executor model by default
dspy.configure(lm=llm_exec)


In [None]:
# -- A. Use 2.5-pro to generate the executor prompt for comparison
comp_prompt_response = llm_prompt.generate(prompt=comparison_context)
comp_full = comp_prompt_response.text
if "EXECUTE_PROMPT:" in comp_full:
    comp_executor_prompt = comp_full.split("EXECUTE_PROMPT:")[1].strip().strip('"')
else:
    comp_executor_prompt = comp_full

# -- B. Run the comparison prompt with 2.0-flash via DSPY
Comparision = dspy.generate(prompt=comp_executor_prompt)
print(Comparision.text)

# Calculate and print the percentage of 'yes' answers
if 'Comparision' in locals() and hasattr(Comparision, 'text') and Comparision.text:
    answers = Comparision.text.strip().split('\n')
    answers = [ans.strip().lower() for ans in answers if ans.strip()]
    total_answers = len(answers)
    yes_answers = answers.count('yes')
    if total_answers > 0:
        yes_percentage = (yes_answers / total_answers) * 100
        print(f"\nTotal 'yes' answers: {yes_answers}")
        print(f"Total answers from LLM: {total_answers}")
        print(f"Percentage of 'yes' answers: {yes_percentage:.2f}%")
    else:
        print("\nNo answers found in LLM Comparision output to calculate statistics.")
else:
    print("\nLLM Comparision output 'Comparision.text' not found or is empty.")
