In [1]:
import os

from dotenv import load_dotenv
from etg import GuestRoom

load_dotenv()

# ETG API Credentials
ETG_KEY_ID = os.environ["ETG_KEY_ID"]
ETG_API_KEY = os.environ["ETG_API_KEY"]
ETG_REQUEST_TIMEOUT = 30.0

SCORING_MODEL = os.environ["SCORING_MODEL"]

# Search Parameters
CITY = "–ú–æ—Å–∫–≤–∞"

CHECKIN_DATE = "2026-02-02"
CHECKOUT_DATE = "2026-02-04"

CURRENCY = "RUB"
LANGUAGE = "ru"
RESIDENCY = "RU"

GUESTS: list[GuestRoom] = [{"adults": 2, "children": [4, 2]}]
LIMIT = 1000

# User preferences for AI
USER_PREFERENCES = "–û–±—è–∑–∞—Ç–µ–ª—å–Ω–æ –¥–≤–µ –∫–æ–º–Ω–∞—Ç—ã –∏ –¥–≤–µ –∫—Ä–æ–≤–∞—Ç–∏. –•–æ—Ä–æ—à–∏–µ –æ—Ç–∑—ã–≤—ã. –ß–∏—Å—Ç–æ—Ç–∞"

# Filters
MIN_PRICE: float | None = 3000.0  # None = no minimum
MAX_PRICE: float | None = 20000.0  # None = no maximum

In [2]:
from etg import ETGClient
from utils import ostrovok_url

client = ETGClient(ETG_KEY_ID, ETG_API_KEY, timeout=ETG_REQUEST_TIMEOUT)

In [3]:
async def find_region_id(client: ETGClient, city_name: str, language: str) -> int | None:
    """Find region ID for a city name."""
    print(f"Looking up region ID for '{city_name}'...")
    regions = await client.suggest_region(city_name, language)

    if not regions:
        print(f"  No regions found for '{city_name}'")
        return None

    # Only accept City type
    for region in regions:
        if region["type"] == "City":
            region_id = region["id"]
            print(f"  Found: {region['name']} ({region.get('country_code', '')}), region_id={region_id}")
            return region_id

    # No city found - show available options
    print(f"  No city found. Available regions:")
    for region in regions[:5]:
        print(f"    - {region['name']} (type: {region['type']}, id: {region['id']})")

    return None

In [4]:
# Find region by city name.
# ETG API requires region_id for hotel search, so we first
# lookup the region ID via suggest_region by city name.
region_id = await find_region_id(client, CITY, LANGUAGE)
if not region_id:
    raise ValueError(f"Could not find region for '{CITY}'")

print(f"\nSearching hotels in {CITY}...")
print(f"  Dates: {CHECKIN_DATE} to {CHECKOUT_DATE}")
print(f"  Currency: {CURRENCY}, Limit: {LIMIT}")

  Found: –ú–æ—Å–∫–≤–∞ (RU), region_id=2395

Searching hotels in –ú–æ—Å–∫–≤–∞...
  Dates: 2026-02-02 to 2026-02-04
  Currency: RUB, Limit: 1000


In [5]:
import pandas as pd

from services import filter_hotels_by_price

# Search available hotels in the region with given parameters.
# Filters by price range if MIN_PRICE/MAX_PRICE are set.
# Returns short hotel info: id, hid, and rates (room name, price, meal).
# Full content (name, address, amenities) is fetched separately via hotel content API.
# Search available hotels in the region with given parameters.
# Filters by price range if MIN_PRICE/MAX_PRICE are set.
# Returns short hotel info: id, hid, and rates (room name, price, meal).
# Full content (name, address, amenities) is fetched separately via hotel content API.
search_results = await client.search_hotels_by_region(
    region_id=region_id,
    checkin=CHECKIN_DATE,
    checkout=CHECKOUT_DATE,
    residency=RESIDENCY,
    guests=GUESTS,
    currency=CURRENCY,
    language=LANGUAGE,
    hotels_limit=LIMIT,
)

all_hotels = search_results.get("hotels", [])
total_available = search_results.get("total_hotels", len(all_hotels))

# Filter by price
hotels = filter_hotels_by_price(all_hotels, MIN_PRICE, MAX_PRICE)
total_after_filter = len(hotels)

In [6]:
if not hotels:
    print("No hotels found for the given criteria.")
    df_hotels = pd.DataFrame()
else:
    print(f"Found {total_after_filter} hotels after price filter (total available: {total_available})\n")

    # Create DataFrame
    hotels_data = []
    for hotel in hotels:
        rates = hotel.get("rates", [])
        if rates:
            first_rate = rates[0]
            payment_types = first_rate.get("payment_options", {}).get("payment_types", [])
            price = float(payment_types[0].get("show_amount", 0)) if payment_types else None
            room_name = first_rate.get("room_name", "")
            meal = first_rate.get("meal", "")
        else:
            price = None
            room_name = ""
            meal = ""

        hotels_data.append({
            "hotel_id": hotel["id"],
            "hid": hotel["hid"],
            "price": price,
            "currency": CURRENCY,
            "room": room_name[:40] if room_name else "",
            "meal": meal,
        })

    df_hotels = pd.DataFrame(hotels_data)
    df_hotels = df_hotels.sort_values("price", ascending=True).reset_index(drop=True)
    df_hotels.index += 1  # Start from 1

    # Display settings
    pd.set_option("display.max_colwidth", 50)
    pd.set_option("display.max_rows", 100)

df_hotels

Unnamed: 0,hotel_id,hid,price,currency,room,meal
1,na_baumanskoy_hotel,10613383,6074.0,RUB,–ß–µ—Ç—ã—Ä—ë—Ö–º–µ—Å—Ç–Ω—ã–π –Ω–æ–º–µ—Ä —Å–µ–º–µ–π–Ω—ã–π –≠–∫–æ–Ω–æ–º-–∫–ª–∞,nomeal
2,apartamentyi_sadovoe_koltso_izumrudnaya,8510363,6970.0,RUB,–ê–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã (–ø–∏—Ç–∞–Ω–∏–µ –¥–ª—è –¥–µ—Ç–µ–π –Ω–µ –≤–∫–ª—é—á–µ,nomeal
3,zvezda_hotel_6,10437016,7000.0,RUB,–¢—Ä—ë—Ö–º–µ—Å—Ç–Ω—ã–π –Ω–æ–º–µ—Ä Comfort (–¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞,nomeal
4,minigostinitsa_brusnika_polezhaevskaya,8551959,7149.0,RUB,–î–≤—É—Ö–º–µ—Å—Ç–Ω—ã–π –Ω–æ–º–µ—Ä –ö–æ–º—Ñ–æ—Ä—Ç –ü–ª—é—Å (–¥–≤—É—Å–ø–∞–ª—å,nomeal
5,minihotel_ladomir_na_yauze,8662675,7212.0,RUB,–î–≤—É—Ö–º–µ—Å—Ç–Ω—ã–π –Ω–æ–º–µ—Ä –°–µ–º–µ–π–Ω—ã–π –£–ª—É—á—à–µ–Ω–Ω—ã–π 2,nomeal
...,...,...,...,...,...,...
270,russkie_sezonyi,8848163,38106.0,RUB,–î–≤—É—Ö–º–µ—Å—Ç–Ω—ã–π –ª—é–∫—Å with Sofa Superior –ú–∞–Ω—Å,nomeal
271,panorama_aparthotel,9210977,38180.0,RUB,–î–≤—É—Ö–º–µ—Å—Ç–Ω—ã–π –°—å—é—Ç –ü–∞–Ω–æ—Ä–∞–º–Ω—ã–π (–¥–≤—É—Å–ø–∞–ª—å–Ω–∞—è,nomeal
272,hampton_by_hilton_moscow_rogozhsky_val,9770069,38522.0,RUB,–ù–æ–º–µ—Ä —Å –¥–∏–≤–∞–Ω–æ–º –°–µ–º–µ–π–Ω—ã–π —Å –∫—Ä–æ–≤–∞—Ç—å—é King,breakfast
273,grand_revival_hotel,10004948,38836.0,RUB,–ß–µ—Ç—ã—Ä—ë—Ö–º–µ—Å—Ç–Ω—ã–π –Ω–æ–º–µ—Ä –ë–æ–ª—å—à–æ–π –£–ª—É—á—à–µ–Ω–Ω—ã–π,nomeal


In [7]:
from services import batch_get_content

hotel_ids = [h["hid"] for h in hotels]

print(f"[batch_get_content_start] –ó–∞–≥—Ä—É–∑–∫–∞ –∫–æ–Ω—Ç–µ–Ω—Ç–∞ –¥–ª—è {len(hotel_ids)} –æ—Ç–µ–ª–µ–π...")
content_map = await batch_get_content(client, hotel_ids, LANGUAGE)
print(f"[batch_get_content_done] –ó–∞–≥—Ä—É–∂–µ–Ω –∫–æ–Ω—Ç–µ–Ω—Ç –¥–ª—è {len(content_map)} –∏–∑ {len(hotel_ids)} –æ—Ç–µ–ª–µ–π")

[batch_get_content_done] –ó–∞–≥—Ä—É–∂–µ–Ω –∫–æ–Ω—Ç–µ–Ω—Ç –¥–ª—è 274 –∏–∑ 274 –æ—Ç–µ–ª–µ–π


In [8]:
content_data = []
for hotel_hid, content in content_map.items():
    content_data.append({
        "hid": hotel_hid,
        "name": content.get("name", "")[:40],
        "stars": content.get("star_rating", 0),
        "kind": content.get("kind", ""),
        "address": content.get("address", "")[:50],
        "latitude": content.get("latitude"),
        "longitude": content.get("longitude"),
        "check_in": content.get("check_in_time", ""),
        "check_out": content.get("check_out_time", ""),
    })

df_content = pd.DataFrame(content_data)

# Merge with prices from df_hotels
df_full = df_hotels.merge(df_content, on="hid", how="left")
df_full = df_full[["hotel_id", "name", "stars", "kind", "price", "meal", "address"]]
df_full = df_full.sort_values("price", ascending=True).reset_index(drop=True)
df_full.index += 1

df_full

Unnamed: 0,hotel_id,name,stars,kind,price,meal,address
1,na_baumanskoy_hotel,–û—Ç–µ–ª—å –Ω–∞ –ë–∞—É–º–∞–Ω—Å–∫–æ–π,0,Hotel,6074.0,nomeal,"—É–ª–∏—Ü–∞ –§—Ä–∏–¥—Ä–∏—Ö–∞ –≠–Ω–≥–µ–ª—å—Å–∞, –¥.31/35, –ú–æ—Å–∫–≤–∞"
2,apartamentyi_sadovoe_koltso_izumrudnaya,–ê–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã –°–∞–¥–æ–≤–æ–µ –ö–æ–ª—å—Ü–æ –ò–∑—É–º—Ä—É–¥–Ω–∞—è,0,Apartment,6970.0,nomeal,"–ò–∑—É–º—Ä—É–¥–Ω–∞—è —É–ª–∏—Ü–∞, –¥.6, –ú–æ—Å–∫–≤–∞"
3,zvezda_hotel_6,–ì–æ—Ä–æ–¥—Å–∫–æ–π –æ—Ç–µ–ª—å –ó–≤–µ–∑–¥–∞,0,Hotel,7000.0,nomeal,"–≥. –õ—é–±–µ—Ä—Ü—ã —É–ª. 3-–µ –ü–æ—á—Ç–æ–≤–æ–µ –û—Ç–¥–µ–ª–µ–Ω–∏–µ, –¥. 44–ê,..."
4,minigostinitsa_brusnika_polezhaevskaya,–ú–∏–Ω–∏-–û—Ç–µ–ª—å –ë—Ä—É—Å–Ω–∏–∫–∞ –ü–æ–ª–µ–∂–∞–µ–≤—Å–∫–∞—è,0,Mini-hotel,7149.0,nomeal,"–•–æ—Ä–æ—à–µ–≤—Å–∫–æ–µ —à–æ—Å—Å–µ, –¥.23–∫2 –ø–æ–º–µ—â.1/1, –ú–æ—Å–∫–≤–∞"
5,minihotel_ladomir_na_yauze,–û—Ç–µ–ª—å –õ–∞–¥–æ–º–∏—Ä –Ω–∞ –Ø—É–∑–µ,0,Hotel,7212.0,nomeal,"–ë–æ–ª—å—à–æ–π –ú–∞—Ç—Ä–æ—Å—Å–∫–∏–π –ø–µ—Ä–µ—É–ª–æ–∫, –¥.1 –ø–æ–º–µ—â 1/1, –ú–æ..."
...,...,...,...,...,...,...,...
270,russkie_sezonyi,–ë—É—Ç–∏–∫-–û—Ç–µ–ª—å –†–µ–≥—É–ª,5,Boutique_and_Design,38106.0,nomeal,"–î–µ–≥—Ç—è—Ä–Ω—ã–π –ø–µ—Ä–µ—É–ª–æ–∫, –¥. 8/2, –ú–æ—Å–∫–≤–∞"
271,panorama_aparthotel,–ê–ø–∞—Ä—Ç-–æ—Ç–µ–ª—å ¬´Ôªø–ù–∞ –≤—ã—Å–æ—Ç–µÔªø¬ª,4,Apart-hotel,38180.0,nomeal,"—É–ª–∏—Ü–∞ –ú–æ—Å—Ñ–∏–ª—å–º–æ–≤—Å–∫–∞—è, –¥–æ–º 74–ë, –ú–æ—Å–∫–≤–∞"
272,hampton_by_hilton_moscow_rogozhsky_val,–û—Ç–µ–ª—å Hampton by Hilton Moscow –†–æ–≥–æ–∂—Å–∫–∏–π,4,Hotel,38522.0,breakfast,"—É–ª–∏—Ü–∞ –†–æ–≥–æ–∂—Å–∫–∏–π –í–∞–ª, –¥–æ–º 10, –ú–æ—Å–∫–≤–∞"
273,grand_revival_hotel,–û—Ç–µ–ª—å Revival Hotel,3,Hotel,38836.0,nomeal,"—É–ª–∏—Ü–∞ –ü–µ—Ç—Ä–æ–≤–∫–∞ 19 —Å—Ç—Ä.3, –ú–æ—Å–∫–≤–∞"


In [9]:
from services import batch_get_reviews, filter_reviews

print(f"[batch_get_reviews_start] –ó–∞–≥—Ä—É–∑–∫–∞ –æ—Ç–∑—ã–≤–æ–≤ –¥–ª—è {len(hotel_ids)} –æ—Ç–µ–ª–µ–π...")
raw_reviews_payload = await batch_get_reviews(client, hotel_ids, LANGUAGE)
reviews_map = filter_reviews(raw_reviews_payload)

total_raw = sum(rd["total_reviews"] for rd in raw_reviews_payload.values())
total_filtered = sum(len(rd["reviews"]) for rd in reviews_map.values())
# Compute filtered_by_age: how many reviews were filtered out
total_raw_in_filtered = sum(rd["total_reviews"] for rd in reviews_map.values())
total_filtered_by_age = total_raw_in_filtered - total_filtered
hotels_with_reviews = len(reviews_map)

# Calculate average rating across all hotels
all_avg_ratings = [rd["avg_rating"] for rd in reviews_map.values() if rd["avg_rating"] is not None]
overall_avg = sum(all_avg_ratings) / len(all_avg_ratings) if all_avg_ratings else 0

print(
    f"[batch_get_reviews_done] –í—Å–µ–≥–æ {hotels_with_reviews} –æ—Ç–µ–ª–µ–π —Å –æ—Ç–∑—ã–≤–∞–º–∏ –∏–∑ {len(hotel_ids)}"
)
print(
    f"  –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ {total_raw} –æ—Ç–∑—ã–≤–æ–≤ ‚Üí {total_filtered} —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã—Ö "
    f"(–æ—Ç—Å–µ—á–µ–Ω–æ –ø–æ –¥–∞–≤–Ω–æ—Å—Ç–∏: {total_filtered_by_age})"
)
print(f"  –°—Ä–µ–¥–Ω–∏–π —Ä–µ–π—Ç–∏–Ω–≥: {overall_avg:.1f}/10")

[batch_get_reviews_done] –í—Å–µ–≥–æ 271 –æ—Ç–µ–ª–µ–π —Å –æ—Ç–∑—ã–≤–∞–º–∏ –∏–∑ 274
  –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 44720 –æ—Ç–∑—ã–≤–æ–≤ ‚Üí 34830 —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã—Ö (–æ—Ç—Å–µ—á–µ–Ω–æ –ø–æ –¥–∞–≤–Ω–æ—Å—Ç–∏: 9890)
  –°—Ä–µ–¥–Ω–∏–π —Ä–µ–π—Ç–∏–Ω–≥: 8.5/10


In [10]:
# Create DataFrame with reviews summary
reviews_data = []
for hotel_hid, review_data in reviews_map.items():
    hotel_id = next((h["id"] for h in hotels if h["hid"] == hotel_hid), "")
    detailed_scores = review_data["detailed_averages"]
    reviews_data.append({
        "hotel_id": hotel_id,
        "hid": hotel_hid,
        "total": review_data["total_reviews"],
        "avg_rating": review_data["avg_rating"],
        "cleanness": detailed_scores["cleanness"],
        "location": detailed_scores["location"],
        "price": detailed_scores["price"],
        "services": detailed_scores["services"],
        "room": detailed_scores["room"],
        "meal": detailed_scores["meal"],
        "wifi": detailed_scores["wifi"],
        "hygiene": detailed_scores["hygiene"],
    })

df_reviews = pd.DataFrame(reviews_data)
df_reviews = df_reviews.sort_values("avg_rating", ascending=False, na_position="last").reset_index(drop=True)
df_reviews.index += 1

# Merge with hotel info
df_reviews_full = df_reviews.merge(
    df_content[["hid", "name", "stars"]],
    on="hid",
    how="left"
)
df_reviews_full = df_reviews_full[[
    "hotel_id", "name", "stars", "total", "avg_rating",
    "cleanness", "location", "room", "services", "price", "meal", "wifi", "hygiene"
]]


def show_reviews(hotel_id: str, limit: int = 5) -> None:
    """Show reviews for a hotel."""
    hotel_hid = next((h["hid"] for h in hotels if h["id"] == hotel_id), None)
    if not hotel_hid:
        print(f"Hotel '{hotel_id}' not found")
        return
    
    review_data = reviews_map.get(hotel_hid)
    if not review_data:
        print(f"No reviews for hotel '{hotel_id}'")
        return
    
    hotel_name = content_map.get(hotel_hid, {}).get("name", hotel_id)
    avg_rating = review_data["avg_rating"]
    detailed_scores = review_data["detailed_averages"]
    
    print(f"{'='*70}")
    print(f"{hotel_name}")
    print(f"Total: {review_data['total_reviews']} reviews | Avg rating: {avg_rating}/10" if avg_rating else f"Total: {review_data['total_reviews']} reviews")
    print(f"{'='*70}")
    print(f"Detailed scores:")
    print(f"  cleanness: {detailed_scores['cleanness']}  location: {detailed_scores['location']}  room: {detailed_scores['room']}  services: {detailed_scores['services']}")
    print(f"  price: {detailed_scores['price']}  meal: {detailed_scores['meal']}  wifi: {detailed_scores['wifi']}  hygiene: {detailed_scores['hygiene']}")
    print(f"{'='*70}\n")
    
    reviews = review_data["reviews"][:limit]
    for review in reviews:
        rating = review["rating"]
        date = review["created"][:10]
        lang = review.get("_lang", "?")
        plus = review.get("review_plus", "").strip()
        minus = review.get("review_minus", "").strip()
        print(f"[{rating}/10] {date} [{lang}]")
        if plus:
            print(f"  + {plus[:300]}")
        if minus:
            print(f"  - {minus[:300]}")
        print()


df_reviews_full

Unnamed: 0,hotel_id,name,stars,total,avg_rating,cleanness,location,room,services,price,meal,wifi,hygiene
0,osennyaya_apartments,–ê–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã –û—Å–µ–Ω–Ω—è—è,0,4,10.0,10.0,10.0,10.0,10.0,10.0,,10.0,
1,scandi_na_tverskoy_15_apartments,–ê–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã Scandi –Ω–∞ –¢–≤–µ—Ä—Å–∫–æ–π 15,0,7,10.0,10.0,10.0,10.0,10.0,10.0,,10.0,
2,sovremennaya_i_stilnaya_kvartira_ryadom_s_kremlem,–ö–≤–∞—Ä—Ç–∏—Ä–∞ GM Apartments —Ä—è–¥–æ–º —Å –ö—Ä–µ–º–ª–µ–º –Ω,0,3,10.0,10.0,10.0,10.0,10.0,10.0,,10.0,
3,bobolink_cold_na_sokolinoy_gore_lodging_house,–ê–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã –¥–≤—É—Ö—É—Ä–æ–≤–Ω–µ–≤—ã–µ Cosmo Star –Ω–∞,0,10,10.0,10.0,10.0,10.0,10.0,10.0,,10.0,
4,volgogradskiy_pospekt_325_k3_apartments,–ê–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç—ã –í–æ–ª–≥–æ–≥—Ä–∞–¥—Å–∫–∏–π –ü—Ä–æ—Å–ø–µ–∫—Ç 32/5,0,8,10.0,10.0,10.0,10.0,10.0,10.0,,10.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,na_baumanskoy_hotel,–û—Ç–µ–ª—å –Ω–∞ –ë–∞—É–º–∞–Ω—Å–∫–æ–π,0,45,5.5,4.9,7.2,4.2,6.1,5.6,6.4,10.0,2.0
267,ladomir_na_zelevom_minihotel,–û—Ç–µ–ª—å –õ–∞–¥–æ–º–∏—Ä –Ω–∞ –ó–µ–ª—å–µ–≤–æ–º,0,13,5.2,4.4,7.5,3.9,5.5,3.7,3.8,10.0,
268,zvezda_hotel_6,–ì–æ—Ä–æ–¥—Å–∫–æ–π –æ—Ç–µ–ª—å –ó–≤–µ–∑–¥–∞,0,96,4.8,4.6,5.1,4.0,5.0,4.9,6.2,10.0,5.0
269,hotel_apelsin_vnukovo,–ê–ø–µ–ª—å—Å–∏–Ω –ì–æ—Å—Ç–µ–≤–æ–π –î–æ–º –≤–æ –í–Ω—É–∫–æ–≤–æ,0,35,4.4,4.1,5.9,3.6,4.0,3.8,4.8,10.0,


In [11]:
# Example: view reviews for a specific hotel
# show_reviews("four_seasons_st_petersburg")       # 5 reviews
# show_reviews("four_seasons_st_petersburg", 10)   # 10 reviews

show_reviews("four_seasons_st_petersburg", limit=3)

Hotel 'four_seasons_st_petersburg' not found


In [12]:
from services import combine_hotels_data

combined = combine_hotels_data(hotels, content_map, reviews_map)
print(f"Combined {len(combined)} hotels with content and reviews")

Combined 274 hotels with content and reviews


In [26]:
import json

from services import estimate_tokens, prepare_hotel_for_llm, presort_hotels

# Parameters for review sampling
MAX_REVIEWS_PER_HOTEL = 30
REVIEW_TEXT_MAX_LENGTH = 512

# Estimate tokens before presort
hotels_for_llm_all = [
    prepare_hotel_for_llm(h, MIN_PRICE, MAX_PRICE, MAX_REVIEWS_PER_HOTEL, REVIEW_TEXT_MAX_LENGTH)
    for h in combined
]
tokens_before = estimate_tokens(json.dumps(hotels_for_llm_all, ensure_ascii=False), SCORING_MODEL)

# Pre-sort by hotel kind tier and prescore, limit to top 100 for LLM scoring
PRESORT_LIMIT = 300
top_hotels = presort_hotels(combined, reviews_map, limit=PRESORT_LIMIT)

# Estimate tokens after presort
hotels_for_llm_top = [
    prepare_hotel_for_llm(h, MIN_PRICE, MAX_PRICE, MAX_REVIEWS_PER_HOTEL, REVIEW_TEXT_MAX_LENGTH)
    for h in top_hotels
]
tokens_after = estimate_tokens(json.dumps(hotels_for_llm_top, ensure_ascii=False), SCORING_MODEL)

print(f"[presort_done] {len(combined)} –æ—Ç–µ–ª–µ–π ‚Üí {len(top_hotels)} (–ª–∏–º–∏—Ç {PRESORT_LIMIT})")
print(f"  –¢–æ–∫–µ–Ω—ã: ~{tokens_before:,} ‚Üí ~{tokens_after:,} (—ç–∫–æ–Ω–æ–º–∏—è {tokens_before - tokens_after:,})")

[presort_done] 274 –æ—Ç–µ–ª–µ–π ‚Üí 274 (–ª–∏–º–∏—Ç 300)
  –¢–æ–∫–µ–Ω—ã: ~718,362 ‚Üí ~718,362 (—ç–∫–æ–Ω–æ–º–∏—è 0)


In [25]:
import copy
import json


def tok(x):
    return estimate_tokens(json.dumps(x, ensure_ascii=False), SCORING_MODEL)


full = tok(hotels_for_llm_top)

# Baseline: completely empty hotels
baseline = copy.deepcopy(hotels_for_llm_top)
for h in baseline:
    h["rates"] = []
    h["reviews"] = {"total_reviews": 0, "avg_rating": None, "detailed_averages": {}, "reviews": []}
    h["facts_summary"] = {}
    h["metapolicy_struct"] = {}
    h["serp_filters"] = None
baseline_tokens = tok(baseline)

# ==============================================================================
# REVIEWS BREAKDOWN
# ==============================================================================

# 1.1) Reviews: text content (plus/minus)
no_reviews_text = copy.deepcopy(hotels_for_llm_top)
for h in no_reviews_text:
    for r in h.get("reviews", {}).get("reviews", []):
        r["plus"] = ""
        r["minus"] = ""
reviews_text = full - tok(no_reviews_text)

# 1.2) Reviews: individual metadata (rating/created per review)
no_reviews_individual = copy.deepcopy(no_reviews_text)
for h in no_reviews_individual:
    for r in h.get("reviews", {}).get("reviews", []):
        r["rating"] = None
        r["created"] = ""
reviews_individual = tok(no_reviews_text) - tok(no_reviews_individual)

# 1.3) Reviews: aggregate data (total_reviews/avg_rating/detailed_averages)
no_reviews_aggregates = copy.deepcopy(no_reviews_individual)
for h in no_reviews_aggregates:
    rv = h.get("reviews", {})
    rv["total_reviews"] = 0
    rv["avg_rating"] = None
    rv["detailed_averages"] = {}
reviews_aggregates = tok(no_reviews_individual) - tok(no_reviews_aggregates)

# 1.4) Reviews: structure (empty array)
no_reviews_structure = copy.deepcopy(no_reviews_aggregates)
for h in no_reviews_structure:
    h.get("reviews", {})["reviews"] = []
reviews_structure = tok(no_reviews_aggregates) - tok(no_reviews_structure)

reviews_total = reviews_text + reviews_individual + reviews_aggregates + reviews_structure

# ==============================================================================
# RATES BREAKDOWN
# ==============================================================================

no_reviews = no_reviews_structure

# 2.1) Rates: daily_prices
no_daily_prices = copy.deepcopy(no_reviews)
for h in no_daily_prices:
    for r in h.get("rates", []):
        r["daily_prices"] = []
rates_daily_prices = tok(no_reviews) - tok(no_daily_prices)

# 2.2) Rates: meal_data
no_meal = copy.deepcopy(no_daily_prices)
for h in no_meal:
    for r in h.get("rates", []):
        r["meal_data"] = {}
rates_meal = tok(no_daily_prices) - tok(no_meal)

# 2.3) Rates: room_name
no_room_name = copy.deepcopy(no_meal)
for h in no_room_name:
    for r in h.get("rates", []):
        r["room_name"] = ""
rates_room_name = tok(no_meal) - tok(no_room_name)

# 2.4) Rates: amenities_data
no_amenities = copy.deepcopy(no_room_name)
for h in no_amenities:
    for r in h.get("rates", []):
        r["amenities_data"] = []
rates_amenities = tok(no_room_name) - tok(no_amenities)

# 2.5) Rates: match_hash + deposit
no_rates_meta = copy.deepcopy(no_amenities)
for h in no_rates_meta:
    for r in h.get("rates", []):
        r["match_hash"] = ""
        r["deposit"] = None
rates_meta = tok(no_amenities) - tok(no_rates_meta)

# 2.6) Rates: structure (empty array)
no_rates = copy.deepcopy(no_rates_meta)
for h in no_rates:
    h["rates"] = []
rates_structure = tok(no_rates_meta) - tok(no_rates)

rates_total = rates_daily_prices + rates_meal + rates_room_name + rates_amenities + rates_meta + rates_structure

# ==============================================================================
# HOTEL METADATA BREAKDOWN
# ==============================================================================

# 3.1) Hotel: serp_filters
no_serp = copy.deepcopy(no_rates)
for h in no_serp:
    h["serp_filters"] = None
hotel_serp = tok(no_rates) - tok(no_serp)

# 3.2) Hotel: metapolicy_struct
no_policy = copy.deepcopy(no_serp)
for h in no_policy:
    h["metapolicy_struct"] = {}
hotel_policy = tok(no_serp) - tok(no_policy)

# 3.3) Hotel: facts_summary
no_facts = copy.deepcopy(no_policy)
for h in no_facts:
    h["facts_summary"] = {}
hotel_facts = tok(no_policy) - tok(no_facts)

# 3.4) Hotel: location info (address, check_in/out)
no_location = copy.deepcopy(no_facts)
for h in no_location:
    h["address"] = ""
    h["check_in_time"] = None
    h["check_out_time"] = None
hotel_location = tok(no_facts) - tok(no_location)

# 3.5) Hotel: basic info (id, hid, name, stars, kind, hotel_chain)
hotel_basic = tok(no_location) - baseline_tokens

# 3.6) Hotel: base structure
hotel_structure = baseline_tokens

hotel_total = hotel_serp + hotel_policy + hotel_facts + hotel_location + hotel_basic + hotel_structure

# ==============================================================================
# SUMMARY
# ==============================================================================

total_hotels = len(hotels_for_llm_top)
avg_per_hotel = full / total_hotels if total_hotels > 0 else 0

print(f"{'='*80}")
print(f"TOKEN CONSUMPTION ANALYSIS ({total_hotels} hotels)")
print(f"{'='*80}\n")
print(f"Total tokens: {full:,} (~{avg_per_hotel:.0f} per hotel)\n")

print(f"{'COMPONENT':<30} {'TOKENS':>12} {'%':>8} {'PER HOTEL':>12}")
print(f"{'-'*80}")

components = [
    ("REVIEWS", reviews_total, [
        ("  text (plus/minus)", reviews_text),
        ("  individual meta", reviews_individual),
        ("  aggregates", reviews_aggregates),
        ("  structure", reviews_structure),
    ]),
    ("RATES", rates_total, [
        ("  daily_prices", rates_daily_prices),
        ("  meal_data", rates_meal),
        ("  room_name", rates_room_name),
        ("  amenities_data", rates_amenities),
        ("  match_hash/deposit", rates_meta),
        ("  structure", rates_structure),
    ]),
    ("HOTEL METADATA", hotel_total, [
        ("  serp_filters", hotel_serp),
        ("  metapolicy_struct", hotel_policy),
        ("  facts_summary", hotel_facts),
        ("  location info", hotel_location),
        ("  basic info", hotel_basic),
        ("  structure", hotel_structure),
    ]),
]

for category, cat_total, subcats in components:
    pct = (cat_total / full * 100) if full > 0 else 0
    per_hotel = cat_total / total_hotels if total_hotels > 0 else 0
    print(f"{category:<30} {cat_total:>12,} {pct:>7.2f}% {per_hotel:>11.0f}")
    
    for subcat_name, subcat_tokens in subcats:
        sub_pct = (subcat_tokens / cat_total * 100) if cat_total > 0 else 0
        sub_per_hotel = subcat_tokens / total_hotels if total_hotels > 0 else 0
        print(f"{subcat_name:<30} {subcat_tokens:>12,} {sub_pct:>7.2f}% {sub_per_hotel:>11.0f}")
    print()

# Verification
calculated_total = reviews_total + rates_total + hotel_total
diff = full - calculated_total
if abs(diff) > 10:
    print(f"‚ö†Ô∏è  Warning: Calculated total differs from actual by {diff:,} tokens")
else:
    print(f"‚úì Verification: Total matches (diff: {diff:+,} tokens)")

print(f"\n{'='*80}")
print(f"KEY INSIGHTS")
print(f"{'='*80}")
print(f"‚Ä¢ Reviews text consumes:     {reviews_text:>8,} tokens ({reviews_text/full*100:5.2f}%)")
print(f"‚Ä¢ Rates data consumes:       {rates_total:>8,} tokens ({rates_total/full*100:5.2f}%)")
print(f"‚Ä¢ Hotel metadata consumes:   {hotel_total:>8,} tokens ({hotel_total/full*100:5.2f}%)")
print(f"‚Ä¢ Average per hotel:         {avg_per_hotel:>8,.0f} tokens")

TOKEN CONSUMPTION ANALYSIS (274 hotels)

Total tokens: 402,761 (~1470 per hotel)

COMPONENT                            TOKENS        %    PER HOTEL
--------------------------------------------------------------------------------
REVIEWS                             223,474   55.49%         816
  text (plus/minus)                 173,150   77.48%         632
  individual meta                     5,392    2.41%          20
  aggregates                          8,294    3.71%          30
  structure                          36,638   16.39%         134

RATES                                96,465   23.95%         352
  daily_prices                        5,735    5.95%          21
  meal_data                          17,823   18.48%          65
  room_name                          24,139   25.02%          88
  amenities_data                      7,946    8.24%          29
  match_hash/deposit                 10,307   10.68%          38
  structure                          30,515   31.63%   

In [14]:
import time

from services import finalize_scored_hotels, score_hotels

# Score hotels using single LLM request
# Returns top 10 scored hotels with summary explaining the selection
print(f"[scoring_start] Scoring {len(top_hotels)} hotels...")
start_time = time.time()

scoring_result = await score_hotels(
    top_hotels,
    USER_PREFERENCES,
    guests=GUESTS,
    max_reviews=MAX_REVIEWS_PER_HOTEL,
    review_text_max_length=REVIEW_TEXT_MAX_LENGTH,
    min_price=MIN_PRICE,
    max_price=MAX_PRICE,
    currency=CURRENCY,
    top_count=10
)

elapsed = time.time() - start_time

if scoring_result["error"]:
    print(f"\n‚ùå ERROR: {scoring_result['error']}")
    scored_hotels = None
else:
    scoring_results = scoring_result["results"]
    print(f"[scoring_done] {len(scoring_results)} hotels scored ‚Äî {elapsed:.1f}s")
    print(f"  Estimated tokens: ~{scoring_result['estimated_tokens']:,}")
    print(f"\nüìã Summary:\n{scoring_result['summary']}")
    
    # Finalize scored hotels - merge scoring results with full hotel data
    scored_hotels = finalize_scored_hotels(combined, scoring_results)
    print(f"\n[finalize_done] {len(scored_hotels)} hotels with complete data")


‚ùå ERROR: ModelHTTPError: status_code: 400, model_name: gemini-3-flash-preview, body: {'error': {'code': 400, 'message': 'User location is not supported for the API use.', 'status': 'FAILED_PRECONDITION'}}


In [15]:
from services import HotelScored


def display_top_hotels(
    scored_hotels: list[HotelScored],
    top_n: int = 10,
) -> pd.DataFrame:
    """Display top N scored hotels with details and Ostrovok links."""
    print(f"\n{'='*80}")
    print(f"TOP {top_n} HOTELS")
    print(f"{'='*80}\n")

    data = []
    for i, hotel in enumerate(scored_hotels[:top_n], 1):
        hotel_id = hotel["id"]
        name = hotel["name"]
        hid = hotel["hid"]
        kind = hotel.get("kind", "")
        score = hotel["score"]
        reasons = hotel.get("top_reasons", [])
        penalties = hotel.get("score_penalties", [])
        selected_hash = hotel.get("selected_rate_hash")
        
        # Find selected rate by hash
        rates = hotel.get("rates", [])
        selected_rate = next((r for r in rates if r.get("match_hash") == selected_hash), None)
        
        # Get rate details
        if selected_rate:
            room_name = selected_rate.get("room_name", "")[:50]
            meal_data = selected_rate.get("meal_data", {})
            meal = meal_data.get("value", selected_rate.get("meal", ""))
            
            # Calculate prices from daily_prices
            daily_prices = selected_rate.get("daily_prices", [])
            if daily_prices:
                # Convert string prices to float and sum
                total_price = sum(float(p) for p in daily_prices)
                num_nights = len(daily_prices)
                avg_price_per_night = total_price / num_nights if num_nights > 0 else 0
                
                # Get currency
                pt = selected_rate.get("payment_options", {}).get("payment_types", [])
                currency = pt[0].get("show_currency_code", "") if pt else ""
                
                total_price_str = f"{total_price:.0f} {currency}"
                avg_price_str = f"{avg_price_per_night:.0f} {currency}"
            else:
                # Fallback to payment_types if daily_prices not available
                pt = selected_rate.get("payment_options", {}).get("payment_types", [])
                if pt:
                    total_price = float(pt[0].get("show_amount", 0))
                    currency = pt[0].get("show_currency_code", "")
                    total_price_str = f"{total_price:.0f} {currency}"
                    avg_price_str = f"{total_price:.0f} {currency}"
                else:
                    total_price_str = "N/A"
                    avg_price_str = "N/A"
        else:
            room_name = "N/A"
            meal = "N/A"
            total_price_str = "N/A"
            avg_price_str = "N/A"
        
        # Get reviews data
        reviews = hotel.get("reviews")
        avg_rating = reviews.get("avg_rating") if reviews else None
        detailed = reviews.get("detailed_averages", {}) if reviews else {}
        
        # Generate Ostrovok URL
        url = ostrovok_url(
            hotel_id=hotel_id,
            hid=hid,
            checkin=CHECKIN_DATE,
            checkout=CHECKOUT_DATE,
            guests=GUESTS,
            region_id=region_id,
        )

        # Print detailed info
        print(f"{i}. {name} [{kind}]")
        print(f"   Score: {score}/100 | Rating: {avg_rating}/10" if avg_rating else f"   Score: {score}/100")
        print(f"   Room: {room_name}")
        print(f"   Total: {total_price_str} | Avg per night: {avg_price_str} | Meal: {meal}")
        if reasons:
            print(f"   + {'; '.join(reasons[:3])}")
        if penalties:
            print(f"   - {'; '.join(penalties[:5])}")
        print(f"   üîó {url}")
        print()
        
        # Collect for DataFrame
        data.append({
            "name": name[:35],
            "kind": kind,
            "room": room_name[:30],
            "total": total_price_str,
            "avg/night": avg_price_str,
            "meal": meal,
            "score": score,
            "rating": avg_rating,
            "clean": detailed.get("cleanness"),
            "url": url,
        })
    
    df = pd.DataFrame(data)
    df.index = range(1, len(df) + 1)
    selected = min(top_n, len(scored_hotels))
    print(f"–í—Å–µ–≥–æ –Ω–∞–π–¥–µ–Ω–æ {len(combined)} –æ—Ç–µ–ª–µ–π –Ω–∞ —ç—Ç–∏ –¥–∞—Ç—ã.")
    print(f"–ü–æ–¥–æ–±—Ä–∞–Ω—ã –ª—É—á—à–∏–µ {selected} –ø–æ –≤–∞—à–∏–º –∫—Ä–∏—Ç–µ—Ä–∏—è–º.")
    return df


pd.set_option("display.max_colwidth", 100)
display_top_hotels(scored_hotels, top_n=10)

TypeError: 'NoneType' object is not subscriptable