In [1]:
import os

from dotenv import load_dotenv
from etg import GuestRoom

load_dotenv()

# ETG API Credentials
ETG_KEY_ID = os.environ["ETG_KEY_ID"]
ETG_API_KEY = os.environ["ETG_API_KEY"]
ETG_REQUEST_TIMEOUT = 30.0

# Search Parameters
CITY = "Ярославль"
REGION_ID: int | None = None

CHECKIN_DATE = "2026-02-20"
CHECKOUT_DATE = "2026-02-23"

CURRENCY = "RUB"
LANGUAGE = "ru"
RESIDENCY = "RU"

GUESTS: list[GuestRoom] = [{"adults": 2, "children": [4, 2]}]
LIMIT = 1000

# User preferences for AI
USER_PREFERENCES = "Обязательно две комнаты и две кровати. Хорошие отзывы. Чистота"

# Reviews settings
REVIEWS_PER_SEGMENT = 30
REVIEWS_MAX_AGE_YEARS = 5
NEUTRAL_RATING_THRESHOLD = 7.0
NEGATIVE_RATING_THRESHOLD = 5.0

# Filters
MIN_PRICE: float | None = 3000.0  # None = no minimum
MAX_PRICE: float | None = 10000.0  # None = no maximum

In [2]:
from etg import ETGClient, Hotel, HotelContent

client = ETGClient(ETG_KEY_ID, ETG_API_KEY, timeout=ETG_REQUEST_TIMEOUT)

In [3]:
def find_region_id(client: ETGClient, city_name: str, language: str) -> int | None:
    """Find region ID for a city name."""
    print(f"Looking up region ID for '{city_name}'...")
    regions = client.suggest_region(city_name, language)

    if not regions:
        print(f"  No regions found for '{city_name}'")
        return None

    # Only accept City type
    for region in regions:
        if region["type"] == "City":
            region_id = region["id"]
            print(f"  Found: {region['name']} ({region.get('country_code', '')}), region_id={region_id}")
            return region_id

    # No city found - show available options
    print(f"  No city found. Available regions:")
    for r in regions[:5]:
        print(f"    - {r['name']} (type: {r['type']}, id: {r['id']})")

    return None

In [4]:
# Find region
region_id = REGION_ID or find_region_id(client, CITY, LANGUAGE)
if not region_id:
    raise ValueError(f"Could not find region for '{CITY}'")

print(f"\nSearching hotels in {CITY}...")
print(f"  Dates: {CHECKIN_DATE} to {CHECKOUT_DATE}")
print(f"  Currency: {CURRENCY}, Limit: {LIMIT}")

Looking up region ID for 'Ярославль'...
  [ETG] /api/b2b/v3/search/multicomplete/ - 200 in 0.93s
  Found: Ярославль (RU), region_id=180349

Searching hotels in Ярославль...
  Dates: 2026-02-20 to 2026-02-23
  Currency: RUB, Limit: 1000


In [5]:
import pandas as pd

# Search hotels
results = client.search_hotels_by_region(
    region_id=region_id,
    checkin=CHECKIN_DATE,
    checkout=CHECKOUT_DATE,
    residency=RESIDENCY,
    guests=GUESTS,
    currency=CURRENCY,
    language=LANGUAGE,
    hotels_limit=LIMIT,
)

hotels: list[Hotel] = results.get("hotels", [])
total_hotels = results.get("total_hotels", len(hotels))

if not hotels:
    print("No hotels found for the given criteria.")
    df_hotels = pd.DataFrame()
else:
    print(f"Found {len(hotels)} hotels (total available: {total_hotels})\n")

    # Create DataFrame
    hotels_data = []
    for hotel in hotels:
        rates = hotel.get("rates", [])
        if rates:
            first_rate = rates[0]
            payment_types = first_rate.get("payment_options", {}).get("payment_types", [])
            price = float(payment_types[0].get("show_amount", 0)) if payment_types else None
            room_name = first_rate.get("room_name", "")
            meal = first_rate.get("meal", "")
        else:
            price = None
            room_name = ""
            meal = ""
        
        hotels_data.append({
            "hotel_id": hotel["id"],
            "hid": hotel["hid"],
            "price": price,
            "currency": CURRENCY,
            "room": room_name[:40] if room_name else "",
            "meal": meal,
        })

    df_hotels = pd.DataFrame(hotels_data)
    df_hotels = df_hotels.sort_values("price", ascending=True).reset_index(drop=True)
    df_hotels.index += 1  # Start from 1

    # Display settings
    pd.set_option("display.max_colwidth", 50)
    pd.set_option("display.max_rows", 100)

df_hotels

  [ETG] /api/b2b/v3/search/serp/region/ - 200 in 1.33s
Found 368 hotels (total available: 368)



Unnamed: 0,hotel_id,hid,price,currency,room,meal
1,gostepriimnaya_flat_2,13175199,7707.0,RUB,Четырёхместные апартаменты c 1 комнатой,nomeal
2,miniotel_iuzhnyi,8543975,7768.0,RUB,Двухместный номер Standard (общая ванная,nomeal
3,na_volodarskogo_61_flat,13107839,8565.0,RUB,Четырёхместные апартаменты Standard (пит,nomeal
4,panorama_apartments_29,9997786,8715.0,RUB,Апартаменты с 2 комнатами с балконом и с,nomeal
5,avtozavodskaya_75_apartments,9986664,8837.0,RUB,Студия (питание для детей не включено),nomeal
...,...,...,...,...,...,...
364,glemping_klever,9998037,75588.0,RUB,Четырёхместная Сфера семейная (питание д,nomeal
365,hotel_domy_podryadchika_terentyeva,11248010,83642.0,RUB,Гранд Делюкс,breakfast
366,minihotel_yaroslavskogo_delfinaria,6293137,89748.0,RUB,Двухместный люкс (двуспальная кровать) (,breakfast
367,ryibinskaya_24_tokyo_apartments,9989083,125554.0,RUB,Апартаменты Дуплекс с красивым видом из,nomeal


In [6]:
from services import filter_hotels_by_price

total_before = len(hotels)

# Apply price filter
hotels = filter_hotels_by_price(hotels, MIN_PRICE, MAX_PRICE)

# Remove very cheap hotels (< 30 EUR per night)
hotels = filter_hotels_by_price(hotels, min_price_per_night=30.0)

print(f"[hotels_found] Найдено {len(hotels)} отелей из {total_before} доступных"
      + (f" (фильтр: от {MIN_PRICE:.0f} до {MAX_PRICE:.0f} за ночь)" if MIN_PRICE or MAX_PRICE else ""))

[hotels_found] Найдено 290 отелей из 368 доступных (фильтр: от 3000 до 10000 за ночь)


In [7]:
from services import fetch_hotel_content

In [8]:
hotel_hids = [h["hid"] for h in hotels]

print(f"[content_progress] Загрузка контента для {len(hotel_hids)} отелей...")
content_map = fetch_hotel_content(client, hotel_hids, LANGUAGE)
print(f"[content_done] Загружен контент для {len(content_map)} из {len(hotel_hids)} отелей")

[content_progress] Загрузка контента для 290 отелей...
  [ETG] /api/content/v1/hotel_content_by_ids/ - 200 in 1.11s
  [ETG] /api/content/v1/hotel_content_by_ids/ - 200 in 0.54s
  [ETG] /api/content/v1/hotel_content_by_ids/ - 200 in 0.53s
[content_done] Загружен контент для 290 из 290 отелей


In [9]:
content_data = []
for hid, content in content_map.items():
    content_data.append({
        "hid": hid,
        "name": content.get("name", "")[:40],
        "stars": content.get("star_rating", 0),
        "kind": content.get("kind", ""),
        "address": content.get("address", "")[:50],
        "latitude": content.get("latitude"),
        "longitude": content.get("longitude"),
        "check_in": content.get("check_in_time", ""),
        "check_out": content.get("check_out_time", ""),
    })

df_content = pd.DataFrame(content_data)

# Merge with prices from df_hotels
df_full = df_hotels.merge(df_content, on="hid", how="left")
df_full = df_full[["hotel_id", "name", "stars", "kind", "price", "meal", "address"]]
df_full = df_full.sort_values("price", ascending=True).reset_index(drop=True)
df_full.index += 1

df_full

Unnamed: 0,hotel_id,name,stars,kind,price,meal,address
1,gostepriimnaya_flat_2,,,,7707.0,nomeal,
2,miniotel_iuzhnyi,,,,7768.0,nomeal,
3,na_volodarskogo_61_flat,,,,8565.0,nomeal,
4,panorama_apartments_29,,,,8715.0,nomeal,
5,avtozavodskaya_75_apartments,,,,8837.0,nomeal,
...,...,...,...,...,...,...,...
364,glemping_klever,,,,75588.0,nomeal,
365,hotel_domy_podryadchika_terentyeva,,,,83642.0,breakfast,
366,minihotel_yaroslavskogo_delfinaria,,,,89748.0,breakfast,
367,ryibinskaya_24_tokyo_apartments,,,,125554.0,nomeal,


In [10]:
from services import fetch_reviews, filter_reviews, HotelReviewsFiltered

print(f"[reviews_progress] Загрузка отзывов для {len(hotel_hids)} отелей...")
raw_reviews = fetch_reviews(client, hotel_hids, LANGUAGE)
reviews_map = filter_reviews(
    raw_reviews,
    max_age_years=REVIEWS_MAX_AGE_YEARS,
    reviews_per_segment=REVIEWS_PER_SEGMENT,
    neutral_threshold=NEUTRAL_RATING_THRESHOLD,
    negative_threshold=NEGATIVE_RATING_THRESHOLD,
)

total_raw = sum(len(revs) for revs in raw_reviews.values())
total_filtered = sum(len(rd["reviews"]) for rd in reviews_map.values())
hotels_with_reviews = len(reviews_map)
total_positive = sum(rd["positive_count"] for rd in reviews_map.values())
total_neutral = sum(rd["neutral_count"] for rd in reviews_map.values())
total_negative = sum(rd["negative_count"] for rd in reviews_map.values())

print(
    f"[reviews_summary] Обработано {total_raw} отзывов → {total_filtered} релевантных "
    f"({total_positive} позитивных, {total_neutral} нейтральных, {total_negative} негативных) "
    f"для {hotels_with_reviews} отелей"
)

[reviews_progress] Загрузка отзывов для 290 отелей...
  [ETG] /api/content/v1/hotel_reviews_by_ids/ - 200 in 0.93s
  [ETG] /api/content/v1/hotel_reviews_by_ids/ - 200 in 0.58s
  [ETG] /api/content/v1/hotel_reviews_by_ids/ - 200 in 0.49s
  [ETG] /api/content/v1/hotel_reviews_by_ids/ - 200 in 0.33s
  [ETG] /api/content/v1/hotel_reviews_by_ids/ - 200 in 0.37s
  [ETG] /api/content/v1/hotel_reviews_by_ids/ - 200 in 0.35s
[reviews_summary] Обработано 4720 отзывов → 3012 релевантных (2501 позитивных, 326 нейтральных, 185 негативных) для 275 отелей


In [11]:
# Create DataFrame with reviews summary
reviews_data = []
for hid, data in reviews_map.items():
    hotel_id = next((h["id"] for h in hotels if h["hid"] == hid), "")
    reviews_data.append({
        "hotel_id": hotel_id,
        "hid": hid,
        "total": data["total_reviews"],
        "positive": data["positive_count"],
        "neutral": data["neutral_count"],
        "negative": data["negative_count"],
    })

df_reviews = pd.DataFrame(reviews_data)
df_reviews = df_reviews.sort_values("total", ascending=False).reset_index(drop=True)
df_reviews.index += 1

# Merge with hotel info
df_reviews_full = df_reviews.merge(
    df_content[["hid", "name", "stars"]],
    on="hid",
    how="left"
)
df_reviews_full = df_reviews_full[["hotel_id", "name", "stars", "total", "positive", "neutral", "negative"]]


def show_reviews(hotel_id: str, segment: str = "all", limit: int = 5) -> None:
    """
    Show reviews for a hotel.
    
    Args:
        hotel_id: Hotel ID (e.g. 'rosewood_hong_kong')
        segment: 'positive', 'negative', 'neutral', or 'all'
        limit: Number of reviews to show per segment
    """
    hid = next((h["hid"] for h in hotels if h["id"] == hotel_id), None)
    if not hid:
        print(f"Hotel '{hotel_id}' not found")
        return
    
    data = reviews_map.get(hid)
    if not data:
        print(f"No reviews for hotel '{hotel_id}'")
        return
    
    hotel_name = content_map.get(hid, {}).get("name", hotel_id)
    print(f"{'='*60}")
    print(f"{hotel_name}")
    print(f"Total: {data['total_reviews']} | +{data['positive_count']} / ~{data['neutral_count']} / -{data['negative_count']}")
    print(f"{'='*60}\n")
    
    reviews = data["reviews"]
    
    def print_segment(name: str, filter_fn, limit: int):
        segment_reviews = [r for r in reviews if filter_fn(r)][:limit]
        if not segment_reviews:
            return
        print(f"--- {name} ({len(segment_reviews)}) ---")
        for r in segment_reviews:
            rating = r["rating"]
            date = r["created"][:10]
            lang = r.get("_lang", "?")
            plus = r.get("review_plus", "").strip()
            minus = r.get("review_minus", "").strip()
            print(f"\n[{rating}/10] {date} [{lang}]")
            if plus:
                print(f"  + {plus[:300]}")
            if minus:
                print(f"  - {minus[:300]}")
        print()
    
    if segment in ("all", "positive"):
        print_segment("POSITIVE", lambda r: r["rating"] >= NEUTRAL_RATING_THRESHOLD, limit)
    if segment in ("all", "neutral"):
        print_segment("NEUTRAL", lambda r: NEGATIVE_RATING_THRESHOLD <= r["rating"] < NEUTRAL_RATING_THRESHOLD, limit)
    if segment in ("all", "negative"):
        print_segment("NEGATIVE", lambda r: r["rating"] < NEGATIVE_RATING_THRESHOLD, limit)


df_reviews_full

Unnamed: 0,hotel_id,name,stars,total,positive,neutral,negative
0,city_house_city_house,Отель City House,0,358,30,30,14
1,skvoznoj_pereulok_7,Гостевой Дом Сквозной Переулок 7,0,333,30,15,3
2,kotorosl,Гостиница Которосль,3,218,30,26,7
3,hotel_dva_kryila,Отель Два крыла,0,213,30,14,8
4,milana_guest_house_4,Гостевой Дом Милана,0,163,30,15,11
...,...,...,...,...,...,...,...
270,v_tsentre_apartments_179,Апартаменты в центре,0,1,1,0,0
271,2k_apartamenty_apartcomfort_vishnyaki_5k2_apar...,Апартаменты 2к Апартаменты Apart-Comfort,0,1,1,0,0
272,avtozavodskaya_103_apartments,Апартаменты Сансет Семейная квартира для,0,1,1,0,0
273,sovremennaya_studiya_v_tsentre_yaroslavlya_flat,Квартира Современная студия в центре Яро,0,1,1,0,0


In [12]:
# Example: view reviews for a specific hotel
# show_reviews("four_seasons_st_petersburg")              # all segments, 5 per segment
# show_reviews("four_seasons_st_petersburg", "negative")  # only negative
# show_reviews("four_seasons_st_petersburg", "all", 10)   # all segments, 10 per segment

show_reviews("four_seasons_st_petersburg", limit=3)

Hotel 'four_seasons_st_petersburg' not found


In [13]:
class HotelCombined(Hotel, HotelContent):
    """Combined hotel data from search, content, and reviews."""
    reviews: HotelReviewsFiltered


def combine_hotel_data(
    hotels: list[Hotel],
    content_map: dict[int, HotelContent],
    reviews_map: dict[int, HotelReviewsFiltered],
) -> list[HotelCombined]:
    """Combine search results, content, and reviews."""
    print("Combining hotel data...")
    combined: list[HotelCombined] = []

    for hotel in hotels:
        hid = hotel["hid"]
        content = content_map.get(hid, {})
        reviews = reviews_map.get(hid, {
            "reviews": [],
            "total_reviews": 0,
            "positive_count": 0,
            "neutral_count": 0,
            "negative_count": 0,
        })

        combined.append({  # type: ignore[arg-type]
            **hotel,
            **content,
            "reviews": reviews,
        })

    print(f"  Combined {len(combined)} hotels")
    return combined

In [14]:
combined = combine_hotel_data(hotels, content_map, reviews_map)

Combining hotel data...
  Combined 290 hotels


In [15]:
import json

# Estimate token count for combined data
combined_json = json.dumps(combined, ensure_ascii=False)
char_count = len(combined_json)
# Rough estimate: ~4 chars per token for English, ~2-3 for Russian/mixed
estimated_tokens = char_count // 3

print(f"JSON size: {char_count:,} chars ({char_count / 1024 / 1024:.2f} MB)")
print(f"Estimated tokens: ~{estimated_tokens:,}")

JSON size: 8,054,867 chars (7.68 MB)
Estimated tokens: ~2,684,955


In [16]:
from services import presort_hotels, score_hotels

In [18]:
# Pre-sort and limit to top 100 for LLM scoring
top_hotels = presort_hotels(combined, reviews_map, limit=100)
prescores = [h.get("prescore", 0.0) for h in top_hotels]
print(f"Pre-sorted: {len(top_hotels)} hotels for LLM scoring (prescore {min(prescores):.0f}–{max(prescores):.0f})")

# Debug: show full prompts
DEBUG_SHOW_PROMPT = False

# Score hotels with progress
scoring_results = []
scoring_error = None

async for result in score_hotels(
    top_hotels,
    USER_PREFERENCES,
    currency=CURRENCY,
    min_price=MIN_PRICE,
    max_price=MAX_PRICE,
    batch_size=25,
):
    if result["type"] == "start":
        start = result["start"]
        print(f"\nScoring {start['total_hotels']} hotels in {start['total_batches']} batches (~{start['estimated_tokens']:,} tokens total)\n")
    elif result["type"] == "batch_start":
        bs = result["batch_start"]
        print(f"  → Batch {bs['batch']}/{bs['total_batches']}: {bs['hotels_in_batch']} hotels, ~{bs['estimated_tokens']:,} tokens")
        if DEBUG_SHOW_PROMPT:
            print(f"\n{'='*80}\nPROMPT (batch {bs['batch']}):\n{'='*80}\n{bs['prompt']}\n{'='*80}\n")
    elif result["type"] == "retry":
        retry = result["retry"]
        print(f"    ⚠ Retry {retry['attempt']}/{retry['max_attempts']}: {retry['error']}")
    elif result["type"] == "error":
        error = result["error"]
        scoring_error = error
        print(f"\n  ❌ ERROR [{error['error_type']}]: {error['message']}")
        break
    elif result["type"] == "progress":
        progress = result["progress"]
        print(f"  ✓ Batch {progress['batch']}/{progress['total_batches']} done ({progress['processed']}/{progress['total']} hotels)")
    elif result["type"] == "done":
        scoring_results = result["results"]

if scoring_error:
    print(f"\nScoring failed. Please check the error above.")
else:
    print(f"\nTotal scored: {len(scoring_results)} hotels")

Pre-sorted: 100 hotels for LLM scoring (prescore 54–75)
работает  claude-haiku-4-5-20251001
Клод работает  claude-haiku-4-5-20251001

Scoring 100 hotels in 100 batches (~134,765 tokens total)

  → Batch 1/100: 1 hotels, ~1,617 tokens


CancelledError: 

In [None]:
from typing import Any
from services import get_ostrovok_url


def display_top_hotels(
    results: list[dict[str, Any]],
    hotels_data: list[dict[str, Any]],
    city: str,
    country_code: str = "DE",
    top_n: int = 10,
) -> pd.DataFrame:
    """Display top N scored hotels with details and Ostrovok links."""
    # Build hotel_id -> hid mapping
    hid_map = {h.get("id", ""): h.get("hid", 0) for h in hotels_data}
    name_map = {h.get("id", ""): h.get("name", h.get("id", "")) for h in hotels_data}
    
    print(f"\n{'='*80}")
    print(f"TOP {top_n} HOTELS")
    print(f"{'='*80}\n")

    data = []
    for i, hotel in enumerate(results[:top_n], 1):
        hotel_id = hotel.get("hotel_id", "")
        hid = hid_map.get(hotel_id, 0)
        score = hotel.get("score", 0)
        name = name_map.get(hotel_id, hotel_id)
        reasons = hotel.get("top_reasons", [])
        penalties = hotel.get("score_penalties", [])
        
        url = get_ostrovok_url(hotel_id, hid, city, country_code) if hid else ""

        # Print detailed info
        print(f"{i}. {name}")
        print(f"   Score: {score}/100")
        if reasons:
            print(f"   + {'; '.join(reasons[:3])}")
        if penalties:
            print(f"   - {'; '.join(penalties[:5])}")
        print(f"   {url}")
        print()
        
        # Collect for DataFrame
        data.append({
            "name": name[:40],
            "score": score,
            "reasons": "; ".join(reasons[:2])[:80] if reasons else "",
            "penalties": "; ".join(penalties[:5])[:120] if penalties else "",
            "url": url,
        })
    
    df = pd.DataFrame(data)
    df.index = range(1, len(df) + 1)
    total_found = len(hotels_data)
    selected = min(top_n, len(results))
    print(f"Всего найдено {total_found} отелей на эти даты. ")
    print(f"Подобраны лучшие {selected} по вашим критериям.")
    return df


# Get country code from first hotel's region
first_hotel = combined[0] if combined else {}
region = first_hotel.get("region", {})
country_code = region.get("country_code", "DE")

pd.set_option("display.max_colwidth", 100)
display_top_hotels(scoring_results, combined, CITY, country_code, top_n=10)