In [6]:
import json, time, random
from pathlib import Path

import pandas as pd
from pytrends.request import TrendReq
from pytrends.exceptions import TooManyRequestsError

# 1) Pin the project root. If the kernel was started inside a "notebooks"
#    folder that does not contain the keywords file, the root is one level up.
PROJECT_ROOT = Path.cwd()
_keywords_in_cwd = (PROJECT_ROOT / "keywords_FINAL_2025.json").exists()
if not _keywords_in_cwd and PROJECT_ROOT.name.lower() == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent

KEYWORDS_JSON = PROJECT_ROOT / "keywords_FINAL_2025.json"

# 2) Output folders (created on demand)
OUT_DIR = PROJECT_ROOT.joinpath("data", "processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

OUT_CSV = OUT_DIR / "smoke_test.csv"
ERR_CSV = OUT_DIR / "smoke_test_errors.csv"

# 3) Smoke-test parameters (kept small to reduce HTTP 429 responses)
COUNTRIES = ["PL", "TR", "BR"]   # extend this list later
TIMEFRAME = "today 3-m"         # 3 months instead of 12: enough for a test run
BASE_SLEEP = 6                  # minimum wait between two requests

# Echo the resolved paths so a wrong working directory is spotted immediately.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("KEYWORDS_JSON exists:", KEYWORDS_JSON.exists())
print("OUT_CSV:", OUT_CSV)


PROJECT_ROOT: c:\Users\LENOVO\Desktop\Cultural Similarity\Cultural-Similarity
KEYWORDS_JSON exists: True
OUT_CSV: c:\Users\LENOVO\Desktop\Cultural Similarity\Cultural-Similarity\data\processed\smoke_test.csv


In [7]:
# Read the keyword configuration (UTF-8 JSON) into a plain Python object.
cfg = json.loads(KEYWORDS_JSON.read_text(encoding="utf-8"))

def get_group_queries(cfg, category_name, group_name):
    """Return the ``queries`` entry of the first matching sub-group.

    ``cfg`` is read as a mapping with a ``"categories"`` list; each category
    may carry a ``"category_name"`` and a ``"sub_groups"`` list, and each
    sub-group may carry a ``"group_name"`` and ``"queries"``.

    Args:
        cfg: Parsed configuration mapping.
        category_name: Value to match against ``category_name``.
        group_name: Value to match against ``group_name``.

    Returns:
        The ``"queries"`` value of the first sub-group whose category and
        group names both match (``[]`` if that key is absent), or ``[]`` when
        nothing matches.
    """
    # Lazy scan in document order, so the first hit wins and later entries
    # are never inspected.
    matching_groups = (
        group
        for category in cfg.get("categories", [])
        if category.get("category_name") == category_name
        for group in category.get("sub_groups", [])
        if group.get("group_name") == group_name
    )
    first_match = next(matching_groups, None)
    if first_match is None:
        return []
    return first_match.get("queries", [])

# Look up the FINANCE -> Inflation query definitions; abort early if the
# names do not match anything in the JSON.
queries = get_group_queries(cfg, "FINANCE", "Inflation")
if not queries:
    raise ValueError("FINANCE -> Inflation bulunamadı. JSON içindeki isimleri kontrol et.")

# Build every (label, topic_id) pair, then keep at most 4 for the smoke test.
topics = [(q["label"], q["topic_id"]) for q in queries][:4]

print("Selected topics:")
for label, tid in topics:
    print("-", label, tid)


Selected topics:
- Inflation /m/09jx2
- Cost of Living /m/01fss9
- Price Hike /g/11mcmf67yl
- Shrinkflation /m/010w1kk1


In [8]:
# Shared pytrends client for this notebook: host language "en-US", timezone offset 180.
# NOTE(review): the pytrends README gives US CST as tz=360, which suggests positive
# values mean minutes *behind* UTC. If UTC+3 was intended here, the value may need
# to be -180 — confirm against the pytrends documentation.
pytrends = TrendReq(hl="en-US", tz=180)

def fetch_interest_over_time(topic_ids, geo, timeframe):
    """Request interest-over-time data for one batch of topics.

    Builds a payload on the module-level ``pytrends`` client (category 0,
    empty ``gprop``) and returns whatever ``interest_over_time()`` yields
    (presumably a pandas DataFrame — confirm against pytrends).

    Args:
        topic_ids: Keyword/topic ids forwarded as ``kw_list``.
        geo: Geo code forwarded as ``geo``.
        timeframe: Timeframe string forwarded as ``timeframe``.
    """
    payload_options = {
        "kw_list": topic_ids,
        "cat": 0,
        "timeframe": timeframe,
        "geo": geo,
        "gprop": "",
    }
    pytrends.build_payload(**payload_options)
    trends = pytrends.interest_over_time()
    return trends

def fetch_with_retry(topic_ids, geo, timeframe, max_retries=6, base_sleep=8):
    """Fetch interest-over-time data, backing off exponentially on HTTP 429.

    Args:
        topic_ids: Keyword/topic ids forwarded to ``fetch_interest_over_time``.
        geo: Geo code forwarded to ``fetch_interest_over_time``.
        timeframe: Timeframe string forwarded to ``fetch_interest_over_time``.
        max_retries: Total number of attempts; must be at least 1.
        base_sleep: Base delay in seconds. After the k-th failed attempt
            (0-based) the wait is ``base_sleep * 2**k`` plus up to 2 s of
            random jitter.

    Returns:
        Whatever ``fetch_interest_over_time`` returns on the first successful
        attempt.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        TooManyRequestsError: The error from the last attempt, re-raised
            unchanged once every attempt has been rate-limited.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")

    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            return fetch_interest_over_time(topic_ids, geo=geo, timeframe=timeframe)
        except TooManyRequestsError:
            if attempt == final_attempt:
                # Re-raise the original error instead of building a new one:
                # in pytrends 4.9 the exception constructor also expects a
                # `response` argument, so a one-argument construction would
                # fail with TypeError. Raising here also avoids sleeping the
                # longest back-off interval right before giving up.
                print(f"[429] geo={geo} giving up after {max_retries} attempts")
                raise
            wait = base_sleep * (2 ** attempt) + random.uniform(0, 2.0)
            print(f"[429] geo={geo} retry {attempt+1}/{max_retries} -> sleep {wait:.1f}s")
            time.sleep(wait)


In [9]:
# Checkpoint: the set of (geo, topic_id, timeframe) series already stored in
# OUT_CSV, so that a re-run only fetches what is still missing.
seen = set()
if OUT_CSV.exists():
    # keep_default_na=False with na_values=[""] keeps empty cells as NaN but
    # stops pandas from turning literal strings such as the ISO country code
    # "NA" (Namibia) into NaN, which would make that geo never match the
    # checkpoint and be re-fetched on every run.
    old = pd.read_csv(OUT_CSV, keep_default_na=False, na_values=[""])
    # Mark each stored series by its (geo, topic_id, timeframe) key.
    if {"geo", "topic_id", "timeframe"}.issubset(old.columns):
        seen = set(zip(old["geo"], old["topic_id"], old["timeframe"]))
        print("Checkpoint loaded:", len(seen))
    else:
        print("OUT_CSV var ama kolonlar beklenenden farklı. Skip checkpoint.")
else:
    print("No checkpoint yet.")


Checkpoint loaded: 8


In [10]:
# Writes `result` and `err_df` to the smoke-test CSV files under OUT_DIR.
# NOTE(review): `result` is not defined in any visible cell and `err_df` is only
# assigned in a later cell, so this cell appears to rely on leftover kernel state
# and would raise NameError after Restart & Run All — confirm whether it is still
# needed.
# NOTE(review): `out_csv` points at the same file name ("smoke_test.csv") that is
# used as the checkpoint/output elsewhere in the notebook, so running this cell
# overwrites that file with `result` — verify this is intended.
out_csv = OUT_DIR / "smoke_test.csv"
err_csv = OUT_DIR / "smoke_test_errors.csv"

result.to_csv(out_csv, index=False, encoding="utf-8")
err_df.to_csv(err_csv, index=False, encoding="utf-8")

print("Saved:", out_csv)
print("Saved:", err_csv)


Saved: c:\Users\LENOVO\Desktop\Cultural Similarity\Cultural-Similarity\data\processed\smoke_test.csv
Saved: c:\Users\LENOVO\Desktop\Cultural Similarity\Cultural-Similarity\data\processed\smoke_test_errors.csv


In [None]:
records = []   # not used in this cell; kept in case another cell reads it
errors = []


def _persist_errors(error_rows):
    """Write the collected error rows to ERR_CSV and return them as a DataFrame."""
    frame = pd.DataFrame(error_rows)
    frame.to_csv(ERR_CSV, index=False, encoding="utf-8")
    return frame


# If an output file already exists, new rows are appended to its contents.
if OUT_CSV.exists():
    # keep_default_na=False with na_values=[""] keeps empty cells as NaN but
    # preserves literal strings such as the country code "NA".
    existing = pd.read_csv(OUT_CSV, keep_default_na=False, na_values=[""])
    # A file once written with its index comes back with "Unnamed: N" columns;
    # drop them so they are not carried into every later save.
    stray_index_cols = [c for c in existing.columns if str(c).startswith("Unnamed:")]
    existing = existing.drop(columns=stray_index_cols)
else:
    existing = pd.DataFrame()

id_to_label = {tid: lbl for (lbl, tid) in topics}

for geo in COUNTRIES:
    # Work out which topics are still missing for this geo.
    # NOTE(review): Google Trends appears to scale interest relative to the terms
    # sent together in one request, so a partial batch may not be comparable with
    # rows fetched in an earlier, different batch — confirm before analysis.
    missing_ids = [tid for (lbl, tid) in topics if (geo, tid, TIMEFRAME) not in seen]
    missing_labels = [id_to_label[tid] for tid in missing_ids]

    print(f"\n=== GEO: {geo} ===")
    print("Missing:", missing_labels)

    if not missing_ids:
        print("Nothing to fetch. (Already collected)")
        continue

    try:
        df = fetch_with_retry(missing_ids, geo=geo, timeframe=TIMEFRAME)

        if df is None or df.empty:
            errors.append({"geo": geo, "batch": missing_labels, "error": "Empty dataframe"})
            # Persist right away, like the exception path, so the log survives a crash.
            _persist_errors(errors)
        else:
            if "isPartial" in df.columns:
                df = df.drop(columns=["isPartial"])

            # Wide (one column per topic id) -> long (one row per date/topic).
            df_long = (
                df.reset_index()
                .melt(id_vars=["date"], var_name="topic_id", value_name="interest")
            )
            df_long["label"] = df_long["topic_id"].map(id_to_label)
            df_long["geo"] = geo
            df_long["timeframe"] = TIMEFRAME

            # Append and save immediately so a crash loses at most one geo.
            # NOTE(review): `date` is datetime-typed in fresh rows but text in rows
            # reloaded from CSV — confirm the mixed column serialises consistently.
            combined = pd.concat([existing, df_long], ignore_index=True)
            combined.to_csv(OUT_CSV, index=False, encoding="utf-8")
            existing = combined

            # Update the checkpoint set.
            for tid in missing_ids:
                seen.add((geo, tid, TIMEFRAME))

            print(f"Saved rows so far: {len(existing)}")

        # Regular pause between requests, even without a 429.
        time.sleep(BASE_SLEEP + random.uniform(0, 1.5))

    except Exception as e:
        errors.append({"geo": geo, "batch": missing_labels, "error": repr(e)})
        _persist_errors(errors)
        print("Error saved:", repr(e))
        time.sleep(BASE_SLEEP + random.uniform(0, 1.5))

# Save errors (if any)
err_df = _persist_errors(errors)

print("\nDONE")
print("OUT:", OUT_CSV, "exists:", OUT_CSV.exists())
print("ERR:", ERR_CSV, "exists:", ERR_CSV.exists())
display(existing.head())
display(err_df.head())



=== GEO: PL ===
Missing: ['Inflation', 'Cost of Living', 'Price Hike', 'Shrinkflation']


  df = df.fillna(False)


Saved rows so far: 792

=== GEO: TR ===
Missing: ['Inflation', 'Cost of Living', 'Price Hike', 'Shrinkflation']


  df = df.fillna(False)


Saved rows so far: 1160

=== GEO: BR ===
Missing: ['Inflation', 'Cost of Living', 'Price Hike', 'Shrinkflation']


  df = df.fillna(False)


Saved rows so far: 1528

DONE
OUT: c:\Users\LENOVO\Desktop\Cultural Similarity\Cultural-Similarity\data\processed\smoke_test.csv exists: True
ERR: c:\Users\LENOVO\Desktop\Cultural Similarity\Cultural-Similarity\data\processed\smoke_test_errors.csv exists: True


Unnamed: 0,date,topic_id,interest,label,geo,timeframe
0,2024-12-15,/m/09jx2,55,Inflation,PL,today 12-m
1,2024-12-22,/m/09jx2,43,Inflation,PL,today 12-m
2,2024-12-29,/m/09jx2,73,Inflation,PL,today 12-m
3,2025-01-05,/m/09jx2,81,Inflation,PL,today 12-m
4,2025-01-12,/m/09jx2,100,Inflation,PL,today 12-m
