In [None]:
import pandas as pd
import json
import time
from pytrends.request import TrendReq
from tqdm.notebook import tqdm

# --- 1. SETTINGS ---
# Shared Google Trends client used by every request in this notebook.
# hl is the interface language; tz is the timezone offset sent to Google
# (NOTE(review): 360 presumably means US Central in minutes - confirm).
pytrends = TrendReq(tz=360, hl="en-US")

# Country codes to scan, in scan order; each one is used as the `geo`
# of a Google Trends request.
TARGET_COUNTRIES = [
    "US", "GB", "DE", "FR", "TR", "BR", "IN", "JP", "KR", "RU",
    "IT", "CA", "AU", "ES", "MX", "ID", "NL", "SA", "CH", "SE",
    "PL", "BE", "TH", "IE", "AT", "SG", "AR", "NO", "ZA", "EG",
    "DK", "MY", "PH", "VN", "FI", "AE", "PT", "CO", "NZ", "GR",
    "PK", "UA", "CL", "RO", "CZ", "HU", "IL", "HK", "TW", "NG",
    "KE", "BG", "HR", "SI", "SK", "LT", "RS", "UY", "VE", "PE",
    "EC", "GH", "MA", "LK", "MM", "BD", "EE", "LV", "TN", "BO",
]

# --- 2. LOAD JSON ---
# Read the keyword catalogue. Everything below depends on keyword_data, so a
# missing file must stop the run here instead of surfacing later as a
# confusing NameError.
try:
    with open('keywords_FINAL_2025.json', 'r', encoding='utf-8') as f:
        keyword_data = json.load(f)
except FileNotFoundError:
    print("ERROR: keywords_FINAL_2025.json could not be found!")
    raise
else:
    print("JSON loaded successfully")

# Match IDs and Tags: topic ID -> human-readable label.
# Walks categories -> sub_groups -> queries of the loaded JSON.
all_keywords = {}
for cat in keyword_data['categories']:
    for sg in cat['sub_groups']:
        for q in sg['queries']:
            topic_id = q.get('topic_id')
            # Only include queries that carry a topic ID, i.e. a string
            # starting with "/" (such as /m/... or /g/...). Missing, null or
            # non-string IDs are skipped rather than crashing the loop.
            if isinstance(topic_id, str) and topic_id.startswith('/'):
                all_keywords[topic_id] = q['label']

# Request order follows insertion order of the mapping above.
keyword_ids = list(all_keywords.keys())
print(f"Total number of topics to be scanned: {len(keyword_ids)}")

# --- 3. DATA EXTRACTION ENGINE ---
def _fetch_chunk(chunk, country, timeframe, max_retries, retry_wait):
    """Request one keyword chunk for one country, retrying failed requests.

    Args:
        chunk: List of topic IDs sent together in a single request.
        country: Country code used as the request's ``geo``.
        timeframe: Timeframe string in 'YYYY-MM-DD YYYY-MM-DD' format.
        max_retries: Maximum number of attempts for this chunk.
        retry_wait: Seconds to wait after the first failure; the wait
            doubles after every further failure.

    Returns:
        The DataFrame returned by ``pytrends.interest_over_time()`` (it may
        be empty), or ``None`` if the chunk had to be abandoned.
    """
    for attempt in range(1, max_retries + 1):
        try:
            # Build the payload, then download the time series for it.
            pytrends.build_payload(chunk, timeframe=timeframe, geo=country)
            return pytrends.interest_over_time()
        except Exception as e:
            # Debugging aid: show exactly which country/chunk failed and why.
            print(f"\n HATA OLUŞTU!")
            print(f"Ülke: {country}")
            print(f"Hatalı Grup: {chunk}")
            print(f"Hata Kodu: {e}")

            # A 400 most likely means one of the IDs in this chunk is
            # malformed; repeating the identical request cannot succeed, so
            # give up on this chunk right away and let the caller move on.
            if "400" in str(e):
                print("İPUCU: Bu gruptaki kelimelerden birinin ID'si hatalı olabilir!")
                print("Lütfen JSON dosyasındaki bu kelimeleri kontrol et.")
                return None

            # Anything else (e.g. a 429 rate limit) is treated as transient:
            # cool down, doubling the wait on every failure. The wait also
            # happens after the last attempt so the next chunk does not hit
            # Google immediately after a failure.
            wait = retry_wait * 2 ** (attempt - 1)
            if attempt < max_retries:
                print(f"Retrying in {wait} s (attempt {attempt + 1} of {max_retries})")
            time.sleep(wait)

    return None


def fetch_data(countries, keyword_ids, *, timeframe='2022-01-01 2024-12-30',
               chunk_size=5, request_pause=2, max_retries=3, retry_wait=60,
               checkpoint_path="checkpoint_dataset.csv"):
    """Download interest-over-time data for every country and keyword.

    Keywords are requested in chunks; the chunk results of one country are
    joined side by side (one column per keyword), a ``Country`` column is
    added, and the per-country frames are stacked. After each country that
    produced data, everything collected so far is written to
    ``checkpoint_path``.

    NOTE(review): Google Trends scales the values of each request on its own
    0-100 range, so numbers from different chunks or countries are not
    necessarily comparable - confirm this is acceptable for the analysis.

    Args:
        countries: Iterable of country codes, each used as ``geo``.
        keyword_ids: List of topic IDs to request.
        timeframe: Timeframe string in 'YYYY-MM-DD YYYY-MM-DD' format.
        chunk_size: Keywords per request (Google allows 5 at a time).
        request_pause: Seconds to sleep after each chunk so Google does not
            block the client.
        max_retries: Attempts per chunk before it is skipped.
        retry_wait: Base wait in seconds after a failed request; doubled on
            each further failure of the same chunk.
        checkpoint_path: CSV file overwritten with all data collected so far
            after each country.

    Returns:
        A DataFrame with the date column, one column per keyword ID and a
        ``Country`` column; an empty DataFrame if nothing could be fetched.

    Raises:
        ValueError: If ``chunk_size`` or ``max_retries`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    # Divide the keywords into groups of chunk_size.
    keyword_chunks = [
        keyword_ids[i:i + chunk_size]
        for i in range(0, len(keyword_ids), chunk_size)
    ]

    full_dataset = []

    for country in tqdm(countries, desc="Scanning countries..."):
        # Collect the chunk frames and join them once per country instead of
        # growing a DataFrame with a concat per chunk.
        chunk_frames = []

        for chunk in keyword_chunks:
            data = _fetch_chunk(chunk, country, timeframe, max_retries, retry_wait)

            if data is not None and not data.empty:
                # 'isPartial' is metadata, not a keyword series.
                chunk_frames.append(data.drop(columns=['isPartial'], errors='ignore'))

            # Pause between requests so Google does not block the client.
            time.sleep(request_pause)

        if chunk_frames:
            country_df = pd.concat(chunk_frames, axis=1)
            country_df['Country'] = country
            country_df.reset_index(inplace=True)  # turn the date index into a column
            full_dataset.append(country_df)

            # Save everything collected so far after every country, so an
            # interrupted run does not lose the finished countries.
            pd.concat(full_dataset).to_csv(checkpoint_path, index=False)

    if full_dataset:
        return pd.concat(full_dataset)
    # Return an empty frame so callers do not fail when nothing was fetched.
    return pd.DataFrame()

# --- 4. START ---
# Run the full scan over every target country and keyword.
print("Starting fetching datas...")
final_df = fetch_data(TARGET_COUNTRIES, keyword_ids)

# Swap the topic-ID column names for their labels (ID -> Word); columns that
# are not topic IDs are left as they are.
final_df = final_df.rename(columns=all_keywords)

# Persist the finished dataset and report a short summary.
final_df.to_csv("final_project_dataset.csv", index=False)
print(
    "\nData collection completed!",
    f"Final row count: {len(final_df)}",
    "File name: final_project_dataset.csv",
    sep="\n",
)

JSON loaded successfully
Total number of topics to be scanned: 178
Starting fetching datas...


Ülkeler Taranıyor:   0%|          | 0/70 [00:00<?, ?it/s]


 HATA OLUŞTU!
Ülke: US
Hatalı Grup: ['/m/05p0rrx', '/m/0108bn2x', '/g/11tskkw5c9', '/g/11sfdkgmfn', '/g/11hcz1r4wl']
Hata Kodu: The request failed: Google returned a response with code 429
