In [None]:
import pandas as pd
import json
import time
import os
import random
from pytrends.request import TrendReq
from tqdm.notebook import tqdm
from datetime import datetime

# --- 1. SETTINGS ---
# Shared Google Trends client; fetch_data() replaces it with a fresh instance
# after a rate-limit pause.
pytrends = TrendReq(hl="en-US", tz=360, timeout=(10, 25))

# Target Countries List (All)
# Two-letter country codes, passed one at a time as the `geo` argument of each
# Trends request. Order here is the order in which countries are processed.
ALL_COUNTRIES = [
    "US", "GB", "DE", "FR", "TR", "BR", "IN", "JP", "KR", "RU",
    "IT", "CA", "AU", "ES", "MX", "ID", "NL", "SA", "CH", "SE",
    "PL", "BE", "TH", "IE", "AT", "SG", "AR", "NO", "ZA", "EG",
    "DK", "MY", "PH", "VN", "FI", "AE", "PT", "CO", "NZ", "GR",
    "PK", "UA", "CL", "RO", "CZ", "HU", "IL", "HK", "TW", "NG",
    "KE", "BG", "HR", "SI", "SK", "LT", "RS", "UY", "VE", "PE",
    "EC", "GH", "MA", "LK", "MM", "BD", "EE", "LV", "TN", "BO",
]

# --- 2. LOAD JSON ---
# Load the keyword definitions. Everything below depends on `keyword_data`,
# so a missing file is reported and then re-raised instead of letting the cell
# continue into a confusing NameError a few lines later.
try:
    with open('data/raw/keywords_FINAL_2025.json', 'r', encoding='utf-8') as f:
        keyword_data = json.load(f)
except FileNotFoundError:
    print("ERROR: JSON file not found!")
    raise
else:
    print("JSON file loaded.")

# ID and keyword mapping: topic id -> human-readable label.
# Only queries that carry a 'topic_id' starting with '/' are kept; any other
# query entry is ignored.
all_keywords = {}
for cat in keyword_data['categories']:
    for sg in cat['sub_groups']:
        for q in sg['queries']:
            if 'topic_id' in q and q['topic_id'].startswith('/'):
                all_keywords[q['topic_id']] = q['label']

# Topic ids in insertion order; these are what gets sent to the Trends client.
keyword_ids = list(all_keywords)

# --- 3. RESUME FROM LAST POINT (Memory Module) ---
# A country counts as completed when at least one checkpoint row carries its
# code in the 'Country' column.
checkpoint_file = "data/raw/checkpoint_dataset.csv"
completed_countries = []

# Collected outside the try so that codes read before a failure are still kept.
_seen_countries = set()

if os.path.exists(checkpoint_file):
    try:
        # Read only the 'Country' column to find completed ones
        # chunksize=1000 keeps memory use flat on large files
        chunks = pd.read_csv(checkpoint_file, usecols=['Country'], chunksize=1000)
        for chunk in chunks:
            # Rows with a missing country label (NaN) are not a country and
            # must not inflate the "already completed" count.
            _seen_countries.update(chunk['Country'].dropna().unique().tolist())

        print(f" Memory loaded: {len(_seen_countries)} countries already completed.")
    except Exception as e:
        # Best effort: an unreadable checkpoint only means more work is redone.
        print(f"Checkpoint read warning (may be harmless): {e}")

# Kept as a list for any later cell that expects one; duplicates are already gone.
completed_countries = list(_seen_countries)

# Select only unfinished countries (set membership, original order preserved)
TARGET_COUNTRIES = [c for c in ALL_COUNTRIES if c not in _seen_countries]

if not TARGET_COUNTRIES:
    print(" All countries have already been processed. Task completed!")
else:
    print(f" Remaining {len(TARGET_COUNTRIES)} countries to be processed: {TARGET_COUNTRIES}")

# --- 4. DATA FETCH ENGINE (PATIENT TURTLE MODE) ---
def fetch_data(countries, keyword_ids):
    """Fetch interest-over-time data per country and append it to the checkpoint CSV.

    For every country the keyword ids are queried in groups of five. The
    per-group results are joined side by side on their index, keyword ids are
    renamed to their labels via the module-level ``all_keywords`` mapping, and
    the rows are appended to the module-level ``checkpoint_file``.

    Rows are aligned to the header already stored in the checkpoint file
    before being appended. Without that step the values land under the wrong
    column names, because the frame is built as ``date, <keywords>, Country``
    while the header is ``Country, date, <labels>``, and any skipped keyword
    group shifts the remaining columns.

    Args:
        countries: Iterable of country codes, each passed as ``geo``.
        keyword_ids: List of topic ids to query.

    Returns:
        None. The result is the rows appended to ``checkpoint_file``.

    Notes:
        A rate-limit error (``"429"`` in the exception text) is retried
        without an upper bound, sleeping 15 minutes plus 5 more per retry.
        Any other error skips the current keyword group.
    """
    import csv  # local import: only needed to read the checkpoint header verbatim

    global pytrends
    chunk_size = 5
    keyword_chunks = [keyword_ids[i:i + chunk_size] for i in range(0, len(keyword_ids), chunk_size)]
    SAFE_TIMEFRAME = '2022-01-01 2024-12-30'

    # Create headers if the file does not exist (or exists but is empty)
    expected_cols = ['Country', 'date'] + list(all_keywords.values())
    if not os.path.exists(checkpoint_file) or os.path.getsize(checkpoint_file) == 0:
        pd.DataFrame(columns=expected_cols).to_csv(checkpoint_file, index=False)

    # The header on disk is the source of truth for column order. It is read
    # with the csv module so that repeated labels are not renamed on load.
    with open(checkpoint_file, 'r', encoding='utf-8', newline='') as header_file:
        header_cols = next(csv.reader(header_file), None) or expected_cols

    for country in tqdm(countries, desc="Remaining Countries"):
        frames = []
        # Take a long random breath before starting each country
        time.sleep(random.randint(5, 10))

        for chunk in keyword_chunks:
            retry_count = 0

            # Persistent retry loop: leaves only on success or a non-429 error
            while True:
                try:
                    pytrends.build_payload(chunk, timeframe=SAFE_TIMEFRAME, geo=country)
                    data = pytrends.interest_over_time()
                except Exception as e:
                    if "429" not in str(e):
                        print(f"\n Critical Error ({country} - {chunk}): {e}")
                        # If not 429, skip this keyword group and exit loop
                        break

                    # Rate limit hit. Since IP is static, we must wait LONG.
                    # Wait time: 15 minutes (900 seconds) + increase per retry
                    wait_time = 900 + (retry_count * 300)
                    now = datetime.now().strftime("%H:%M:%S")
                    print(f"\n [{now}] Rate limit hit (429). Taking a {int(wait_time/60)} minute break...")
                    print("Do not shut down the computer, I will wait and continue.")

                    time.sleep(wait_time)
                    retry_count += 1

                    # Refresh Pytrends instance
                    pytrends = TrendReq(hl='en-US', tz=360, timeout=(10,25))
                    continue

                if not data.empty:
                    frames.append(data.drop(columns=['isPartial'], errors='ignore'))

                # HUMAN BEHAVIOR SIMULATION: wait randomly between 12 and 20 seconds between queries
                time.sleep(random.randint(12, 20))
                break

        if frames:
            # One concat over all keyword groups, joined on the shared index.
            country_df = pd.concat(frames, axis=1)
            # Name the index explicitly so reset_index() always yields the
            # 'date' column that the header expects.
            country_df = country_df.rename_axis('date').reset_index()
            country_df.rename(columns=all_keywords, inplace=True)
            country_df['Country'] = country

            # reindex() needs unique source columns. If two topic ids share a
            # label, only the first is kept; the header cannot tell them apart.
            country_df = country_df.loc[:, ~country_df.columns.duplicated()]
            # Match the header's column order; keywords that were skipped or
            # returned no data become empty cells instead of shifting columns.
            country_df = country_df.reindex(columns=header_cols)

            # INSTANT SAVE (Append Mode)
            # Header=False because headers already exist
            country_df.to_csv(checkpoint_file, mode='a', header=False, index=False)
            # print(f" {country} completed.")

# --- 5. START ---
# Run the fetch only when at least one country is still pending; otherwise
# this cell does nothing.
if TARGET_COUNTRIES:
    print("Process started... This may take a long time, sit back and relax.")
    fetch_data(countries=TARGET_COUNTRIES, keyword_ids=keyword_ids)
    print("\n DONE! Check your checkpoint_dataset.csv file.")


JSON file loaded.
 Memory loaded: 71 countries already completed.
 All countries have already been processed. Task completed!



During the large data collection run (over 11,000 rows), some rows may have been written with a missing country label because of connection interruptions or API rate limits.

In this step, we perform a final sanity check to remove these invalid rows (`NaN` values in the 'Country' column) and save the pristine dataset for the analysis phase.

In [None]:
import pandas as pd

# 1. Load the checkpoint dataset
# The collection cell writes the checkpoint to data/raw/, so that location is
# preferred. The bare filename is kept as a fallback for runs where the file
# sits in the working directory.
from pathlib import Path

_checkpoint_path = Path('data/raw/checkpoint_dataset.csv')
if not _checkpoint_path.exists():
    _checkpoint_path = Path('checkpoint_dataset.csv')

# low_memory=False parses the file in one pass so each column gets a single
# inferred dtype.
# NOTE(review): the recorded output shows a warning on this read, presumably a
# mixed-type DtypeWarning; confirm it is gone after this change.
df = pd.read_csv(_checkpoint_path, low_memory=False)

# 2. Remove garbage rows (Drop rows where 'Country' is missing/NaN)
df_clean = df.dropna(subset=['Country'])

# 3. Save the final, clean version
df_clean.to_csv('data/raw/final_proje_dataset_CLEAN.csv', index=False)

print("Cleaning complete! New file created: final_proje_dataset_CLEAN.csv")
print(f"Final Row Count: {len(df_clean)}")

  df = pd.read_csv('checkpoint_dataset.csv')


Cleaning complete! New file created: final_proje_dataset_CLEAN.csv
Final Row Count: 11060
