---
title: "Data Collection"
format:
    html: 
        code-fold: false
---

{{< include instructions.qmd >}} 


{{< include overview.qmd >}} 

{{< include methods.qmd >}} 

## Data Collection from Riot Games API

Purpose: In the following code, we collect data from the Riot Games API using Python.

Data is collected for a list of summoner names, retrieving their account information, match history, and detailed match data. The collected data is then organized into Pandas DataFrames for data cleansing and EDA.

Below is the high-level code structure:

- Import packages
- Config:
    - Config absolute file paths
    - Config API

Constraints of the script:

- My region is limited to NA
- The API has rate limits


Output is located in data/raw-data


```python

# In [1]:
#import packages
import json
import os
import random
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
from urllib.parse import quote

import pandas as pd
import requests


# Riot Games policy forbids publishing API keys, so the key is never stored in
# this file. Export it before running, e.g. `export RIOT_API_KEY=RGAPI-...`.
# When the variable is unset the placeholder below is used, which
# collect_tft_ranked_data() rejects with a clear error.
RIOT_API_KEY = os.environ.get("RIOT_API_KEY", "PASTE_YOUR_RIOT_API_KEY_HERE")

# Output locations, resolved against the current working directory
# (for a notebook this is the directory the kernel was started in).
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
RAW_MATCH_DIR = DATA_DIR / "raw-data"
DATA_DIR.mkdir(parents=True, exist_ok=True)
RAW_MATCH_DIR.mkdir(parents=True, exist_ok=True)

# Routing values used to build the API host names
PLATFORM_REGION = "na1"       # for league & summoner
REGIONAL_ROUTING = "americas" # for match-v1

# Ranked standard TFT queue id (NOT normals, not Hyper Roll, not Double Up)
RANKED_TFT_QUEUE_ID = 1100

# Sampling controls
# Applied per tier; the collector samples the Challenger and Grandmaster leagues.
MAX_PLAYERS_PER_TIER = 100
MATCHES_PER_PUUID = 20
# Retry policy for riot_get(): number of attempts and linear backoff multiplier (seconds).
MAX_RETRIES = 4
BACKOFF_FACTOR = 1.5


# -----------------------------
# Low-level request helper
# -----------------------------

def riot_get(
    url: str,
    api_key: str,
    params: Optional[Dict[str, Any]] = None,
    max_retries: int = MAX_RETRIES,
    backoff_factor: float = BACKOFF_FACTOR,
) -> Any:
    """Send an authenticated GET request to the Riot API and return the parsed JSON.

    Retries are attempted for rate limiting (429), server errors (5xx) and
    transient network failures (connection errors and timeouts). Any other
    non-success status is raised immediately.

    Args:
        url: Full request URL.
        api_key: Riot API key, sent in the ``X-Riot-Token`` header.
        params: Optional query-string parameters.
        max_retries: Maximum number of attempts before giving up.
        backoff_factor: Multiplier (seconds) for the linear backoff used on
            server errors and network failures.

    Returns:
        The decoded JSON body of the successful response.

    Raises:
        RuntimeError: On a non-retryable HTTP error, or when every attempt
            has been used up.
    """
    headers = {"X-Riot-Token": api_key}
    params = params or {}
    last_error: Optional[Exception] = None

    for attempt in range(max_retries):
        try:
            resp = requests.get(url, headers=headers, params=params, timeout=15)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as exc:
            # A dropped connection or timeout is transient; treat it like a 5xx
            # instead of aborting the whole collection run.
            last_error = exc
            sleep_time = backoff_factor * (attempt + 1)
            print(
                f"[NET] {type(exc).__name__}. Retry {attempt+1}/{max_retries}, "
                f"sleeping {sleep_time:.1f}s"
            )
            time.sleep(sleep_time)
            continue

        # Rate limited: honor the server-provided Retry-After (seconds)
        if resp.status_code == 429:
            retry_after = resp.headers.get("Retry-After", "1")
            try:
                retry_after = int(retry_after)
            except ValueError:
                # Header present but not an integer; fall back to one second
                retry_after = 1
            sleep_time = retry_after + 1
            print(f"[429] Rate limited. Sleeping {sleep_time} seconds...")
            time.sleep(sleep_time)
            continue

        # 5xx server errors: linear backoff
        if 500 <= resp.status_code < 600:
            sleep_time = backoff_factor * (attempt + 1)
            print(f"[{resp.status_code}] Server error. Retry {attempt+1}/{max_retries}, sleeping {sleep_time:.1f}s")
            time.sleep(sleep_time)
            continue

        # Other errors are not retryable: raise
        if not resp.ok:
            raise RuntimeError(f"Request failed {resp.status_code}: {resp.text}")

        # Success
        return resp.json()

    # Chain the last network error (if any) so the root cause stays visible
    raise RuntimeError(f"Failed after {max_retries} retries: {url}") from last_error


# -----------------------------
# League & summoners (NA1)
# -----------------------------

def get_league_list(tier: str, api_key: str, platform_region: str = PLATFORM_REGION) -> Dict[str, Any]:
    """Fetch the TFT league payload for one apex tier.

    Args:
        tier: "challenger", "grandmaster" or "master" (case-insensitive).
        api_key: Riot API key passed through to riot_get().
        platform_region: Platform host prefix used to build the URL.

    Returns:
        Whatever riot_get() returns for the league endpoint.

    Raises:
        ValueError: If the tier is not one of the three accepted names.
    """
    allowed_tiers = ("challenger", "grandmaster", "master")
    normalized = tier.lower()

    if normalized in allowed_tiers:
        endpoint = f"https://{platform_region}.api.riotgames.com/tft/league/v1/{normalized}"
        return riot_get(endpoint, api_key)

    raise ValueError("tier must be one of {'challenger', 'grandmaster', 'master'}")


def extract_summoner_names(league_json: Dict[str, Any], max_players: Optional[int] = None) -> List[str]:
    """Return a random sample of summoner names from a league payload.

    Only entries with at least one win and a non-empty ``summonerName`` are
    considered. Entries missing the name field are skipped rather than
    raising ``KeyError``, and a missing or null ``entries`` list yields an
    empty result.

    Args:
        league_json: League payload containing an ``entries`` list of dicts.
        max_players: Upper bound on the number of names returned; ``None``
            means no limit. Negative values are treated as zero.

    Returns:
        The selected names in random order.
    """
    entries = league_json.get("entries") or []
    names = [
        entry["summonerName"]
        for entry in entries
        # `or 0` guards against an explicit null in the "wins" field
        if entry.get("summonerName") and (entry.get("wins") or 0) > 0
    ]

    # Shuffle first so the truncation below is a uniform random sample
    random.shuffle(names)
    if max_players is not None and len(names) > max_players:
        names = names[:max(max_players, 0)]

    return names


def get_puuid_for_summoner(name: str, api_key: str, platform_region: str = PLATFORM_REGION) -> Optional[str]:
    """Look up the PUUID for a summoner name on a given platform.

    This is best-effort: any failure is reported as a warning and ``None``
    is returned so one bad name does not abort the whole run.

    Args:
        name: Summoner name to resolve.
        api_key: Riot API key passed through to riot_get().
        platform_region: Platform host prefix used to build the URL.

    Returns:
        The ``puuid`` field of the response, or ``None`` when the lookup
        fails or the field is absent.
    """
    # safe="" also escapes "/", so the name always stays a single path segment
    encoded_name = quote(name, safe="")
    url = f"https://{platform_region}.api.riotgames.com/tft/summoner/v1/summoners/by-name/{encoded_name}"

    try:
        data = riot_get(url, api_key)
        return data.get("puuid")
    except Exception as e:
        # Deliberately broad: skip this player and keep collecting
        print(f"[WARN] Failed to get PUUID for '{name}': {e}")
        return None


# -----------------------------
# Matches (AMERICAS routing)
# -----------------------------

def get_match_ids_for_puuid(
    puuid: str,
    api_key: str,
    regional_routing: str = REGIONAL_ROUTING,
    count: int = MATCHES_PER_PUUID,
) -> List[str]:
    """Fetch the most recent TFT match IDs for one player.

    Args:
        puuid: Player identifier inserted into the request path.
        api_key: Riot API key passed through to riot_get().
        regional_routing: Regional host prefix used to build the URL.
        count: Number of match IDs requested, starting from offset 0.

    Returns:
        The list returned by the API, or an empty list when the request
        fails or the response is not a list (a warning is printed).
    """
    endpoint = f"https://{regional_routing}.api.riotgames.com/tft/match/v1/matches/by-puuid/{puuid}/ids"
    query = {"start": 0, "count": count}

    try:
        payload = riot_get(endpoint, api_key, params=query)
    except Exception as e:
        print(f"[WARN] Failed to get match IDs for puuid={puuid}: {e}")
        return []

    if isinstance(payload, list):
        return payload

    # Anything other than a list is treated as "no matches"
    print(f"[WARN] Unexpected match_ids type for puuid={puuid}: {type(payload)}")
    return []


def get_match_detail(
    match_id: str,
    api_key: str,
    regional_routing: str = REGIONAL_ROUTING
) -> Optional[Dict[str, Any]]:
    """Download the full JSON for a single match.

    Args:
        match_id: Match identifier inserted into the request path.
        api_key: Riot API key passed through to riot_get().
        regional_routing: Regional host prefix used to build the URL.

    Returns:
        The value returned by riot_get(), or ``None`` if the request raised
        (a warning is printed in that case).
    """
    endpoint = f"https://{regional_routing}.api.riotgames.com/tft/match/v1/matches/{match_id}"

    detail = None
    try:
        detail = riot_get(endpoint, api_key)
    except Exception as e:
        print(f"[WARN] Failed to fetch match {match_id}: {e}")
    return detail


# -----------------------------
# Participants index builder
# -----------------------------

def participants_from_match(match_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Flatten one match JSON into lightweight per-participant rows.

    Each row repeats the match-level fields (match id, version, queue, set,
    length, datetime), copies a handful of participant fields as-is, and
    stores only the COUNT of traits, units and augments. Missing keys come
    through as None (or 0 for the counts). Heavier feature engineering
    belongs in the EDA/cleaning script.
    """
    match_level_keys = (
        "game_version",
        "queue_id",
        "tft_set_number",
        "game_length",
        "game_datetime",
    )
    player_level_keys = (
        "puuid",
        "placement",
        "level",
        "gold_left",
        "last_round",
        "time_eliminated",
        "players_eliminated",
    )
    # (output column, source list field)
    counted_lists = (
        ("num_traits", "traits"),
        ("num_units", "units"),
        ("num_augments", "augments"),
    )

    info = match_data.get("info", {})

    # Fields shared by every participant of this match
    shared = {"match_id": match_data.get("metadata", {}).get("match_id")}
    for key in match_level_keys:
        shared[key] = info.get(key)

    rows = []
    for player in info.get("participants", []):
        row = dict(shared)
        row.update((key, player.get(key)) for key in player_level_keys)
        row.update((column, len(player.get(source, []))) for column, source in counted_lists)
        rows.append(row)

    return rows

def collect_tft_ranked_data() -> None:
    """Run the full collection pipeline for high-elo ranked TFT matches.

    Steps:
        1. Fetch the Challenger and Grandmaster leagues and sample up to
           MAX_PLAYERS_PER_TIER summoner names from each.
        2. Resolve each name to a PUUID.
        3. Fetch up to MATCHES_PER_PUUID recent match IDs per PUUID.
        4. Download each unique match, keep only RANKED_TFT_QUEUE_ID, write
           the raw JSON to RAW_MATCH_DIR and build a participants index.

    The index is saved as ``participants_index_raw.csv`` in DATA_DIR. A short
    pause follows every per-item request, including failed and skipped ones,
    so request pacing stays consistent.

    Raises:
        ValueError: If RIOT_API_KEY is empty or still the placeholder text.
    """
    api_key = RIOT_API_KEY
    if not api_key or "PASTE_YOUR_RIOT_API_KEY_HERE" in api_key:
        raise ValueError("Please paste your actual Riot API key into RIOT_API_KEY.")

    print("Using Riot API key.")

    # Pause (seconds) between consecutive per-item requests
    request_pause = 0.1

    # 1) Get league players (Challenger + Grandmaster)
    print("Fetching Challenger and Grandmaster leagues from NA1...")
    chall = get_league_list("challenger", api_key)
    gm = get_league_list("grandmaster", api_key)

    chall_names = extract_summoner_names(chall, max_players=MAX_PLAYERS_PER_TIER)
    gm_names = extract_summoner_names(gm, max_players=MAX_PLAYERS_PER_TIER)
    all_names = list(dict.fromkeys(chall_names + gm_names))  # order-preserving dedupe

    print(f"Collected {len(all_names)} unique high-elo summoner names.")

    # 2) Convert summoner names to PUUIDs (failed lookups return None and are dropped)
    puuids = []
    for name in all_names:
        puuid = get_puuid_for_summoner(name, api_key)
        if puuid:
            puuids.append(puuid)
        time.sleep(request_pause)

    puuids = list(dict.fromkeys(puuids))
    print(f"Resolved {len(puuids)} unique PUUIDs.")

    # 3) Get match IDs for each PUUID
    all_match_ids: List[str] = []
    for i, puuid in enumerate(puuids, start=1):
        print(f"[{i}/{len(puuids)}] Fetching match IDs for PUUID {puuid[:8]}...")
        ids = get_match_ids_for_puuid(puuid, api_key, count=MATCHES_PER_PUUID)
        all_match_ids.extend(ids)
        time.sleep(request_pause)

    # Players in the same lobby share match IDs, so dedupe before downloading
    all_match_ids = list(dict.fromkeys(all_match_ids))
    print(f"Collected {len(all_match_ids)} unique match IDs before filtering.")

    # 4) Fetch match details, filter to ranked standard, save raw JSON + index rows
    participants_rows: List[Dict[str, Any]] = []

    for idx, match_id in enumerate(all_match_ids, start=1):
        print(f"[{idx}/{len(all_match_ids)}] Fetching match {match_id}...")
        match_data = get_match_detail(match_id, api_key)
        # Pause right after the request so failed or skipped matches are
        # throttled as well.
        time.sleep(request_pause)
        if match_data is None:
            continue

        info = match_data.get("info", {})
        queue_id = info.get("queue_id")

        # Keep only ranked TFT standard
        if queue_id != RANKED_TFT_QUEUE_ID:
            print(f"  - Skipping match {match_id} (queue_id={queue_id})")
            continue

        # Save raw JSON to data/raw-data/{match_id}.json
        raw_path = RAW_MATCH_DIR / f"{match_id}.json"
        with raw_path.open("w", encoding="utf-8") as f:
            json.dump(match_data, f)

        participants_rows.extend(participants_from_match(match_data))

    if participants_rows:
        participants_df = pd.DataFrame(participants_rows)
        out_path = DATA_DIR / "participants_index_raw.csv"
        participants_df.to_csv(out_path, index=False)
        print(f"Saved participants index with {len(participants_df)} rows to {out_path}")
    else:
        print("No ranked TFT matches collected. Check your key, region, or limits.")
