In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Pandas display options: never truncate the column list, and allow wide
# console output so previews of expanded frames stay on one line.
pd.options.display.max_columns = None
pd.options.display.width = 1000

# CONFIG
# Input: raw capture export (the load step's message says it is produced by
# download_data.ipynb). Output: the cleaned dataset written by the save step.
RAW_DATA_PATH = "../data/raw/dataset_POKEMON_CAPTURED_raw.csv"
CLEAN_DATA_PATH = "../data/dataset_captures_clean.csv"

print("[+] Configuration loaded.")

In [None]:
# LOAD DATA
# Read the raw captures CSV into `df`. When the file is absent, report it and
# fall back to an empty frame so the `df.empty` check that follows skips the
# JSON expansion.
if not os.path.exists(RAW_DATA_PATH):
    print(f"[-] File not found: {RAW_DATA_PATH}. Run download_data.ipynb first.")
    df = pd.DataFrame()
else:
    df = pd.read_csv(RAW_DATA_PATH)
    print(f"[+] Loaded {len(df)} captures.")

# Default to an empty frame so the `df_final.empty` guards in the later cells
# keep working (instead of raising NameError) when no raw data was loaded.
df_final = pd.DataFrame()

if not df.empty:
    # JSON EXPANSION
    def clean_json(x):
        """Return one 'context_data' cell as a dict.

        Args:
            x: An already-decoded dict (e.g. when this cell is re-run), a JSON
                string, or a missing/invalid value such as NaN.

        Returns:
            The decoded dict, or an empty dict when the value is missing, is
            not valid JSON, or decodes to something other than a JSON object.
        """
        if isinstance(x, dict):
            return x
        try:
            parsed = json.loads(x)
        except (TypeError, ValueError):
            # TypeError: non-string input such as NaN.
            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
            return {}
        # Valid JSON that is not an object (null, list, number, ...) cannot be
        # flattened into columns, so treat it like a missing context.
        return parsed if isinstance(parsed, dict) else {}


    df['context_data'] = df['context_data'].apply(clean_json)

    # Flatten JSON (nested keys become dotted column names, e.g. 'ivs.HP_IV')
    df_context = pd.json_normalize(df['context_data'])

    # Drop context columns that already exist in the raw frame, so the join
    # below does not fail on overlapping column names.
    cols_to_drop = df_context.columns.intersection(df.columns)
    if not cols_to_drop.empty:
        df_context = df_context.drop(columns=cols_to_drop)

    # Join on the row index and discard the now-expanded JSON column
    df_final = df.join(df_context).drop(columns=['context_data'])

    print("[+] JSON expanded successfully.")
    display(df_final.head(3))

In [None]:
# FEATURE ENGINEERING (IVs Calculation)
# Adds 'iv_total' / 'iv_percentage', the integer 'is_shiny' flag and one-hot
# 'server_*' columns to df_final. The cell is safe to re-run: every derived
# column is overwritten or replaced rather than appended a second time.

if not df_final.empty:
    # A. Calculate Total IV Percentage
    # Max IV per stat is 31. Total stats = 6. Max Total = 186.
    # The columns created by json_normalize usually look like 'ivs.HP_IV', 'ivs.ATTACK_IV', etc.

    # Find columns that start with 'ivs.'
    iv_cols = [col for col in df_final.columns if col.startswith('ivs.')]

    if iv_cols:
        print(f"[INFO] Found IV columns: {iv_cols}")
        # Sum all IV columns row by row. min_count makes a row with any
        # missing IV yield NaN instead of being scored as if the missing
        # stats were 0, which would fabricate low-quality captures.
        df_final['iv_total'] = df_final[iv_cols].sum(axis=1, min_count=len(iv_cols))
        # Calculate Percentage (0 to 100)
        df_final['iv_percentage'] = (df_final['iv_total'] / 186) * 100
    else:
        print("[-] Warning: No IV columns found. Check JSON structure.")

    # B. Shiny Flag (Ensure it's integer 1/0)
    if 'shiny' in df_final.columns:
        def _shiny_to_flag(value):
            """Map a raw 'shiny' value to 1 or 0.

            Missing values (None/NaN) and false-like strings map to 0; plain
            truthiness would wrongly flag both as shiny.
            """
            if isinstance(value, str):
                return 1 if value.strip().lower() in ('true', '1', 'yes') else 0
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return 0
            return 1 if value else 0

        df_final['is_shiny'] = df_final['shiny'].apply(_shiny_to_flag)

    # C. Server ID (One-Hot)
    if 'server_id' in df_final.columns:
        server_dummies = pd.get_dummies(df_final['server_id'], prefix='server')
        # Drop dummy columns left over from a previous run of this cell so
        # re-running does not create duplicate 'server_*' columns.
        df_final = pd.concat(
            [df_final.drop(columns=server_dummies.columns, errors='ignore'), server_dummies],
            axis=1,
        )

    print("[+] Feature Engineering complete.")
    # Preview only the columns that exist: 'is_shiny' and 'iv_percentage' are
    # created conditionally above, so selecting them blindly could raise KeyError.
    preview_cols = [
        col for col in ['species', 'is_shiny', 'iv_percentage', 'ballUsed']
        if col in df_final.columns
    ]
    display(df_final[preview_cols].head())

In [None]:
# DATA ANALYSIS & VISUALIZATION
# Every graph is skipped when df_final holds no rows.

has_captures = not df_final.empty

# Graph 1: histogram (with KDE) of the IV percentage, drawn only when the
# 'iv_percentage' column exists.
if has_captures and 'iv_percentage' in df_final.columns:
    plt.figure(figsize=(8, 5))
    iv_pct = df_final['iv_percentage']
    sns.histplot(iv_pct, bins=20, kde=True, color='purple')
    plt.title('Distribution of Pokémon Quality (IV %)')
    plt.xlabel('IV Percentage (100% = Perfect)')
    # Dashed red reference line fixed at the 50% mark
    plt.axvline(x=50, color='r', linestyle='--')
    plt.show()

# Graph 2: horizontal count plot of the 10 most frequently captured species,
# ordered from most to least frequent.
if has_captures:
    plt.figure(figsize=(10, 6))
    species_counts = df_final['species'].value_counts()
    top_species = species_counts.head(10).index
    top_captures = df_final[df_final['species'].isin(top_species)]
    sns.countplot(
        data=top_captures,
        y='species',
        order=top_species,
        hue='species',
        palette='crest',
        legend=False
    )
    plt.title('Top 10 Most Captured Pokémon')
    plt.show()

# Graph 3: shiny rate, printed as count / total and a percentage (no plot).
if has_captures and 'is_shiny' in df_final.columns:
    total_count = len(df_final)
    shiny_count = df_final['is_shiny'].sum()
    print(f"[INFO] Shiny Pokémon Captured: {shiny_count} / {total_count} ({shiny_count / total_count * 100:.2f}%)")

In [None]:
# SAVE CLEAN DATASET
# Export a fixed set of capture columns, plus every per-stat IV column and
# every server column, to CLEAN_DATA_PATH as CSV (without the index).

if not df_final.empty:
    cols_to_keep = [
        'species',
        'level',
        'nature',
        'ability',
        'is_shiny',
        'iv_percentage',
        'ballUsed',
        'world',
        'biome'
    ]
    # Add IV columns if you want detailed stats
    cols_to_keep.extend([col for col in df_final.columns if col.startswith('ivs.')])

    # Add server columns (substring match, so the raw 'server_id' column is
    # included together with the one-hot 'server_*' columns)
    cols_to_keep.extend([col for col in df_final.columns if 'server_' in col])

    # De-duplicate while preserving order, so a column matched by more than
    # one rule is exported only once.
    cols_to_keep = list(dict.fromkeys(cols_to_keep))

    # Some columns are created conditionally upstream (e.g. 'is_shiny',
    # 'iv_percentage'); export what exists and report the rest instead of
    # failing with a KeyError.
    missing_cols = [col for col in cols_to_keep if col not in df_final.columns]
    if missing_cols:
        print(f"[-] Warning: columns not found and skipped in export: {missing_cols}")
        cols_to_keep = [col for col in cols_to_keep if col in df_final.columns]

    df_export = df_final[cols_to_keep].copy()

    df_export.to_csv(CLEAN_DATA_PATH, index=False)
    print(f"[+] Clean captures dataset saved to: {CLEAN_DATA_PATH}")
    display(df_export.head())