In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Pandas display options: never truncate the column list, and allow wide rows
# so expanded frames stay readable in the notebook output.
for _option, _value in (('display.max_columns', None), ('display.width', 1000)):
    pd.set_option(_option, _value)

# CONFIG
# Input (raw release events) and output (cleaned dataset) CSV locations.
RAW_DATA_PATH = "../data/raw/dataset_POKEMON_RELEASED_raw.csv"
CLEAN_DATA_PATH = "../data/dataset_released_clean.csv"

print("[+] Configuration loaded.")

In [None]:
# LOAD DATA
# Read the raw release events; when the file is missing, fall back to an empty
# frame so the rest of the notebook can detect "nothing to process".
if not os.path.exists(RAW_DATA_PATH):
    print(f"[-] File not found: {RAW_DATA_PATH}. Run download_data.ipynb first.")
    df = pd.DataFrame()
else:
    df = pd.read_csv(RAW_DATA_PATH)
    print(f"[+] Loaded {len(df)} release events.")

def clean_json(x):
    """Coerce one raw ``context_data`` cell into a dict.

    Dicts pass through unchanged and strings are parsed with ``json.loads``.
    Anything that cannot be parsed (e.g. NaN, malformed text), or that parses
    to a non-object JSON value (list, number, null), becomes ``{}`` so that a
    single bad row cannot abort the expansion.

    Args:
        x: The raw cell value.

    Returns:
        dict: The parsed object, or an empty dict when no object is available.
    """
    if isinstance(x, dict):
        return x
    try:
        parsed = json.loads(x)
    except (TypeError, ValueError):
        # TypeError: non-string cells such as NaN.
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        return {}
    return parsed if isinstance(parsed, dict) else {}


# Always define df_final so the `df_final.empty` guards in later cells work
# even when no data was loaded (previously that path raised NameError).
df_final = pd.DataFrame()

if not df.empty:
    # JSON EXPANSION
    df['context_data'] = df['context_data'].apply(clean_json)

    # Flatten JSON: one row per event, nested keys become dotted column names.
    df_context = pd.json_normalize(df['context_data'].tolist())
    # Align explicitly with df so the join below matches rows one-to-one.
    df_context.index = df.index

    # Remove duplicated columns (the top-level CSV columns take precedence)
    cols_to_drop = df_context.columns.intersection(df.columns)
    if not cols_to_drop.empty:
        df_context = df_context.drop(columns=cols_to_drop)

    # Join
    df_final = df.join(df_context).drop(columns=['context_data'])

    print("[+] JSON expanded successfully.")
    display(df_final.head(3))

In [None]:
# FEATURE ENGINEERING (Why did they throw it away?)

# Denominator used to turn the summed IVs into a percentage.
# Presumably 6 stats x 31 max IV each — TODO confirm against the data schema.
MAX_IV_TOTAL = 186

if not df_final.empty:
    # A. Calculate IV Percentage (Quality)
    # We expect released pokemon to have LOWER IVs on average than captured ones
    iv_cols = [col for col in df_final.columns if col.startswith('ivs.')]

    if iv_cols:
        # min_count=1 keeps rows with no IV data at all as NaN instead of
        # turning them into a fake 0% that would skew the distribution.
        df_final['iv_total'] = df_final[iv_cols].sum(axis=1, min_count=1)
        df_final['iv_percentage'] = (df_final['iv_total'] / MAX_IV_TOTAL) * 100
        print("[+] IVs calculated.")

    # B. Shiny Flag
    # Who releases a shiny?! Maybe by mistake, or if they have too many.
    if 'shiny' in df_final.columns:
        # NaN is truthy in Python, so a plain truthiness test would mark rows
        # with a missing value as shiny; treat missing as "not shiny".
        df_final['is_shiny'] = df_final['shiny'].apply(
            lambda x: int(pd.notna(x) and bool(x))
        )

    # C. Time Held (Did they keep it for long?)
    # timeHeldCalculated usually comes in ms. Convert to Minutes.
    if 'timeHeldCalculated' in df_final.columns:
        df_final['minutes_owned'] = df_final['timeHeldCalculated'] / (1000 * 60)

    # D. Server ID (one-hot encoded as server_<id> columns)
    if 'server_id' in df_final.columns:
        server_dummies = pd.get_dummies(df_final['server_id'], prefix='server')
        # Drop dummy columns left over from a previous run of this cell so
        # re-running it does not produce duplicate column labels.
        df_final = pd.concat(
            [df_final.drop(columns=server_dummies.columns, errors='ignore'), server_dummies],
            axis=1,
        )

    print("[+] Feature Engineering complete.")
    # The engineered columns above are created conditionally; preview only the
    # ones that actually exist to avoid a KeyError.
    preview_cols = [
        col for col in ('species', 'iv_percentage', 'is_shiny', 'minutes_owned')
        if col in df_final.columns
    ]
    display(df_final[preview_cols].head())

In [None]:
# DATA ANALYSIS & VISUALIZATION

if not df_final.empty:
    # Graph 1: Quality of Released Pokemon
    # If the curve leans to the left (0%), players are optimizing their storage.
    # Skipped when the column is absent or holds no values (nothing to plot).
    if 'iv_percentage' in df_final.columns and df_final['iv_percentage'].notna().any():
        plt.figure(figsize=(8, 5))
        sns.histplot(df_final['iv_percentage'], bins=20, kde=True, color='brown')
        plt.title('Distribution of Released Pokémon IVs')
        plt.xlabel('IV Percentage')
        plt.show()

    # Graph 2: Top Rejected Species
    # Guarded like Graph 1 so a missing column skips the plot instead of raising.
    if 'species' in df_final.columns:
        plt.figure(figsize=(10, 6))
        # value_counts() sorts descending, so this index is already in rank order.
        top_rejected = df_final['species'].value_counts().head(10).index
        sns.countplot(
            y='species',
            data=df_final[df_final['species'].isin(top_rejected)],
            order=top_rejected,
            palette='Reds_d',
            hue='species',
            # Same ordering for the hue levels so the colour gradient follows
            # the rank order of the bars.
            hue_order=top_rejected,
            legend=False
        )
        plt.title('Most Released Pokémon (The "Trash" List)')
        plt.show()

In [None]:
# SAVE CLEAN DATASET

if not df_final.empty:
    cols_to_keep = [
        'species',
        'level',
        'is_shiny',
        'iv_percentage',
        'minutes_owned',
        'world',
        'biome'
    ]
    # Add IV columns for detail
    cols_to_keep.extend([col for col in df_final.columns if col.startswith('ivs.')])

    # Add server columns (the substring match also keeps the raw 'server_id' column)
    cols_to_keep.extend([col for col in df_final.columns if 'server_' in col])

    # De-duplicate while preserving order, then keep only columns that exist:
    # several of them are created conditionally upstream and selecting a
    # missing label would raise a KeyError.
    cols_to_keep = list(dict.fromkeys(cols_to_keep))
    missing_cols = [col for col in cols_to_keep if col not in df_final.columns]
    if missing_cols:
        print(f"[!] Columns not found, skipped in export: {missing_cols}")
    export_cols = [col for col in cols_to_keep if col in df_final.columns]

    # Ignore repeated column labels (keeps the first occurrence) so each
    # exported column appears exactly once.
    df_unique = df_final.loc[:, ~df_final.columns.duplicated()]
    df_export = df_unique[export_cols].copy()

    # Make sure the destination folder exists before writing.
    output_dir = os.path.dirname(CLEAN_DATA_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_export.to_csv(CLEAN_DATA_PATH, index=False)
    print(f"[+] Clean released dataset saved to: {CLEAN_DATA_PATH}")
    display(df_export.head())