In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

# CONFIG
# Raw transactions CSV consumed by the load step, and the destination of the
# cleaned dataset written by the export step.
RAW_DATA_PATH = "../data/raw/dataset_GTS_TRANSACTION_raw.csv"
CLEAN_DATA_PATH = "../data/dataset_economy_clean.csv"

# Pandas settings: no cap on the number of displayed columns, and a
# 1000-character display width.
pd.options.display.max_columns = None
pd.options.display.width = 1000

print("[+] Configuration loaded.")

In [None]:
# LOAD DATA
# Start with an empty expanded frame. It is only replaced once the raw data
# has been loaded and expanded, so the `df_final.empty` guards in the later
# cells skip cleanly (instead of raising NameError) when the raw file is
# missing or contains no rows.
df_final = pd.DataFrame()

if os.path.exists(RAW_DATA_PATH):
    df = pd.read_csv(RAW_DATA_PATH)
    print(f"[+] Loaded {len(df)} transactions.")
else:
    print(f"[-] File not found: {RAW_DATA_PATH}. Run download_data.ipynb first.")
    df = pd.DataFrame()

if not df.empty:
    # JSON EXPANSION
    def clean_json(x):
        """Coerce one raw context cell into a dict.

        Args:
            x: Either an already-decoded dict or a value to decode as JSON
                (normally a string; missing cells such as None/NaN are
                tolerated).

        Returns:
            dict: ``x`` itself when it is already a dict, the decoded JSON
            object when ``x`` is a JSON object string, and ``{}`` for anything
            that cannot be decoded or decodes to a non-object value.
        """
        if isinstance(x, dict):
            return x
        try:
            parsed = json.loads(x)
        except (TypeError, ValueError):
            # TypeError: x is not str/bytes (None, NaN, ...).
            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
            return {}
        # Valid JSON that is not an object (null, list, number, "NaN") cannot
        # be flattened into columns, so treat it like a missing context.
        return parsed if isinstance(parsed, dict) else {}


    # Decode every context_data cell into a dict, then flatten those dicts
    # into a separate frame of context columns.
    df['context_data'] = df['context_data'].apply(clean_json)
    df_context = pd.json_normalize(df['context_data'])

    # Context keys that share a name with an existing transaction column are
    # discarded so the join below sees no overlapping column names.
    cols_to_drop = df_context.columns.intersection(df.columns)
    if len(cols_to_drop) > 0:
        df_context = df_context.drop(columns=cols_to_drop)

    # Attach the context columns by index; the raw JSON column is dropped
    # afterwards because its content now lives in the expanded columns.
    df_final = df.join(df_context)
    df_final = df_final.drop(columns=['context_data'])

    print("[+] JSON expanded successfully.")
    display(df_final.head(3))

In [None]:
# FEATURE ENGINEERING (Economy Logic)

if not df_final.empty:
    # A. Parse Description (Pokemon vs Items)
    # Examples: "Honedge Lvl1", "Diamond", "Iron Ingot x64" (Hypothetical)

    def parse_description(row):
        """Split a transaction description into product name, level and quantity.

        Args:
            row: Mapping-like record exposing ``.get`` (a DataFrame row or a
                dict); the 'description' and 'itemType' keys are read.

        Returns:
            pd.Series: ``[product_name, level, quantity]``. When itemType is
            'POKEMON' and the description matches "<Species> Lvl<N>",
            product_name is the species and level is N. Otherwise product_name
            is the whole description and level is 0. quantity is always 1
            because item stack sizes are not parsed.
        """
        raw_desc = row.get('description', '')
        # A missing description (None/NaN) must not turn into the literal text
        # 'None'/'nan'; fall back to the same empty default as an absent key.
        if pd.api.types.is_scalar(raw_desc) and pd.isna(raw_desc):
            desc = ''
        else:
            desc = str(raw_desc)
        itype = row.get('itemType', 'ITEM')

        product_name = desc
        level = 0
        # Item quantities (e.g. a hypothetical "Iron Ingot x64" format) are not
        # parsed: how GTS formats item stacks is unknown, so a plain name and a
        # single unit are assumed for now.
        quantity = 1

        if itype == 'POKEMON':
            # Regex for "Species LvlX"
            match = re.search(r'(.+)\s+Lvl(\d+)', desc)
            if match:
                product_name = match.group(1).strip()
                level = int(match.group(2))

        return pd.Series([product_name, level, quantity])


    print("[INFO] Parsing product descriptions...")
    df_final[['product_name', 'level', 'quantity']] = df_final.apply(parse_description, axis=1)

    # B. Time to Sell (Hours)
    # listingDurationMs tells us how long it sat on the market
    if 'listingDurationMs' in df_final.columns:
        # milliseconds -> hours
        df_final['hours_on_market'] = df_final['listingDurationMs'] / (1000 * 60 * 60)
    else:
        # Keep the column present (all NaN) so the preview below does not
        # raise KeyError when the duration is absent.
        df_final['hours_on_market'] = float('nan')

    # C. Server ID (One-Hot)
    if 'server_id' in df_final.columns:
        server_dummies = pd.get_dummies(df_final['server_id'], prefix='server')
        # Drop dummy columns left over from a previous run of this cell so
        # re-running it does not append duplicate 'server_*' columns. The raw
        # 'server_id' column itself is never dropped.
        stale_cols = [
            col for col in server_dummies.columns
            if col in df_final.columns and col != 'server_id'
        ]
        df_final = pd.concat([df_final.drop(columns=stale_cols), server_dummies], axis=1)

    print("[+] Feature Engineering complete.")
    # Preview only the columns that actually exist in this dataset.
    preview_cols = [
        col for col in ['itemType', 'product_name', 'level', 'price', 'hours_on_market']
        if col in df_final.columns
    ]
    display(df_final[preview_cols].head())

In [None]:
# DATA ANALYSIS & VISUALIZATION
# Draws up to three charts from df_final: average price of the ten priciest
# Pokémon species, price against level for Pokémon sales, and the number of
# transactions per server.

if not df_final.empty:
    # Filter only Pokemon for clearer graphs
    pokemon_sales = df_final[df_final['itemType'] == 'POKEMON']

    if not pokemon_sales.empty:
        # Graph 1: Top 10 Most Expensive Species (Average)
        avg_prices = pokemon_sales.groupby('product_name')['price'].mean().sort_values(ascending=False).head(10)

        plt.figure(figsize=(10, 5))
        # hue mirrors the y variable only so the palette applies per bar;
        # the redundant legend is suppressed.
        sns.barplot(x=avg_prices.values, y=avg_prices.index, palette='viridis', hue=avg_prices.index, legend=False)
        plt.title('Top 10 Most Expensive Pokémon (Avg Price)')
        plt.xlabel('Price')
        plt.show()

        # Graph 2: Price vs Level (Scatter plot)
        plt.figure(figsize=(8, 5))
        sns.scatterplot(x='level', y='price', data=pokemon_sales, hue='product_name', legend=False)
        plt.title('Price Correlation with Level')
        plt.show()

    # Graph 3: Sales Volume by Server
    # Same guard as the feature-engineering step: skip the chart when the
    # dataset carries no server_id column instead of failing inside seaborn.
    if 'server_id' in df_final.columns:
        plt.figure(figsize=(6, 4))
        sns.countplot(x='server_id', data=df_final, palette='magma', hue='server_id', legend=False)
        plt.title('Transactions Volume by Server')
        plt.show()
    else:
        print("[-] Column 'server_id' not found; skipping server volume chart.")

In [None]:
# SAVE CLEAN DATASET
# Writes the selected columns of df_final to CLEAN_DATA_PATH as CSV.
# NOTE(review): sellerUuid / buyerUuid are exported as-is — confirm that
# sharing these identifiers is acceptable.

if not df_final.empty:
    cols_to_keep = [
        'itemType',
        'product_name',
        'level',
        'price',
        'hours_on_market',
        'sellerUuid',
        'buyerUuid',
        'world',
        'biome'
    ]
    # Add server columns: every column whose name starts with 'server_'
    # (prefix match rather than substring match), each listed once.
    for col in df_final.columns:
        if str(col).startswith('server_') and col not in cols_to_keep:
            cols_to_keep.append(col)

    # Some columns are optional (e.g. hours_on_market only exists when the
    # listing duration was available); skip absent ones with a warning rather
    # than aborting the export with a KeyError.
    missing_cols = [col for col in cols_to_keep if col not in df_final.columns]
    if missing_cols:
        print(f"[!] Columns not found, skipped in export: {missing_cols}")
        cols_to_keep = [col for col in cols_to_keep if col in df_final.columns]

    df_export = df_final[cols_to_keep].copy()

    # Make sure the destination folder exists before writing.
    output_dir = os.path.dirname(CLEAN_DATA_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_export.to_csv(CLEAN_DATA_PATH, index=False)
    print(f"[+] Clean economy dataset saved to: {CLEAN_DATA_PATH}")
    display(df_export.head())