In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os

# Pandas display options: no cap on the number of columns shown and a wide
# output width for DataFrame previews.
for _option, _value in {'display.max_columns': None, 'display.width': 1000}.items():
    pd.set_option(_option, _value)

# CONFIG
# Input (raw snapshot export) and output (cleaned dataset) locations,
# both given as relative paths.
CLEAN_DATA_PATH = "../data/dataset_snapshots_clean.csv"
RAW_DATA_PATH = "../data/raw/dataset_SESSION_SNAPSHOT_raw.csv"

print("[+] Configuration loaded.")

In [None]:
# LOAD DATA
# Always bind df_final (possibly empty) so the `df_final.empty` guards in the
# later cells skip cleanly instead of raising NameError when the raw file is
# missing.
df_final = pd.DataFrame()

if os.path.exists(RAW_DATA_PATH):
    df = pd.read_csv(RAW_DATA_PATH)
    print(f"[+] Loaded {len(df)} snapshots.")
else:
    print(f"[-] File not found: {RAW_DATA_PATH}. Run download_data.ipynb first.")
    df = pd.DataFrame()


def clean_json(x):
    """Return the context payload of one row as a dict.

    Args:
        x: A dict (returned unchanged) or a JSON document as text.

    Returns:
        The decoded dict, or ``{}`` when ``x`` cannot be decoded (for example
        NaN for a missing cell, or malformed JSON) or when it decodes to
        something other than a JSON object.
    """
    if isinstance(x, dict):
        return x
    try:
        parsed = json.loads(x)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes (NaN, None, ...).
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        return {}
    # null, lists and scalars are valid JSON but cannot be expanded into columns.
    return parsed if isinstance(parsed, dict) else {}


if not df.empty:
    if 'context_data' in df.columns:
        # JSON EXPANSION
        context_records = df['context_data'].apply(clean_json)
        df_context = pd.json_normalize(context_records.tolist())
        # Give the expanded frame df's index so the join below pairs each
        # snapshot with its own context row explicitly.
        df_context.index = df.index

        # Remove duplicated columns (world, timestamp, playerUuid)
        cols_to_drop = df_context.columns.intersection(df.columns)
        df_context = df_context.drop(columns=cols_to_drop)

        # Join
        df_final = df.join(df_context).drop(columns=['context_data'])
        print("[+] JSON expanded successfully.")
    else:
        # Nothing to expand; keep the raw columns so later cells can still run.
        print("[-] Column 'context_data' not found; skipping JSON expansion.")
        df_final = df.copy()

    display(df_final.head(3))

In [None]:
# FEATURE ENGINEERING (Player Behavior)

# Minecraft tracks distance in cm. 100,000 cm = 1 km.
CM_PER_KM = 100_000


def count_biomes(x):
    """Count the distinct biomes in one `recentBiomes` cell.

    Args:
        x: Either a collection of biome names or the string representation
           of one, e.g. "['plains', 'desert']".

    Returns:
        Number of unique entries, or 0 when the value cannot be interpreted
        (unparseable text, NaN, non-iterable or unhashable content).
    """
    try:
        # Strings are parsed with literal_eval (literals only, no code execution).
        biomes = ast.literal_eval(x) if isinstance(x, str) else x
        return len(set(biomes))  # Unique biomes
    except (ValueError, SyntaxError, TypeError):
        return 0


if not df_final.empty:
    # A. Convert CM to KM (Human readable)
    dist_cols = ['totalWalkedCm', 'totalSprintedCm', 'totalFlownCm', 'totalDistanceCm']
    for col in dist_cols:
        if col in df_final.columns:
            df_final[col.replace('Cm', '_km')] = df_final[col] / CM_PER_KM

    # B. Exploration Diversity (How many biomes did they visit recently?)
    if 'recentBiomes' in df_final.columns:
        print("[INFO] calculating biome diversity...")
        df_final['biomes_visited_count'] = df_final['recentBiomes'].apply(count_biomes)

    # C. Playstyle Ratios
    # Fly Ratio: How much of their movement is flying?
    if 'totalDistance_km' in df_final.columns and 'totalFlown_km' in df_final.columns:
        total_km = df_final['totalDistance_km']
        # Vectorized division; rows with zero or missing total distance get 0
        # instead of inf/NaN (avoids division by zero).
        df_final['fly_ratio'] = (df_final['totalFlown_km'] / total_km).where(total_km > 0, 0)

    # D. Server ID (One-Hot)
    if 'server_id' in df_final.columns:
        server_dummies = pd.get_dummies(df_final['server_id'], prefix='server')
        # Drop dummy columns left over from a previous run of this cell so
        # re-running does not create duplicate column names. The source
        # 'server_id' column itself is never dropped.
        stale_cols = [
            col for col in server_dummies.columns
            if col in df_final.columns and col != 'server_id'
        ]
        df_final = pd.concat([df_final.drop(columns=stale_cols), server_dummies], axis=1)

    print("[+] Feature Engineering complete.")
    # Every feature above is optional, so preview only the columns that exist.
    preview_cols = [
        col for col in ['totalDistance_km', 'fly_ratio', 'biomes_visited_count', 'world']
        if col in df_final.columns
    ]
    display(df_final[preview_cols].head())

In [None]:
# DATA ANALYSIS & VISUALIZATION

if not df_final.empty:
    # Graph 1: Movement Type Distribution
    # Are players walking or flying more?
    # Each movement column keeps a fixed colour, whichever columns are present.
    movement_colors = {
        'totalWalked_km': '#ff9999',
        'totalSprinted_km': '#66b3ff',
        'totalFlown_km': '#99ff99',
    }
    if 'totalWalked_km' in df_final.columns and 'totalFlown_km' in df_final.columns:
        # 'totalSprinted_km' is optional: include it only when it exists
        # instead of failing with a KeyError.
        movement_cols = [col for col in movement_colors if col in df_final.columns]
        avg_stats = df_final[movement_cols].mean().dropna()

        # A pie chart needs a positive total; skip it when nothing was recorded.
        if avg_stats.sum() > 0:
            fig, ax = plt.subplots(figsize=(6, 6))
            ax.pie(
                avg_stats,
                labels=avg_stats.index,
                autopct='%1.1f%%',
                colors=[movement_colors[col] for col in avg_stats.index],
            )
            ax.set_title('Average Movement Distribution (Server-wide)')
            plt.show()
        else:
            print("[-] No movement distance recorded; skipping movement distribution chart.")

    # Graph 2: Exploration vs World
    # Do players explore more in the Overworld or Resource world?
    if 'world' in df_final.columns and 'biomes_visited_count' in df_final.columns:
        fig, ax = plt.subplots(figsize=(10, 5))
        # hue mirrors x (with the legend hidden) so the palette is applied per world.
        sns.boxplot(
            x='world', y='biomes_visited_count', data=df_final,
            palette='Set3', hue='world', legend=False, ax=ax,
        )
        ax.set_title('Biome Exploration per World')
        ax.tick_params(axis='x', rotation=45)
        plt.show()

In [None]:
# SAVE CLEAN DATASET

if not df_final.empty:
    wanted_cols = [
        'totalDistance_km',
        'totalWalked_km',
        'totalFlown_km',
        'fly_ratio',
        'biomes_visited_count',
        'world'
    ]
    # Add server columns (any column whose name contains 'server_'; this also
    # matches the original 'server_id' column, not only the one-hot columns).
    wanted_cols.extend(
        col for col in df_final.columns
        if isinstance(col, str) and 'server_' in col
    )

    # The engineered features are optional upstream, so export only the
    # columns that actually exist and report the ones that are missing.
    missing_cols = [col for col in wanted_cols if col not in df_final.columns]
    if missing_cols:
        print(f"[-] Columns not available for export: {missing_cols}")
    cols_to_keep = [col for col in wanted_cols if col in df_final.columns]

    if cols_to_keep:
        df_export = df_final[cols_to_keep].copy()

        # Make sure the target directory exists before writing.
        out_dir = os.path.dirname(CLEAN_DATA_PATH)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        df_export.to_csv(CLEAN_DATA_PATH, index=False)
        print(f"[+] Clean snapshot dataset saved to: {CLEAN_DATA_PATH}")
        display(df_export.head())
    else:
        print("[-] None of the expected columns are present; nothing exported.")