In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import os

# CONFIG
# Input: raw death events CSV. Output: cleaned dataset written by the last cell.
RAW_DATA_PATH = "../data/raw/dataset_PLAYER_DEATH_raw.csv"
CLEAN_DATA_PATH = "../data/dataset_deaths_clean.csv"

# Pandas settings: never truncate the column list, and allow wide console output.
for _option, _value in (
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

print("[+] Configuration loaded.")

In [None]:
# LOAD DATA
# Later cells branch on `df_final.empty`. Define it up front so that a missing
# or empty raw file makes them skip cleanly instead of raising NameError, and
# so that re-running this cell never leaves a stale `df_final` behind.
df_final = pd.DataFrame()

if os.path.exists(RAW_DATA_PATH):
    df = pd.read_csv(RAW_DATA_PATH)
    print(f"[+] Loaded {len(df)} death events.")
else:
    print(f"[-] File not found: {RAW_DATA_PATH}. Run download_data.ipynb first.")
    # An empty frame keeps the `if not df.empty:` guard below working.
    df = pd.DataFrame()

if not df.empty:
    # 2. JSON EXPANSION
    def clean_json(x):
        """Coerce one raw `context_data` cell into a dict.

        Args:
            x: A dict (returned unchanged) or a JSON document as str/bytes.

        Returns:
            The parsed dict, or an empty dict when `x` cannot be parsed
            (malformed JSON, None/NaN or any other non-string value) or when
            the parsed JSON is not an object (e.g. `null`, a list, a number).
        """
        if isinstance(x, dict):
            return x
        try:
            parsed = json.loads(x)
        except (TypeError, ValueError):
            # TypeError: x is not str/bytes/bytearray (e.g. a float NaN).
            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
            # Deliberately narrower than a bare `except`, which would also
            # swallow KeyboardInterrupt and SystemExit.
            return {}
        # Callers flatten the result into columns, which only makes sense for
        # JSON objects; treat any other valid JSON value as "no context".
        return parsed if isinstance(parsed, dict) else {}


    # Parse every raw context payload into a dict, then flatten the dicts into
    # one column per key.
    df['context_data'] = df['context_data'].apply(clean_json)
    df_context = pd.json_normalize(df['context_data'])

    # Columns that already exist on the event frame take precedence: discard
    # their duplicates coming from the JSON payload. Dropping an empty
    # selection is a no-op, so no emptiness check is needed.
    cols_to_drop = df_context.columns.intersection(df.columns)
    df_context = df_context.drop(columns=cols_to_drop)

    # Replace the raw JSON column with its expanded columns (index-aligned join).
    df_final = df.drop(columns=['context_data']).join(df_context)

    print("[+] JSON expanded successfully.")
    display(df_final.head(3))

In [None]:
# FEATURE ENGINEERING (Death Analysis)

if not df_final.empty:
    # A. Clean Death Cause
    # Sometimes causes come as "death.attack.mob", sometimes "lava".
    # We want to group them roughly.
    def categorize_death(cause):
        """Map a raw death cause onto a coarse category label.

        The cause is stringified and lower-cased, then checked against the
        keyword groups below in order; the first group with a keyword that
        occurs as a substring decides the label.

        Args:
            cause: Raw cause value; any type, it is converted with `str()`.

        Returns:
            One of 'Gravity', 'PvE', 'PvP', 'Fire', 'Drowning', 'Hunger',
            'Void', 'Magic', or 'Other' when no keyword matches.
        """
        text = str(cause).lower()

        # Order matters: earlier groups win when several keywords match.
        keyword_groups = (
            (('fall', 'kinetic'), 'Gravity'),
            (('mob', 'monster', 'arrow'), 'PvE'),
            (('player',), 'PvP'),
            (('lava', 'fire', 'burn'), 'Fire'),
            (('drown',), 'Drowning'),
            (('starve',), 'Hunger'),
            (('void',), 'Void'),
            (('magic',), 'Magic'),
        )
        for keywords, label in keyword_groups:
            if any(keyword in text for keyword in keywords):
                return label
        return 'Other'


    if 'cause' in df_final.columns:
        print("[INFO] Categorizing death causes...")
        df_final['death_category'] = df_final['cause'].apply(categorize_death)

    # B. Server ID (One-Hot)
    if 'server_id' in df_final.columns:
        server_dummies = pd.get_dummies(df_final['server_id'], prefix='server')
        # Re-running this cell must not append a second copy of the dummy
        # columns: drop any left over from a previous run first. The raw
        # `server_id` column is never dropped, even if a dummy shares its name.
        stale_dummies = (
            server_dummies.columns
            .intersection(df_final.columns)
            .difference(['server_id'])
        )
        df_final = pd.concat([df_final.drop(columns=stale_dummies), server_dummies], axis=1)

    # C. High Level Death? (Flag)
    # Checks if the player lost a lot of XP (Level > 30)
    if 'level' in df_final.columns:
        df_final['is_high_level'] = df_final['level'] > 30

    print("[+] Feature Engineering complete.")
    # Every feature above is optional (guarded by a column check), so preview
    # only the columns that actually exist instead of raising KeyError.
    preview_cols = ['cause', 'death_category', 'level', 'is_high_level', 'biome']
    display(df_final[[col for col in preview_cols if col in df_final.columns]].head())

In [None]:
# DATA ANALYSIS & VISUALIZATION
# Each graph is drawn only when the column it needs is present, so a dataset
# lacking one field skips that graph instead of aborting the whole cell.

if not df_final.empty:
    # Graph 1: Top Causes of Death
    # `death_category` is only created when the data has a `cause` column.
    if 'death_category' in df_final.columns:
        plt.figure(figsize=(8, 5))
        sns.countplot(x='death_category', data=df_final, palette='Reds_r', hue='death_category', legend=False)
        plt.title('Main Causes of Death')
        plt.show()

    # Graph 2: Deadliest Biomes (Top 5)
    if 'biome' in df_final.columns:
        plt.figure(figsize=(10, 5))
        # value_counts() sorts by frequency, so this index is already ranked.
        top_biomes = df_final['biome'].value_counts().head(5).index
        # Pass the ranking as the bar order (and matching hue order) so the
        # chart reads from most to least deadly rather than in row order.
        sns.countplot(y='biome', data=df_final[df_final['biome'].isin(top_biomes)], palette='dark:salmon', hue='biome',
                      order=list(top_biomes), hue_order=list(top_biomes), legend=False)
        plt.title('Top 5 Deadliest Biomes')
        plt.show()

    # Graph 3: Level distribution
    if 'level' in df_final.columns:
        plt.figure(figsize=(6, 4))
        sns.histplot(df_final['level'], bins=20, kde=True, color='red')
        plt.title('Player Level at Time of Death')
        plt.xlabel('XP Level')
        plt.show()

In [None]:
# SAVE CLEAN DATASET
# Writes the selected feature columns of `df_final` to CLEAN_DATA_PATH as CSV
# (without the index). Nothing is written when `df_final` is empty.

if not df_final.empty:
    wanted_cols = [
        'cause',
        'death_category',
        'level',
        'world',
        'biome',
        'is_high_level'
    ]
    # Add server columns
    # NOTE(review): this substring match also selects the raw `server_id`
    # column, not only the one-hot `server_*` dummies - confirm that is intended.
    wanted_cols.extend([col for col in df_final.columns if 'server_' in col])

    # Several of these columns are created conditionally upstream; export the
    # ones that exist and report the rest instead of failing with KeyError.
    cols_to_keep = [col for col in wanted_cols if col in df_final.columns]
    missing_cols = [col for col in wanted_cols if col not in df_final.columns]
    if missing_cols:
        print(f"[-] Columns not available, skipped in export: {missing_cols}")

    df_export = df_final[cols_to_keep].copy()

    df_export.to_csv(CLEAN_DATA_PATH, index=False)
    print(f"[+] Clean deaths dataset saved to: {CLEAN_DATA_PATH}")
    display(df_export.head())