In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Pandas display options: no cap on the number of printed columns, wide console output
pd.options.display.max_columns = None
pd.options.display.width = 1000

# CONFIG
# Output: one row per matched login/logout session
CLEAN_DATA_PATH = "../data/dataset_sessions_lifecycle_clean.csv"
# Inputs: raw event exports, one file per event type
LOGOUT_PATH = "../data/raw/dataset_SESSION_LOGOUT_raw.csv"
LOGIN_PATH = "../data/raw/dataset_SESSION_LOGIN_raw.csv"

print("[+] Configuration loaded.")

In [None]:
# LOAD AND COMBINE DATA
def load_data(path: str, event_type: str) -> pd.DataFrame:
    """Read one raw event CSV and tag every row with its event type.

    Args:
        path: Location of the CSV file to read.
        event_type: Label written to a new ``event_type`` column on every row
            (the notebook passes ``'LOGIN'`` or ``'LOGOUT'``).

    Returns:
        The CSV contents with the extra ``event_type`` column. An empty
        DataFrame (no rows, no columns) is returned when ``path`` is not an
        existing regular file or when the file is completely empty, so
        callers can treat "no data" uniformly via ``.empty``.
    """
    # isfile instead of exists: a directory at `path` counts as "no data"
    # rather than crashing inside read_csv.
    if not os.path.isfile(path):
        return pd.DataFrame()

    try:
        df = pd.read_csv(path)
    except pd.errors.EmptyDataError:
        # Zero-byte file: there is no header to parse, so report "no data".
        return pd.DataFrame()

    df['event_type'] = event_type
    return df

df_login = load_data(LOGIN_PATH, 'LOGIN')
df_logout = load_data(LOGOUT_PATH, 'LOGOUT')

# Combine both into a single timeline. Frames without rows (e.g. a missing
# source file) are left out so pd.concat never merges an empty placeholder.
event_frames = [frame for frame in (df_login, df_logout) if not frame.empty]
df_combined = pd.concat(event_frames, ignore_index=True) if event_frames else pd.DataFrame()

if not df_combined.empty:
    print(f"[+] Loaded {len(df_login)} logins and {len(df_logout)} logouts.")

    # Fail early with a readable message: without these columns the sort below
    # (and the session matching in the next cell) cannot work.
    missing_columns = [col for col in ('player_uuid', 'timestamp') if col not in df_combined.columns]
    if missing_columns:
        raise KeyError(f"Session data is missing required column(s): {missing_columns}")

    # No `unit='ms'` here: per the original note the timestamps arrive as
    # ISO-format strings, not epoch numbers.
    df_combined['dt'] = pd.to_datetime(df_combined['timestamp'])

    # Sort by Player and Time (crucial for matching LOGIN -> LOGOUT pairs)
    df_combined = df_combined.sort_values(by=['player_uuid', 'dt'])

    display(df_combined[['player_uuid', 'event_type', 'dt']].head(6))
else:
    print("[-] No session data found. Run the updated downloader first.")

In [None]:
# CALCULATE SESSION DURATION (The Logic)
# We need to find pairs: LOGIN -> LOGOUT
#
# Per player, events are walked in time order. A LOGIN remembers its timestamp
# (a later LOGIN without a LOGOUT in between replaces it); a LOGOUT closes the
# remembered LOGIN. A LOGOUT with no open LOGIN is ignored.

# Anti-bug filter: only durations strictly between 0 minutes and 24 hours are kept.
MAX_SESSION_MINUTES = 24 * 60
SESSION_COLUMNS = ['player_uuid', 'login_ts', 'logout_ts', 'duration_minutes', 'hour_of_day', 'day_of_week']

session_list = []

# Defined up front so the analysis and save cells can test `df_sessions.empty`
# even when no raw data was loaded (otherwise they fail with a NameError).
df_sessions = pd.DataFrame(columns=SESSION_COLUMNS)

if not df_combined.empty:
    # Group by player
    for player, group in df_combined.groupby('player_uuid'):
        group = group.sort_values('dt')

        # Timestamp of the currently open LOGIN, or None when no session is open
        login_dt = None

        # Only these two columns are needed, so iterate them directly
        # instead of materialising a full row Series per event.
        for event_type, event_dt in zip(group['event_type'], group['dt']):
            if event_type == 'LOGIN':
                login_dt = event_dt

            elif event_type == 'LOGOUT' and login_dt is not None:
                logout_dt = event_dt

                # Timedelta -> total minutes
                duration_minutes = (logout_dt - login_dt).total_seconds() / 60

                if 0 < duration_minutes < MAX_SESSION_MINUTES:
                    session_list.append({
                        'player_uuid': player,
                        'login_ts': login_dt,
                        'logout_ts': logout_dt,
                        'duration_minutes': duration_minutes,
                        'hour_of_day': login_dt.hour,
                        'day_of_week': login_dt.day_name()
                    })

                # Reset to search for the next session (also after a rejected pair)
                login_dt = None

    # Explicit columns keep the schema stable even when no pair passed the filter
    df_sessions = pd.DataFrame(session_list, columns=SESSION_COLUMNS)
    print(f"[+] Extracted {len(df_sessions)} valid sessions.")
    display(df_sessions.head())

In [None]:
# DATA ANALYSIS & VISUALIZATION

if not df_sessions.empty:
    # Graph 1: Session duration distribution
    view_limit_minutes = 180  # Limit the x-axis to 3 hours for better view
    fig, ax = plt.subplots(figsize=(8, 5))
    # Fixed bin width instead of `bins=30`: 30 bins over the full data range
    # (up to 24 h) would leave only a few coarse bars inside the 3-hour window.
    sns.histplot(df_sessions['duration_minutes'], binwidth=view_limit_minutes / 30, kde=True, color='teal', ax=ax)
    ax.set_title('Player Session Duration (Minutes)')
    ax.set_xlabel('Session duration (minutes)')
    ax.set_ylabel('Sessions')
    ax.set_xlim(0, view_limit_minutes)
    # The mean is computed over all sessions, including those beyond the visible window
    ax.axvline(df_sessions['duration_minutes'].mean(), color='r', linestyle='--', label='Average')
    ax.legend()
    plt.show()

    # Graph 2: Peak Hours (Heatmap)
    # Pivot table to count sessions per login Day and login Hour
    pivot = df_sessions.pivot_table(index='day_of_week', columns='hour_of_day', values='player_uuid', aggfunc='count',
                                    fill_value=0)

    # Reorder days and force a complete 7 x 24 grid. Days or hours without any
    # session become 0 instead of NaN rows / missing columns in the heatmap.
    days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    pivot = pivot.reindex(index=days_order, columns=range(24), fill_value=0)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.heatmap(pivot, cmap='YlOrRd', annot=True, fmt='g', ax=ax)
    ax.set_title('Player Activity Heatmap (Logins)')
    ax.set_xlabel('Hour of day (login time)')
    ax.set_ylabel('Day of week')
    plt.show()

In [None]:
# SAVE CLEAN DATASET

if not df_sessions.empty:
    # Create the destination folder on demand so to_csv does not fail when
    # CLEAN_DATA_PATH points at a directory that does not exist yet.
    output_dir = os.path.dirname(CLEAN_DATA_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_sessions.to_csv(CLEAN_DATA_PATH, index=False)
    print(f"[+] Clean session lifecycle dataset saved to: {CLEAN_DATA_PATH}")
    display(df_sessions.head())
else:
    # Say so explicitly: a file left over from an earlier run is NOT refreshed.
    print("[-] No valid sessions extracted; nothing was saved.")