NOTEBOOK SUMMARY

The first 3 blocks of code show the preprocessing and model running used for the final submissions.
Subsequent code shows discarded approaches, which are touched upon in the report.

LIBRARY LOADING

This script loads all required libraries 
It sets up file paths for the dataset, then loads both the training and test .jsonl files line by line into Python lists.
After loading, it reports how many battles were successfully read.
Errors during loading are caught and displayed.

In [None]:
# LIBRARY LOADING
import os
import json
import warnings

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Sklearn
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    StackingClassifier,
    VotingClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.pipeline import make_pipeline

# XGBoost & LightGBM
import xgboost as xgb
import lightgbm as lgb

# TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
tf.random.set_seed(42)

warnings.filterwarnings("ignore")

# --- Define the path to our data ---
COMPETITION_NAME = 'fds-pokemon-battles-prediction-2025'
DATA_PATH = os.path.join('../input', COMPETITION_NAME)

train_file_path = os.path.join(DATA_PATH, 'train.jsonl')
test_file_path = os.path.join(DATA_PATH, 'test.jsonl')
printing = False  # set to True to print a preview of the first battles
train_data = []
test_data = []

try:
    # Each .jsonl file stores one JSON object (one battle) per line.
    # The files are opened as UTF-8 explicitly: without `encoding`, open()
    # falls back to a locale-dependent default that is not UTF-8 everywhere.
    print(f"Loading data from '{train_file_path}'...")
    with open(train_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # skip blank lines instead of failing in json.loads
                train_data.append(json.loads(line))

    print(f"Loading data from '{test_file_path}'...")
    with open(test_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                test_data.append(json.loads(line))

    print(f"Successfully loaded {len(train_data)} battles.\n")
    if printing:
        # Inspect the first few battles
        for i, battle in enumerate(train_data[:5]):  # first 5 battles
            print(f"--- Battle {i+1} ---")
            print("Keys:", list(battle.keys()))

            # Show top-level values
            for key, value in battle.items():
                if key == 'battle_timeline':
                    print(f"{key}:")
                    for turn in value[:3]:  # show first 3 turns
                        print("   ", turn)
                    if len(value) > 3:
                        print("   ... (more turns)")
                else:
                    print(f"{key}: {value}")
            print("\n")

except Exception as e:
    # Notebook-level boundary: report the problem instead of raising.
    # NOTE: train_data / test_data may be left partially filled after an error.
    print("Error loading data:", e)


Loading data from 'C:\Users\ivayl\Documents\GitHub\FDS-Pok-mon-Battle\fds-pokemon-battles-prediction-2025\train.jsonl'...
Loading data from 'C:\Users\ivayl\Documents\GitHub\FDS-Pok-mon-Battle\fds-pokemon-battles-prediction-2025\test.jsonl'...
Successfully loaded 10000 battles.



DATA PREPROCESSING EXPLANATION

This function takes raw battle dictionaries and converts them into a structured DataFrame.
It processes every battle one at a time, and loops through all turns inside each battle.
It extracts turn-level information such as Pokémon names, HP percentages, statuses, boosts, and move details.
It also attaches the overall battle outcome when processing training data.
Each turn becomes a single row in the output table, and all these rows are gathered into a list.
After processing all battles, the function converts the list into a Pandas DataFrame.
Several columns containing categorical values, such as Pokémon names and move types, are one-hot encoded.
The function returns this fully encoded DataFrame.

The script then calls the function twice.
The first call processes the training dataset and the second call processes the test dataset; each call prints the shape of the resulting DataFrame.

In [11]:
def process_battle_data(data, is_train=True):
    """Flatten battles into a one-hot encoded, one-row-per-turn DataFrame.

    Parameters
    ----------
    data : iterable of dict
        Battle records. Each must provide 'battle_id' and 'battle_timeline'
        (a sequence of turn dicts with 'turn', 'p1_pokemon_state' and
        'p2_pokemon_state'); 'player_won' is also required when `is_train`.
    is_train : bool, default True
        When True, the battle outcome is attached to every turn row as the
        integer column 'player_won'.

    Returns
    -------
    pandas.DataFrame
        One row per turn, with the categorical columns (Pokémon names,
        statuses, move names and move types) replaced by one-hot columns,
        including a NaN indicator column for each of them.

    Notes
    -----
    The one-hot columns depend on the categories present in `data`, so two
    calls on different datasets can return different column sets.
    """
    boost_stats = ('atk', 'def', 'spa', 'spd', 'spe')

    all_rows = []
    for battle in tqdm(data, desc="Processing battles"):
        battle_id = battle['battle_id']
        player_won = int(battle['player_won']) if is_train else None

        for turn in battle['battle_timeline']:
            p1_state = turn['p1_pokemon_state']
            p2_state = turn['p2_pokemon_state']
            # `or {}` covers both a missing key and an explicit null value.
            p1_move = turn.get('p1_move_details') or {}
            p2_move = turn.get('p2_move_details') or {}
            p1_boosts = p1_state.get('boosts') or {}
            p2_boosts = p2_state.get('boosts') or {}

            # === Construct turn-level record ===
            row = {
                'battle_id': battle_id,
                'turn': turn['turn'],

                # Pokémon identity
                'p1_pokemon_name': p1_state.get('name', None),
                'p2_pokemon_name': p2_state.get('name', None),

                # HP, status, and effects
                'p1_hp_pct': p1_state.get('hp_pct', np.nan),
                'p2_hp_pct': p2_state.get('hp_pct', np.nan),
                'p1_status': p1_state.get('status', 'nostatus'),
                'p2_status': p2_state.get('status', 'nostatus'),

                # Moves
                'p1_move_name': p1_move.get('name', None),
                'p1_move_type': p1_move.get('type', None),
                'p1_move_power': p1_move.get('base_power', 0),

                'p2_move_name': p2_move.get('name', None),
                'p2_move_type': p2_move.get('type', None),
                'p2_move_power': p2_move.get('base_power', 0),
            }

            # Boosts: all P1 columns first, then all P2 columns.
            for stat in boost_stats:
                row[f'p1_boost_{stat}'] = p1_boosts.get(stat, 0)
            for stat in boost_stats:
                row[f'p2_boost_{stat}'] = p2_boosts.get(stat, 0)

            if is_train:
                row['player_won'] = player_won

            all_rows.append(row)

    # === Create DataFrame ===
    df = pd.DataFrame(all_rows)

    # === One-hot encode categorical features ===
    categorical_cols = [
        'p1_pokemon_name', 'p2_pokemon_name',
        'p1_status', 'p2_status',
        'p1_move_type', 'p2_move_type',
        'p1_move_name', 'p2_move_name'
    ]
    df = pd.get_dummies(df, columns=categorical_cols, dummy_na=True)

    return df


train_df = process_battle_data(train_data, is_train=True)
print("Train DataFrame shape:", train_df.shape)

test_df = process_battle_data(test_data, is_train=False)
# The two calls one-hot encode independently, so a category that occurs in
# only one dataset would give train and test different columns. Align the
# test frame to the train columns (minus the label): categories unseen in
# training are dropped, categories absent from test become all-zero columns.
test_df = test_df.reindex(
    columns=[c for c in train_df.columns if c != 'player_won'],
    fill_value=0,
)
print("Test DataFrame shape:", test_df.shape)


Processing battles:   0%|          | 0/10000 [00:00<?, ?it/s]

Train DataFrame shape: (300000, 187)


Processing battles:   0%|          | 0/5000 [00:00<?, ?it/s]

Test DataFrame shape: (150000, 186)


In [12]:
# =====================================================================
# === BUILD POKEMON DATABASE FROM TRAIN DATA ==========================
# =====================================================================

def build_pokemon_database(battles_data):
    """Collect the team rosters of all battles into one row per Pokémon.

    For every battle, the entries under 'p1_team_details' and
    'p2_team_details' (each defaulting to an empty list) are turned into
    rows holding the battle id plus that side's name, level, base stats and
    up to two types. Columns are prefixed with 'p1_' or 'p2_', so a row
    only fills the columns of its own side.

    Returns a pandas DataFrame (empty when no roster entries exist).
    """
    stat_fields = (
        'level', 'base_hp', 'base_atk', 'base_def',
        'base_spa', 'base_spd', 'base_spe',
    )

    records = []
    for battle in battles_data:
        bid = battle['battle_id']
        rosters = (
            ('p1', battle.get('p1_team_details', [])),
            ('p2', battle.get('p2_team_details', [])),
        )
        for side, roster in rosters:
            for mon in roster:
                record = {
                    'battle_id': bid,
                    f'{side}_pokemon_name': mon.get('name'),
                }
                for field in stat_fields:
                    record[f'{side}_{field}'] = mon.get(field)

                # Missing type slots are stored as None.
                kinds = mon.get('types', [])
                record[f'{side}_type1'] = kinds[0] if len(kinds) > 0 else None
                record[f'{side}_type2'] = kinds[1] if len(kinds) > 1 else None
                records.append(record)

    return pd.DataFrame(records)

# =====================================================================
# === MERGE P2 TEAM STATS INTO MAIN TRAIN DF ==========================
# =====================================================================

pokemon_db = build_pokemon_database(train_data)

p2_cols = [c for c in pokemon_db.columns if c.startswith('p2')]
# numeric_only=True: the 'p2' columns include string name/type columns,
# which cannot be averaged; only the numeric stats are aggregated per battle.
p2_stats_df = (
    pokemon_db[['battle_id'] + p2_cols]
    .groupby('battle_id')
    .mean(numeric_only=True)
    .reset_index()
)

train_df_extended = train_df.merge(p2_stats_df, on='battle_id', how='left')

# =====================================================================
# === FEATURE SELECTION ================================================
# =====================================================================

feature_cols = [c for c in train_df_extended.columns if c not in ['battle_id', 'turn', 'player_won']]

# =====================================================================
# === SCALE FEATURES ===================================================
# =====================================================================

# NOTE: X_scaled is computed here but the models below are trained on the
# battle-level features (X_combined), not on this turn-level matrix.
X = train_df_extended[feature_cols].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# One label per battle. sort=False keeps the battles in order of first
# appearance, which is the order in which the battle-level feature builders
# emit their rows; the default (sorted keys) would misalign labels and
# features whenever battle ids are not already sorted in the data.
y = train_df_extended.groupby('battle_id', sort=False)['player_won'].first().values

# =====================================================================
# === BATTLE-LEVEL FEATURE ENGINEERING ================================
# =====================================================================

def build_chunk_features(df, feature_cols):
    """Build battle-level features by averaging turn rows over turn windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Turn-level frame with 'battle_id' and 'turn' columns plus every
        column named in `feature_cols`.
    feature_cols : list of str
        Columns to average.

    Returns
    -------
    numpy.ndarray
        One row per battle, in order of first appearance in `df`. Each row
        holds the column means for turns 1-10, then 11-20, then 21-30; a
        window with no turns contributes zeros.
    """
    windows = [(1, 10), (11, 20), (21, 30)]
    n_feats = len(feature_cols)

    out = []
    # A single groupby pass replaces re-filtering the whole frame once per
    # battle; sort=False keeps the battles in order of first appearance.
    for _, battle in df.groupby('battle_id', sort=False):
        turns = battle['turn']
        feats = []
        for start, end in windows:
            subset = battle.loc[(turns >= start) & (turns <= end), feature_cols]
            feats.extend(subset.mean().values if not subset.empty else [0.0] * n_feats)
        out.append(feats)
    return np.array(out)

def build_partial_features(df, hp_cols, atk_cols, def_cols):
    """Build battle-level means of three column groups over four turn windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Turn-level frame with 'battle_id' and 'turn' columns plus every
        column named in the three column lists.
    hp_cols, atk_cols, def_cols : list of str
        Column groups to average, processed in this order.

    Returns
    -------
    numpy.ndarray
        One row per battle, in order of first appearance in `df`. For each
        group in turn, the row holds the group's column means for turns
        1-10, 11-20, 21-25 and 26-30; a window with no turns contributes
        zeros.
    """
    intervals = [(1, 10), (11, 20), (21, 25), (26, 30)]
    groups = [hp_cols, atk_cols, def_cols]

    out = []
    # A single groupby pass replaces re-filtering the whole frame once per
    # battle; sort=False keeps the battles in order of first appearance.
    for _, battle in df.groupby('battle_id', sort=False):
        turns = battle['turn']
        # Slice each window once and reuse it for all three column groups.
        windows = [battle[(turns >= start) & (turns <= end)] for start, end in intervals]
        feats = []
        for cols in groups:
            for subset in windows:
                feats.extend(subset[cols].mean().values if not subset.empty else [0.0] * len(cols))
        out.append(feats)
    return np.array(out)

# Column families, selected by case-insensitive substring match on the name.
hp_cols, atk_cols, def_cols = (
    [c for c in feature_cols if token in c.lower()]
    for token in ('hp', 'atk', 'def')
)

# Battle-level matrix: chunk means followed by the per-family partial means.
X_chunk = build_chunk_features(train_df_extended, feature_cols)
X_partial = build_partial_features(train_df_extended, hp_cols, atk_cols, def_cols)
X_combined = np.concatenate([X_chunk, X_partial], axis=1)

# =====================================================================
# === TRAIN/VAL SPLIT ==================================================
# =====================================================================

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)

# =====================================================================
# === MODELS ===========================================================
# =====================================================================

log_reg = LogisticRegression(max_iter=5000)
rf = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42)
gb = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05)
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.05,
    tree_method="hist",
    eval_metric="logloss",
)
lgbm_model = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.05)

# Display name -> estimator, in the order they are trained and reported.
all_models = {
    "Logistic Regression": log_reg,
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "XGBoost": xgb_model,
    "LightGBM": lgbm_model,
}

# =====================================================================
# === TRAIN & EVALUATE ================================================
# =====================================================================

results = []

for name, model in all_models.items():
    model.fit(X_train, y_train)

    # Accuracy on the training split, to compare against validation.
    y_train_pred = model.predict(X_train)
    train_acc = accuracy_score(y_train, y_train_pred)

    # Validation predictions; ROC-AUC uses the positive-class probability
    # when the estimator exposes one, otherwise the hard labels.
    y_val_pred = model.predict(X_val)
    if hasattr(model, "predict_proba"):
        y_val_prob = model.predict_proba(X_val)[:, 1]
    else:
        y_val_prob = y_val_pred

    val_acc = accuracy_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)
    auc = roc_auc_score(y_val, y_val_prob)

    summary_row = {"Model": name}
    summary_row["Train Accuracy"] = train_acc
    summary_row["Val Accuracy"] = val_acc
    summary_row["F1-Score"] = f1
    summary_row["ROC-AUC"] = auc
    results.append(summary_row)

    print(f"{name} — Train Acc: {train_acc:.3f}, Val Acc: {val_acc:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")

results_df = pd.DataFrame(results)
print("\n=== Model Performance Summary ===")
print(results_df)


Logistic Regression — Train Acc: 0.841, Val Acc: 0.842, F1: 0.843, AUC: 0.894
Random Forest — Train Acc: 0.911, Val Acc: 0.815, F1: 0.818, AUC: 0.877
Gradient Boosting — Train Acc: 0.860, Val Acc: 0.828, F1: 0.830, AUC: 0.896
XGBoost — Train Acc: 1.000, Val Acc: 0.833, F1: 0.834, AUC: 0.895
[LightGBM] [Info] Number of positive: 3988, number of negative: 4012
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026585 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10076
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 542
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498500 -> initscore=-0.006000
[LightGBM] [Info] Start training from score -0.006000
LightGBM — Train Acc: 0.981, Val Acc: 0.829, F1: 0.831, AUC: 0.898

=== Model Performance Summary ===
                 Model  Train Accuracy  Val Accuracy  F1-Score   ROC-AUC
0  Logistic Regression        0.841000  

FINAL PIPELINE SUMMARY

This code builds the full training and inference pipeline for predicting battle outcomes.
It extracts Pokémon base stats and types from each battle and merges them into the main dataset.
It identifies all usable features and creates additional engineered features by averaging turn-level values across multiple battle segments.
These chunk-based and partial-segment features are combined into one final feature matrix.
The data is split into training and validation sets.
Several models are built, including Random Forests, Gradient Boosting, XGBoost, and LightGBM.
Multiple voting ensembles are constructed and trained on the engineered features.
The same processing pipeline is applied to the test set.
Each ensemble outputs a separate CSV file containing the final predictions.

The final model settings were chosen after grid-search experimentation and repeated back-and-forth testing of different feature-processing strategies.

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb
import pandas as pd
import numpy as np

def build_pokemon_database(battles_data):
    """Return one row per rostered Pokémon across all battles.

    Reads 'p1_team_details' and 'p2_team_details' from every battle (a
    missing key is treated as an empty roster) and records the battle id
    together with the Pokémon's name, level, six base stats and up to two
    types. Column names carry the side prefix ('p1_' / 'p2_'), so each row
    fills only the columns of the side it belongs to.
    """
    numeric_fields = [
        'level', 'base_hp', 'base_atk', 'base_def',
        'base_spa', 'base_spd', 'base_spe',
    ]

    def make_row(bid, side, mon):
        # Key order defines the column order of the resulting DataFrame.
        entry = {'battle_id': bid, f'{side}_pokemon_name': mon.get('name')}
        entry.update({f'{side}_{field}': mon.get(field) for field in numeric_fields})
        type_list = mon.get('types', [])
        entry[f'{side}_type1'] = type_list[0] if len(type_list) > 0 else None
        entry[f'{side}_type2'] = type_list[1] if len(type_list) > 1 else None
        return entry

    all_rows = []
    for battle in battles_data:
        bid = battle['battle_id']
        teams = {
            'p1': battle.get('p1_team_details', []),
            'p2': battle.get('p2_team_details', []),
        }
        for side, members in teams.items():
            all_rows.extend(make_row(bid, side, mon) for mon in members)
    return pd.DataFrame(all_rows)

pokemon_db = build_pokemon_database(train_data)
p2_cols = [c for c in pokemon_db.columns if c.startswith('p2')]
# numeric_only=True: the 'p2' columns include string name/type columns,
# which cannot be averaged; only the numeric stats are aggregated per battle.
p2_stats_df = (
    pokemon_db[['battle_id'] + p2_cols]
    .groupby('battle_id')
    .mean(numeric_only=True)
    .reset_index()
)
train_df_extended = train_df.merge(p2_stats_df, on='battle_id', how='left')

feature_cols = [c for c in train_df_extended.columns if c not in ['battle_id', 'turn', 'player_won']]
# NOTE: X_scaled is computed here but the models below are trained on the
# battle-level features (X_combined), not on this turn-level matrix.
X = train_df_extended[feature_cols].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# One label per battle. sort=False keeps the battles in order of first
# appearance, which is the order in which the battle-level feature builders
# emit their rows; the default (sorted keys) would misalign labels and
# features whenever battle ids are not already sorted in the data.
y = train_df_extended.groupby('battle_id', sort=False)['player_won'].first().values

def build_chunk_features(df, feature_cols):
    """Average the feature columns of each battle over three turn windows.

    `df` must contain 'battle_id', 'turn' and every column in
    `feature_cols`. The result is a NumPy array with one row per battle, in
    order of first appearance in `df`; each row concatenates the column
    means for turns 1-10, 11-20 and 21-30, using zeros for a window that
    has no turns.
    """
    windows = [(1, 10), (11, 20), (21, 30)]
    n_feats = len(feature_cols)

    out = []
    # A single groupby pass replaces re-filtering the whole frame once per
    # battle; sort=False keeps the battles in order of first appearance.
    for _, battle in df.groupby('battle_id', sort=False):
        turns = battle['turn']
        feats = []
        for start, end in windows:
            subset = battle.loc[(turns >= start) & (turns <= end), feature_cols]
            feats.extend(subset.mean().values if not subset.empty else [0.0] * n_feats)
        out.append(feats)
    return np.array(out)

def build_partial_features(df, hp_cols, atk_cols, def_cols):
    """Average three column groups of each battle over four turn windows.

    `df` must contain 'battle_id', 'turn' and every column named in the
    three lists. The result is a NumPy array with one row per battle, in
    order of first appearance in `df`. For each group (hp, atk, def, in
    that order) the row holds the group's column means for turns 1-10,
    11-20, 21-25 and 26-30, using zeros for a window that has no turns.
    """
    intervals = [(1, 10), (11, 20), (21, 25), (26, 30)]
    groups = [hp_cols, atk_cols, def_cols]

    out = []
    # A single groupby pass replaces re-filtering the whole frame once per
    # battle; sort=False keeps the battles in order of first appearance.
    for _, battle in df.groupby('battle_id', sort=False):
        turns = battle['turn']
        # Slice each window once and reuse it for all three column groups.
        windows = [battle[(turns >= start) & (turns <= end)] for start, end in intervals]
        feats = []
        for cols in groups:
            for subset in windows:
                feats.extend(subset[cols].mean().values if not subset.empty else [0.0] * len(cols))
        out.append(feats)
    return np.array(out)

# Column families, selected by case-insensitive substring match on the name.
hp_cols, atk_cols, def_cols = (
    [c for c in feature_cols if token in c.lower()]
    for token in ('hp', 'atk', 'def')
)

# Battle-level matrix: chunk means followed by the per-family partial means.
X_chunk = build_chunk_features(train_df_extended, feature_cols)
X_partial = build_partial_features(train_df_extended, hp_cols, atk_cols, def_cols)
X_combined = np.concatenate([X_chunk, X_partial], axis=1)

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)

# Define all models including simple ones
log_reg = LogisticRegression(max_iter=5000)
rf = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42)
gb = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05)
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.05,
    tree_method="hist",
    eval_metric="logloss",
)
lgbm_model = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.05)

# Voting ensembles over the tree-based models (soft and hard voting).
ensemble_soft = VotingClassifier(
    estimators=[("rf", rf), ("gb", gb), ("xgb", xgb_model), ("lgb", lgbm_model)],
    voting="soft",
)
ensemble_hard = VotingClassifier(
    estimators=[("rf", rf), ("gb", gb), ("xgb", xgb_model), ("lgb", lgbm_model)],
    voting="hard",
)
ensemble_rf_xgb = VotingClassifier(
    estimators=[("rf", rf), ("xgb", xgb_model)],
    voting="soft",
)
ensemble_gb_lgb = VotingClassifier(
    estimators=[("gb", gb), ("lgb", lgbm_model)],
    voting="soft",
)

# Name -> estimator; the name is also used as the prefix of the output CSV.
all_models = {
    "Logistic_Regression": log_reg,
    "Random_Forest": rf,
    "Gradient_Boosting": gb,
    "XGBoost": xgb_model,
    "LightGBM": lgbm_model,
    "Ensemble_Soft": ensemble_soft,
    "Ensemble_Hard": ensemble_hard,
    "Ensemble_RF_XGB": ensemble_rf_xgb,
    "Ensemble_GB_LGB": ensemble_gb_lgb,
}

# Train all models and print train/val accuracy
for name, model in all_models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    train_acc = accuracy_score(y_train, y_train_pred)
    val_acc = accuracy_score(y_val, y_val_pred)

    print(f"{name} — Train Acc: {train_acc:.3f}, Val Acc: {val_acc:.3f}")

# Prepare test set
pokemon_db_test = build_pokemon_database(test_data)
p2_cols_test = [c for c in pokemon_db_test.columns if c.startswith('p2')]
# numeric_only=True: the 'p2' columns include string name/type columns,
# which cannot be averaged; only the numeric stats are aggregated per battle.
p2_stats_test = (
    pokemon_db_test[['battle_id'] + p2_cols_test]
    .groupby('battle_id')
    .mean(numeric_only=True)
    .reset_index()
)
test_df_extended = test_df.merge(p2_stats_test, on='battle_id', how='left')

# feature_cols was derived from the training frame. A one-hot category that
# never occurs in the test data has no column here, which would raise a
# KeyError below; add such columns as all zeros ("category not present").
missing_cols = [c for c in feature_cols if c not in test_df_extended.columns]
if missing_cols:
    test_df_extended = test_df_extended.reindex(
        columns=list(test_df_extended.columns) + missing_cols,
        fill_value=0,
    )

X_chunk_test = build_chunk_features(test_df_extended, feature_cols)
X_partial_test = build_partial_features(test_df_extended, hp_cols, atk_cols, def_cols)
X_test_combined = np.concatenate([X_chunk_test, X_partial_test], axis=1)

# Save predictions for all models
for name, model in all_models.items():
    preds = model.predict(X_test_combined)
    out_df = pd.DataFrame({
        # unique() lists battles in order of first appearance, the same
        # order in which the feature builders emit their rows.
        "battle_id": test_df_extended['battle_id'].unique(),
        "prediction": preds
    })
    out_df.to_csv(f"{name}_FINAL.csv", index=False)


Logistic_Regression — Train Acc: 0.841, Val Acc: 0.842
Random_Forest — Train Acc: 0.911, Val Acc: 0.815
Gradient_Boosting — Train Acc: 0.860, Val Acc: 0.829
XGBoost — Train Acc: 1.000, Val Acc: 0.833
[LightGBM] [Info] Number of positive: 3988, number of negative: 4012
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025264 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10076
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 542
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498500 -> initscore=-0.006000
[LightGBM] [Info] Start training from score -0.006000
LightGBM — Train Acc: 0.981, Val Acc: 0.829
[LightGBM] [Info] Number of positive: 3988, number of negative: 4012
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100

GRID SEARCH (SETUP FOR LOGISTIC REGRESSION)

In [None]:

# Candidate settings: L2 (lbfgs) over C x class_weight, plus a few L1 (liblinear) runs.
l2_grid = [
    {"penalty": "l2", "solver": "lbfgs", "C": C, "class_weight": cw}
    for C in [0.01, 0.1, 1, 3, 10]
    for cw in [None, "balanced"]
]
l1_grid = [
    {"penalty": "l1", "solver": "liblinear", "C": C, "class_weight": None}
    for C in [0.1, 1, 3]
]
param_grid = l2_grid + l1_grid

results = []

for params in param_grid:
    # Every key of `params` is a LogisticRegression keyword argument.
    model = LogisticRegression(max_iter=5000, **params)
    model.fit(X_train, y_train)

    train_acc = accuracy_score(y_train, model.predict(X_train))
    val_acc = accuracy_score(y_val, model.predict(X_val))

    print(f"TRY → {params} | Train Acc={train_acc:.3f}, Val Acc={val_acc:.3f}")

    results.append({
        "params": params,
        "val_acc": val_acc,
        "train_acc": train_acc,
        "model": model
    })

# Pick best 3 models (highest validation accuracy first)
results_sorted = sorted(results, key=lambda entry: entry["val_acc"], reverse=True)
top3 = results_sorted[:3]

print("\n=== TOP 3 MODELS ===")
for r in top3:
    print(r["params"], " | Val Acc =", r["val_acc"])

# ============================================================
# === 3) RETRAIN TOP 3 ON FULL TRAIN DATA =====================
# ============================================================

# Each entry is (rank, fitted model, params), with rank starting at 1.
trained_models = []
for rank, r in enumerate(top3, start=1):
    p = r["params"]
    print(f"\nRetraining model #{rank} on FULL TRAIN: {p}")

    m = LogisticRegression(max_iter=5000, **p)
    m.fit(X_combined, y)
    trained_models.append((rank, m, p))

# ============================================================
# === 4) PREPROCESS TEST DATA SAME WAY ========================
# ============================================================

# Must match your existing preprocessing exactly.
# Assuming you already have: test_data, process_battle_data(), etc.

test_df = process_battle_data(test_data, is_train=False)

test_df = test_df.sort_values(["battle_id","turn"])

# The per-battle team stats must come from the TEST battles. p2_stats_df is
# built from train_data, so merging it here would attach the stats of
# unrelated training battles that happen to share a battle_id.
# NOTE(review): assumes build_pokemon_database is the p1/p2 version used by
# the final pipeline — confirm cell execution order.
pokemon_db_test = build_pokemon_database(test_data)
p2_cols_test = [c for c in pokemon_db_test.columns if c.startswith('p2')]
p2_stats_test = (
    pokemon_db_test[['battle_id'] + p2_cols_test]
    .groupby('battle_id')
    .mean(numeric_only=True)  # string name/type columns cannot be averaged
    .reset_index()
)
test_df_extended = test_df.merge(p2_stats_test, on="battle_id", how="left")

# feature_cols was derived from the training frame; a one-hot category that
# never occurs in the test data has no column here. Add such columns as all
# zeros so the feature builders do not fail with a KeyError.
missing_cols = [c for c in feature_cols if c not in test_df_extended.columns]
if missing_cols:
    test_df_extended = test_df_extended.reindex(
        columns=list(test_df_extended.columns) + missing_cols,
        fill_value=0,
    )

X_chunk_test   = build_chunk_features(test_df_extended, feature_cols)
X_partial_test = build_partial_features(test_df_extended, hp_cols, atk_cols, def_cols)
X_test_combined = np.concatenate([X_chunk_test, X_partial_test], axis=1)

# ============================================================
# === 5) RUN TOP 3 MODELS ON TEST & SAVE CSVs ================
# ============================================================

import pandas as pd

for rank, model, params in trained_models:
    preds = model.predict(X_test_combined)

    out_path = f"logreg_best{rank}.csv"
    pd.DataFrame({
        # unique() lists battles in order of first appearance, the same
        # order in which the feature builders emit their rows.
        "battle_id": test_df_extended["battle_id"].unique(),
        "predicted_win": preds
    }).to_csv(out_path, index=False)

    print(f"Saved {out_path} | Params: {params}")


EXTRACTING TEAM 2'S POKEMON STATS

This was done by building a database of Pokémon stats from P1's team details and using it to reconstruct an approximate team for P2 from the Pokémon that P2 uses during the first 30 turns.
Surprisingly, this did not improve performance, so it was not used in the final model.

In [None]:
def build_pokemon_database(battles_data):
    """Build a stats table from the P1 rosters of all battles.

    Every entry of a battle's 'p1_team_details' (an absent key counts as an
    empty roster) becomes one row with the battle id, the Pokémon's name,
    level, six base stats and up to two types. Column names carry no player
    prefix. A tqdm progress bar tracks the battles.

    Returns a pandas DataFrame.
    """
    stat_fields = (
        'level', 'base_hp', 'base_atk', 'base_def',
        'base_spa', 'base_spd', 'base_spe',
    )

    records = []
    for battle in tqdm(battles_data, desc="Building Pokémon database"):
        bid = battle['battle_id']
        for mon in battle.get('p1_team_details', []):
            record = {'battle_id': bid, 'pokemon_name': mon.get('name')}
            for field in stat_fields:
                record[field] = mon.get(field)

            # Missing type slots are stored as None.
            kinds = mon.get('types', [])
            record['type1'] = kinds[0] if len(kinds) > 0 else None
            record['type2'] = kinds[1] if len(kinds) > 1 else None
            records.append(record)

    return pd.DataFrame(records)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def process_battle_data_2(data, pokemon_db, is_train=True, max_moves=30):
    """Build one row per battle describing both teams' Pokémon.

    Parameters
    ----------
    data : iterable of dict
        Battle records with 'battle_id' and 'battle_timeline'; 'player_won'
        is required when `is_train`, and 'p1_team_details' is read when
        present.
    pokemon_db : pandas.DataFrame
        Reference table with a 'pokemon_name' column plus 'level', the six
        'base_*' stats, 'type1' and 'type2'. When a name occurs several
        times, its first row is used.
    is_train : bool, default True
        When True, the 'player_won' label is added to every row.
    max_moves : int, default 30
        Number of timeline turns scanned to find the Pokémon P2 used.

    Returns
    -------
    pandas.DataFrame
        One row per battle. P1 columns ('p1_pokemon{i}_*') come from the
        team details. P2 columns ('p2_pokemon{i}_*') are looked up in
        `pokemon_db` by name, numbered in order of first appearance in the
        timeline; names missing from `pokemon_db` get NaN stats and None
        types.
    """
    stat_fields = (
        'level', 'base_hp', 'base_atk', 'base_def',
        'base_spa', 'base_spd', 'base_spe',
    )

    # Build the name -> first-occurrence-stats lookup once, instead of
    # scanning the whole database for every P2 Pokémon of every battle.
    if 'pokemon_name' in pokemon_db.columns:
        first_rows = pokemon_db.drop_duplicates(subset='pokemon_name', keep='first')
        stats_by_name = first_rows.set_index('pokemon_name').to_dict('index')
    else:
        stats_by_name = {}

    all_rows = []

    for battle in tqdm(data, desc="Processing battles"):
        battle_id = battle['battle_id']
        player_won = int(battle['player_won']) if is_train else None

        row = {'battle_id': battle_id}
        if is_train:
            row['player_won'] = player_won

        # --- P1 team stats ---
        p1_team = battle.get('p1_team_details', [])
        for i, pokemon in enumerate(p1_team):
            prefix = f'p1_pokemon{i+1}'
            row[f'{prefix}_name'] = pokemon.get('name')
            for field in stat_fields:
                row[f'{prefix}_{field}'] = pokemon.get(field)
            types = pokemon.get('types', [])
            row[f'{prefix}_type1'] = types[0] if len(types) > 0 else None
            row[f'{prefix}_type2'] = types[1] if len(types) > 1 else None

        # --- P2 team stats (from the Pokémon used in the first max_moves turns) ---
        # A dict keeps insertion order, so slot i always refers to the i-th
        # distinct Pokémon P2 showed. A set would order the names by string
        # hash, which changes from one interpreter run to the next.
        p2_pokemon_seen = {}
        for moves_scanned, turn in enumerate(battle['battle_timeline']):
            if moves_scanned >= max_moves:
                break
            p2_name = turn['p2_pokemon_state'].get('name')
            if p2_name:
                p2_pokemon_seen.setdefault(p2_name, None)

        for i, name in enumerate(p2_pokemon_seen):
            prefix = f'p2_pokemon{i+1}'
            stats = stats_by_name.get(name)
            row[f'{prefix}_name'] = name
            if stats is not None:
                for field in stat_fields:
                    row[f'{prefix}_{field}'] = stats[field]
                row[f'{prefix}_type1'] = stats['type1']
                row[f'{prefix}_type2'] = stats['type2']
            else:
                # Fill with NaNs if Pokémon not found
                for field in stat_fields:
                    row[f'{prefix}_{field}'] = np.nan
                row[f'{prefix}_type1'] = None
                row[f'{prefix}_type2'] = None

        all_rows.append(row)

    df = pd.DataFrame(all_rows)
    return df

# === Usage ===
# Reference stats table built from the training battles.
pokemon_db = build_pokemon_database(train_data)

# Battle-level team table for the training set (labels included).
train_team_df = process_battle_data_2(
    train_data,
    pokemon_db,
    is_train=True,
)
#test_team_df = process_battle_data_2(test_data, pokemon_db, is_train=False)

# Quick sanity check: first rows and overall shape.
print(train_team_df.head())
print("Train team DataFrame shape:", train_team_df.shape)

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# --- Boolean switches ---
# Each switch adds one family of team columns to the feature set.
use_names = False
use_stats = False
use_types = True
use_aggregates = False  # mean/range/std features

# --- Identify stats columns ---
# Per-Pokémon numeric columns: base stats and level.
stat_cols = [c for c in train_team_df.columns if 'pokemon' in c and any(s in c for s in ['base_', 'level'])]

# --- Compute aggregates if toggled ---
if use_aggregates and stat_cols:
    for team in ['p1', 'p2']:
        team_stat_cols = [c for c in stat_cols if c.startswith(team)]
        if team_stat_cols:
            # Combined all stats (NaN-aware, since team slots can be empty)
            stats_arr = train_team_df[team_stat_cols].values
            train_team_df[f'{team}_mean_all_stats'] = np.nanmean(stats_arr, axis=1)
            train_team_df[f'{team}_std_all_stats'] = np.nanstd(stats_arr, axis=1)
            train_team_df[f'{team}_range_all_stats'] = np.nanmax(stats_arr, axis=1) - np.nanmin(stats_arr, axis=1)

            # Per-stat aggregates
            for stat in ['base_hp','base_atk','base_def','base_spa','base_spd','base_spe','level']:
                stat_cols_for_team = [c for c in team_stat_cols if c.endswith(stat)]
                if stat_cols_for_team:
                    arr = train_team_df[stat_cols_for_team].values
                    train_team_df[f'{team}_{stat}_mean'] = np.nanmean(arr, axis=1)
                    train_team_df[f'{team}_{stat}_std'] = np.nanstd(arr, axis=1)
                    train_team_df[f'{team}_{stat}_range'] = np.nanmax(arr, axis=1) - np.nanmin(arr, axis=1)

# --- Collect columns based on switches ---
feature_cols = []

if use_names:
    feature_cols += [c for c in train_team_df.columns if 'pokemon' in c and '_name' in c]
if use_stats:
    feature_cols += stat_cols
if use_types:
    feature_cols += [c for c in train_team_df.columns if 'pokemon' in c and '_type' in c]
if use_aggregates:
    # '_all_stats' is listed as well: the combined aggregates are named
    # '<team>_mean_all_stats' etc., so they do not end in '_mean', '_std' or
    # '_range' and would otherwise be computed but never selected.
    feature_cols += [c for c in train_team_df.columns if c.endswith(('_mean','_std','_range','_all_stats'))]

# --- Prepare data ---
# Missing values become 0 before encoding; the label is taken per battle row.
X_df = train_team_df[feature_cols].fillna(0)
y = train_team_df['player_won'].values

display(X_df.head())

# One-hot encode categorical columns (names/types)
categorical_cols = [c for c in X_df.columns if 'name' in c or 'type' in c]
X_df = pd.get_dummies(X_df, columns=categorical_cols, dummy_na=True)

X = X_df.values

# Standardise every column (zero mean, unit variance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Train simple models ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=3),
}

results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # ROC-AUC uses the positive-class probability when the estimator exposes
    # one, otherwise the hard labels.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_val)[:, 1]
    else:
        y_prob = y_pred

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_prob)

    summary_row = {"Model": name}
    summary_row["Val Accuracy"] = acc
    summary_row["F1-Score"] = f1
    summary_row["ROC-AUC"] = auc
    results.append(summary_row)

    print(f"{name}: Val Acc={acc:.3f}, F1={f1:.3f}, AUC={auc:.3f}")

results_df = pd.DataFrame(results)
print("\n=== Model Performance Summary ===")
print(results_df)
