In [None]:
# LIBRARY LOADING
import os
import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import random
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)


# --- Define the path to our data ---
COMPETITION_NAME = 'fds-pokemon-battles-prediction-2025'
DATA_PATH = os.path.join('../input', COMPETITION_NAME)

train_file_path = os.path.join(DATA_PATH, 'train.jsonl')
test_file_path = os.path.join(DATA_PATH, 'test.jsonl')

printing = True
train_data = []
test_data = []


try:
    print(f"Loading data from '{train_file_path}'...")
    with open(train_file_path, 'r') as f:
        for line in f:
            train_data.append(json.loads(line))
    
    print(f"Loading data from '{test_file_path}'...")
    with open(test_file_path, 'r') as f:
        for line in f:
            test_data.append(json.loads(line))

    print(f"Successfully loaded {len(train_data)} battles.\n")
    if printing: 
        # Inspect the first few battles
        for i, battle in enumerate(train_data[:5]):  # first 3 battles
            print(f"--- Battle {i+1} ---")
            print("Keys:", list(battle.keys()))
            
            # Show top-level values
            for key, value in battle.items():
                if key == 'battle_timeline':
                    print(f"{key}:")
                    for turn in value[:3]:  # show first 3 turns
                        print("   ", turn)
                    if len(value) > 3:
                        print("   ... (more turns)")
                else:
                    print(f"{key}: {value}")
            print("\n")
        
except Exception as e:
    print("Error loading data:", e)

Loading data from 'D:\fds-pokemon-battles-prediction-2025\train.jsonl'...
Loading data from 'D:\fds-pokemon-battles-prediction-2025\test.jsonl'...
Successfully loaded 10000 battles.

--- Battle 1 ---
Keys: ['player_won', 'p1_team_details', 'p2_lead_details', 'battle_timeline', 'battle_id']
player_won: True
p1_team_details: [{'name': 'starmie', 'level': 100, 'types': ['psychic', 'water'], 'base_hp': 60, 'base_atk': 75, 'base_def': 85, 'base_spa': 100, 'base_spd': 100, 'base_spe': 115}, {'name': 'exeggutor', 'level': 100, 'types': ['grass', 'psychic'], 'base_hp': 95, 'base_atk': 95, 'base_def': 85, 'base_spa': 125, 'base_spd': 125, 'base_spe': 55}, {'name': 'chansey', 'level': 100, 'types': ['normal', 'notype'], 'base_hp': 250, 'base_atk': 5, 'base_def': 5, 'base_spa': 105, 'base_spd': 105, 'base_spe': 50}, {'name': 'snorlax', 'level': 100, 'types': ['normal', 'notype'], 'base_hp': 160, 'base_atk': 110, 'base_def': 65, 'base_spa': 65, 'base_spd': 65, 'base_spe': 30}, {'name': 'tauros', '

In [44]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# === STEP 1: Build PokÃ©mon dictionary (same as before) ===
def build_pokemon_dict(data_list):
    pokemon_dict = {}

    for battle in data_list:
        # Player 1 full team
        for p in battle.get('p1_team_details', []):
            name = p['name']
            if name not in pokemon_dict:
                pokemon_dict[name] = {
                    'base_hp': p.get('base_hp', np.nan),
                    'base_atk': p.get('base_atk', np.nan),
                    'base_def': p.get('base_def', np.nan),
                    'base_spa': p.get('base_spa', np.nan),
                    'base_spd': p.get('base_spd', np.nan),
                    'base_spe': p.get('base_spe', np.nan)
                }

        # Player 2 lead PokÃ©mon
        p2 = battle.get('p2_lead_details', None)
        if p2:
            name = p2['name']
            if name not in pokemon_dict:
                pokemon_dict[name] = {
                    'base_hp': p2.get('base_hp', np.nan),
                    'base_atk': p2.get('base_atk', np.nan),
                    'base_def': p2.get('base_def', np.nan),
                    'base_spa': p2.get('base_spa', np.nan),
                    'base_spd': p2.get('base_spd', np.nan),
                    'base_spe': p2.get('base_spe', np.nan)
                }

    print(f"âœ… PokÃ©mon dictionary built with {len(pokemon_dict)} unique PokÃ©mon.")
    return pokemon_dict


# === STEP 2: Process battle data ===
def process_battle_data(data, pokemon_dict, is_train=True):
    all_rows = []

    for battle in tqdm(data, desc="Processing battles"):
        battle_id = battle['battle_id']
        player_won = int(battle['player_won']) if is_train else None

        # --- ðŸ†• NEW: Count unique PokÃ©mon for each player across the entire battle ---
        p1_team = [p['name'] for p in battle.get('p1_team_details', [])]
        if not p1_team:
            # fallback: extract from timeline
            p1_team = list({turn['p1_pokemon_state']['name'] for turn in battle.get('battle_timeline', [])})

        # Try to get P2 team (may appear as lead + switches)
        if 'p2_team_details' in battle:
            p2_team = [p['name'] for p in battle['p2_team_details']]
        else:
            # fallback: collect from timeline
            p2_team = list({turn['p2_pokemon_state']['name'] for turn in battle.get('battle_timeline', [])})

        num_unique_p1 = len(set(p1_team))
        num_unique_p2 = len(set(p2_team))

        # === Loop through each turn ===
        for turn in battle['battle_timeline']:
            turn_num = turn['turn']
            p1_state = turn['p1_pokemon_state']
            p2_state = turn['p2_pokemon_state']
            p1_move = turn.get('p1_move_details') or {}
            p2_move = turn.get('p2_move_details') or {}

            p1_name = p1_state.get('name', None)
            p2_name = p2_state.get('name', None)

            # Lookup base stats from PokÃ©mon dictionary
            p1_base = pokemon_dict.get(p1_name, {k: np.nan for k in ['base_hp','base_atk','base_def','base_spa','base_spd','base_spe']})
            p2_base = pokemon_dict.get(p2_name, {k: np.nan for k in ['base_hp','base_atk','base_def','base_spa','base_spd','base_spe']})

            row = {
                'battle_id': battle_id,
                'turn': turn_num,

                # ðŸ†• Add unique PokÃ©mon counts
                'num_unique_p1_pokemon_in_battle': num_unique_p1,
                'num_unique_p2_pokemon_in_battle': num_unique_p2,

                # PokÃ©mon identities
                'p1_pokemon_name': p1_name,
                'p2_pokemon_name': p2_name,

                # HP + status
                'p1_hp_pct': p1_state.get('hp_pct', np.nan),
                'p2_hp_pct': p2_state.get('hp_pct', np.nan),
                'p1_status': p1_state.get('status', 'nostatus'),
                'p2_status': p2_state.get('status', 'nostatus'),

                # Moves
                'p1_move_name': p1_move.get('name', None),
                'p1_move_type': p1_move.get('type', None),
                'p1_move_power': p1_move.get('base_power', 0),
                'p2_move_name': p2_move.get('name', None),
                'p2_move_type': p2_move.get('type', None),
                'p2_move_power': p2_move.get('base_power', 0),

                # Boosts
                'p1_boost_atk': p1_state.get('boosts', {}).get('atk', 0),
                'p1_boost_def': p1_state.get('boosts', {}).get('def', 0),
                'p1_boost_spa': p1_state.get('boosts', {}).get('spa', 0),
                'p1_boost_spd': p1_state.get('boosts', {}).get('spd', 0),
                'p1_boost_spe': p1_state.get('boosts', {}).get('spe', 0),
                'p2_boost_atk': p2_state.get('boosts', {}).get('atk', 0),
                'p2_boost_def': p2_state.get('boosts', {}).get('def', 0),
                'p2_boost_spa': p2_state.get('boosts', {}).get('spa', 0),
                'p2_boost_spd': p2_state.get('boosts', {}).get('spd', 0),
                'p2_boost_spe': p2_state.get('boosts', {}).get('spe', 0),

                # Base stats (from dictionary)
                **{f"p1_{k}": v for k, v in p1_base.items()},
                **{f"p2_{k}": v for k, v in p2_base.items()}
            }

            if is_train:
                row['player_won'] = player_won

            all_rows.append(row)

    # === Convert to DataFrame ===
    df = pd.DataFrame(all_rows)

    # One-hot encode categorical features
    categorical_cols = [
        'p1_pokemon_name', 'p2_pokemon_name',
        'p1_status', 'p2_status',
        'p1_move_type', 'p2_move_type',
        'p1_move_name', 'p2_move_name'
    ]
    df = pd.get_dummies(df, columns=categorical_cols, dummy_na=True)
    
    return df



# === STEP 3: Run everything ===
# Combine train and test data so dictionary includes all PokÃ©mon
pokemon_dict = build_pokemon_dict(train_data + test_data)

# Process datasets
train_df = process_battle_data(train_data, pokemon_dict, is_train=True)
print("âœ… Train DataFrame shape:", train_df.shape)

test_df = process_battle_data(test_data, pokemon_dict, is_train=False)
print("âœ… Test DataFrame shape:", test_df.shape)


âœ… PokÃ©mon dictionary built with 20 unique PokÃ©mon.


Processing battles: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 10000/10000 [00:02<00:00, 3764.52it/s]


âœ… Train DataFrame shape: (300000, 201)


Processing battles: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5000/5000 [00:01<00:00, 3887.44it/s]


âœ… Test DataFrame shape: (150000, 200)


In [45]:
print(train_df.columns.tolist())

['battle_id', 'turn', 'num_unique_p1_pokemon_in_battle', 'num_unique_p2_pokemon_in_battle', 'p1_hp_pct', 'p2_hp_pct', 'p1_move_power', 'p2_move_power', 'p1_boost_atk', 'p1_boost_def', 'p1_boost_spa', 'p1_boost_spd', 'p1_boost_spe', 'p2_boost_atk', 'p2_boost_def', 'p2_boost_spa', 'p2_boost_spd', 'p2_boost_spe', 'p1_base_hp', 'p1_base_atk', 'p1_base_def', 'p1_base_spa', 'p1_base_spd', 'p1_base_spe', 'p2_base_hp', 'p2_base_atk', 'p2_base_def', 'p2_base_spa', 'p2_base_spd', 'p2_base_spe', 'player_won', 'p1_pokemon_name_alakazam', 'p1_pokemon_name_articuno', 'p1_pokemon_name_chansey', 'p1_pokemon_name_charizard', 'p1_pokemon_name_cloyster', 'p1_pokemon_name_dragonite', 'p1_pokemon_name_exeggutor', 'p1_pokemon_name_gengar', 'p1_pokemon_name_golem', 'p1_pokemon_name_jolteon', 'p1_pokemon_name_jynx', 'p1_pokemon_name_lapras', 'p1_pokemon_name_persian', 'p1_pokemon_name_rhydon', 'p1_pokemon_name_slowbro', 'p1_pokemon_name_snorlax', 'p1_pokemon_name_starmie', 'p1_pokemon_name_tauros', 'p1_pokemo

In [46]:
display(train_df)

Unnamed: 0,battle_id,turn,num_unique_p1_pokemon_in_battle,num_unique_p2_pokemon_in_battle,p1_hp_pct,p2_hp_pct,p1_move_power,p2_move_power,p1_boost_atk,p1_boost_def,...,p2_move_name_softboiled,p2_move_name_stunspore,p2_move_name_substitute,p2_move_name_surf,p2_move_name_swordsdance,p2_move_name_thunderbolt,p2_move_name_thunderwave,p2_move_name_toxic,p2_move_name_wrap,p2_move_name_nan
0,0,1,6,4,1.000000,0.689567,95,0,0,0,...,False,False,False,False,False,False,False,False,False,True
1,0,2,6,4,1.000000,1.000000,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
2,0,3,6,4,0.221374,1.000000,0,120,0,0,...,False,False,False,False,False,False,False,False,False,False
3,0,4,6,4,0.876245,1.000000,0,120,0,0,...,False,False,False,False,False,False,False,False,False,False
4,0,5,6,4,0.876245,0.495356,95,0,0,0,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,9999,26,6,5,0.670000,1.000000,150,0,0,0,...,False,False,False,False,False,False,False,False,False,True
299996,9999,27,6,5,0.000000,0.110000,130,85,0,0,...,False,False,False,False,False,False,False,False,False,False
299997,9999,28,6,5,1.000000,0.110000,70,85,0,0,...,False,False,False,False,False,False,False,False,False,False
299998,9999,29,6,5,1.000000,0.280000,70,90,0,0,...,False,False,False,False,False,False,False,False,False,False


In [47]:
print(test_df.columns.tolist())

['battle_id', 'turn', 'num_unique_p1_pokemon_in_battle', 'num_unique_p2_pokemon_in_battle', 'p1_hp_pct', 'p2_hp_pct', 'p1_move_power', 'p2_move_power', 'p1_boost_atk', 'p1_boost_def', 'p1_boost_spa', 'p1_boost_spd', 'p1_boost_spe', 'p2_boost_atk', 'p2_boost_def', 'p2_boost_spa', 'p2_boost_spd', 'p2_boost_spe', 'p1_base_hp', 'p1_base_atk', 'p1_base_def', 'p1_base_spa', 'p1_base_spd', 'p1_base_spe', 'p2_base_hp', 'p2_base_atk', 'p2_base_def', 'p2_base_spa', 'p2_base_spd', 'p2_base_spe', 'p1_pokemon_name_alakazam', 'p1_pokemon_name_articuno', 'p1_pokemon_name_chansey', 'p1_pokemon_name_charizard', 'p1_pokemon_name_cloyster', 'p1_pokemon_name_dragonite', 'p1_pokemon_name_exeggutor', 'p1_pokemon_name_gengar', 'p1_pokemon_name_golem', 'p1_pokemon_name_jolteon', 'p1_pokemon_name_jynx', 'p1_pokemon_name_lapras', 'p1_pokemon_name_persian', 'p1_pokemon_name_rhydon', 'p1_pokemon_name_slowbro', 'p1_pokemon_name_snorlax', 'p1_pokemon_name_starmie', 'p1_pokemon_name_tauros', 'p1_pokemon_name_victree

In [48]:
# Keep the first player_won per battle (since it's constant across turns)
y_train_df = train_df.groupby("battle_id", as_index=False)["player_won"].first()

# Sum all other numeric columns per battle
X_train_df = train_df.groupby("battle_id", as_index=False).mean().drop(columns=["player_won"])

# Merge back into one clean DataFrame
train_df = X_train_df.merge(y_train_df, on="battle_id")

test_df = test_df.groupby("battle_id", as_index=False).mean()

train_df= train_df.drop(columns=["turn"])

test_df= test_df.drop(columns=["turn"])

In [49]:
nan_summary = pd.DataFrame({
    'NaN Count': train_df.isna().sum(),
    'NaN %': (train_df.isna().mean() * 100).round(2)
}).sort_values(by='NaN Count', ascending=False)

print(nan_summary)


                         NaN Count  NaN %
battle_id                        0    0.0
p1_move_name_psychic             0    0.0
p1_move_name_earthquake          0    0.0
p1_move_name_explosion           0    0.0
p1_move_name_fireblast           0    0.0
...                            ...    ...
p2_pokemon_name_zapdos           0    0.0
p2_pokemon_name_nan              0    0.0
p1_status_brn                    0    0.0
p1_status_fnt                    0    0.0
player_won                       0    0.0

[200 rows x 2 columns]


In [50]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import xgboost as xgb
import lightgbm as lgb

# === PREPARE DATA ===
train_df = train_df.sort_values(['battle_id'])
y = train_df.groupby('battle_id')['player_won'].first().values
feature_cols = [c for c in train_df.columns if c not in ['battle_id', 'player_won']]
X = train_df[feature_cols].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

seqs = [X_scaled[train_df['battle_id'] == bid] for bid in train_df['battle_id'].unique()]
X_agg = np.array([seq.mean(axis=0) for seq in seqs])   # Aggregate per battle

X_train, X_val, y_train, y_val = train_test_split(X_agg, y, test_size=0.2, random_state=42)

# === DEFINE MODELS & PARAM GRIDS ===
param_grids = {
    "Logistic Regression": {
        "C": [0.01, 0.1, 1, 10],
        "penalty": ["l2"],
        "solver": ["lbfgs", "saga"]
    },
    "Random Forest": {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10]
    },
    "Gradient Boosting": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.05, 0.1, 0.2],
        "max_depth": [3, 5, 7]
    },
    "XGBoost": {
        "n_estimators": [100, 200],
        "learning_rate": [0.05, 0.1, 0.2],
        "max_depth": [4, 6, 8],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0]
    },
    "LightGBM": {
        "n_estimators": [100, 200],
        "learning_rate": [0.05, 0.1, 0.2],
        "num_leaves": [15, 31, 63],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0]
    }
}

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42)
}

# === GRID SEARCH + EVALUATION ===
results = []

for name, model in models.items():
    print(f"\nðŸš€ Running Grid Search for {name}...")
    grid = GridSearchCV(model, param_grids[name], cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    print(f"Best params for {name}: {grid.best_params_}")

    y_pred = best_model.predict(X_val)
    y_prob = best_model.predict_proba(X_val)[:, 1] if hasattr(best_model, "predict_proba") else y_pred

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_prob)

    results.append({
        "Model": name,
        "Best Params": grid.best_params_,
        "Accuracy": acc,
        "F1-Score": f1,
        "ROC-AUC": auc
    })

    print(f"âœ… {name} â€” Acc: {acc:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")

# === SUMMARY TABLE ===
results_df = pd.DataFrame(results)
print("\n=== Model Performance After Grid Search ===")
print(results_df)



ðŸš€ Running Grid Search for Logistic Regression...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best params for Logistic Regression: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
âœ… Logistic Regression â€” Acc: 0.840, F1: 0.842, AUC: 0.905

ðŸš€ Running Grid Search for Random Forest...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best params for Random Forest: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 300}
âœ… Random Forest â€” Acc: 0.828, F1: 0.829, AUC: 0.890

ðŸš€ Running Grid Search for Gradient Boosting...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best params for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}
âœ… Gradient Boosting â€” Acc: 0.828, F1: 0.829, AUC: 0.907

ðŸš€ Running Grid Search for XGBoost...
Fitting 3 folds for each of 72 candidates, totalling 216 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params for XGBoost: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 200, 'subsample': 0.8}
âœ… XGBoost â€” Acc: 0.839, F1: 0.841, AUC: 0.906

ðŸš€ Running Grid Search for LightGBM...
Fitting 3 folds for each of 72 candidates, totalling 216 fits
[LightGBM] [Info] Number of positive: 3988, number of negative: 4012
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7245
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 189
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498500 -> initscore=-0.006000
[LightGBM] [Info] Start training from score -0.006000
Best params for LightGBM: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'n_estimators': 200, 'num_leaves': 15, 'subsample': 0.8}
âœ… LightGBM â€” Acc: 0.835, F1: 0.835, AUC: 0.908

=== Model Performance After Grid Search =



In [51]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import xgboost as xgb
import lightgbm as lgb

# === PREPARE DATA ===
train_df = train_df.sort_values(['battle_id'])
y = train_df.groupby('battle_id')['player_won'].first().values
feature_cols = [c for c in train_df.columns if c not in ['battle_id', 'player_won']]
X = train_df[feature_cols].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

seqs = [X_scaled[train_df['battle_id'] == bid] for bid in train_df['battle_id'].unique()]
X_agg = np.array([seq.mean(axis=0) for seq in seqs])   # Aggregate per battle

X_train, X_val, y_train, y_val = train_test_split(X_agg, y, test_size=0.2, random_state=42)

# === DEFINE MODELS ===
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, random_state=42),
    "XGBoost": xgb.XGBClassifier(
        n_estimators=200, learning_rate=0.1, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        eval_metric='logloss', use_label_encoder=False, random_state=42
    ),
    "LightGBM": lgb.LGBMClassifier(
        n_estimators=200, learning_rate=0.1, num_leaves=31,
        subsample=0.8, colsample_bytree=0.8, random_state=42
    )
}

# === TRAIN & EVALUATE ===
results = []
for name, model in models.items():
    print(f"\nðŸš€ Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    y_prob = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else y_pred

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_prob)

    results.append({"Model": name, "Accuracy": acc, "F1-Score": f1, "ROC-AUC": auc})
    print(f"âœ… {name} â€” Acc: {acc:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")

# === SUMMARY TABLE ===
results_df = pd.DataFrame(results)
print("\n=== Model Performance Summary ===")
print(results_df)



ðŸš€ Training Logistic Regression...
âœ… Logistic Regression â€” Acc: 0.843, F1: 0.844, AUC: 0.906

ðŸš€ Training Random Forest...
âœ… Random Forest â€” Acc: 0.827, F1: 0.827, AUC: 0.890

ðŸš€ Training Gradient Boosting...
âœ… Gradient Boosting â€” Acc: 0.830, F1: 0.831, AUC: 0.908

ðŸš€ Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


âœ… XGBoost â€” Acc: 0.842, F1: 0.843, AUC: 0.908

ðŸš€ Training LightGBM...
[LightGBM] [Info] Number of positive: 3988, number of negative: 4012
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004743 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7245
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 189
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498500 -> initscore=-0.006000
[LightGBM] [Info] Start training from score -0.006000
âœ… LightGBM â€” Acc: 0.837, F1: 0.837, AUC: 0.904

=== Model Performance Summary ===
                 Model  Accuracy  F1-Score   ROC-AUC
0  Logistic Regression    0.8425  0.844444  0.906148
1        Random Forest    0.8265  0.827449  0.890055
2    Gradient Boosting    0.8300  0.830677  0.907783
3              XGBoost    0.8415  0.842836  0.907853
4             LightGBM    0.8365  0.836908  0.904253




In [52]:
# === PREPARE TEST DATA ===
test_df = test_df.sort_values(['battle_id'])
X_test_full = test_df[feature_cols].values
X_test_scaled = scaler.transform(X_test_full)

# Aggregate per battle (just like train)
seqs_test = [X_test_scaled[test_df['battle_id'] == bid] for bid in test_df['battle_id'].unique()]
X_test_agg = np.array([seq.mean(axis=0) for seq in seqs_test])

# === CHOOSE BEST MODEL (example: XGBoost) ===
best_model = xgb.XGBClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=6,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='logloss', use_label_encoder=False, random_state=42
)
best_model.fit(X_agg, y)  # train on full data

# === PREDICT ON TEST SET ===
y_test_pred = best_model.predict_proba(X_test_agg)[:, 1]
y_test_pred_int = (y_test_pred > 0.5).astype(int)

# === CREATE SUBMISSION CSV ===
submission = pd.DataFrame({
    'battle_id': test_df['battle_id'].unique(),
    'player_won': y_test_pred_int
})
submission.to_csv(r'D:\fds-pokemon-battles-prediction-2025\submissions_xgbboost.csv', index=False)
print("âœ… Saved csv successfully!")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


âœ… Saved csv successfully!
