In [None]:
# LOAD JSONS   train_data e test_data

import json
import pandas as pd
import os

# --- Define the path to our data ---
train_file_path = 'train.jsonl'
test_file_path  = 'test.jsonl'

# Number of timeline turns kept when previewing a battle.
PREVIEW_TURNS = 2


def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of the ``.jsonl`` file (one JSON document per line).

    Returns:
        list: Parsed records in file order. Blank lines are skipped.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # tolerate blank / trailing empty lines
                records.append(json.loads(line))
    return records


def preview_battle(battle, label, max_turns=PREVIEW_TURNS):
    """Pretty-print one battle with its timeline cut to ``max_turns`` turns.

    Args:
        battle: A battle record (dict) as loaded from the JSONL file.
        label: Split name used in the header line (e.g. ``'train'``).
        max_turns: How many timeline entries to keep in the preview.
    """
    timeline = battle.get('battle_timeline', [])
    shown = battle.copy()  # shallow copy: the loaded record is left untouched
    shown['battle_timeline'] = timeline[:max_turns]
    print(f"\n--- Structure of the first {label} battle: ---")
    print(json.dumps(shown, indent=4))
    # Report truncation whenever turns were actually dropped from the preview.
    if len(timeline) > max_turns:
        print("    ...")
        print("    (battle_timeline has been truncated for display)")


def load_split(path, label, file_kind):
    """Load one data split, print a short report and return its battles.

    Args:
        path: Path of the JSONL file to load.
        label: Split name used in the success/preview messages.
        file_kind: Wording used in the "file not found" message.

    Returns:
        list: The loaded battles, or an empty list when the file is missing.
    """
    try:
        battles = load_jsonl(path)
    except FileNotFoundError:
        print(f"❌ ERROR: Could not find the {file_kind} file at '{path}'.")
        print("Please make sure you have added the competition data to this notebook.")
        return []

    print(f"✅ Successfully loaded {len(battles)} battles from {label}.")
    if battles:
        preview_battle(battles[0], label)
    return battles


# --- Load TRAIN data ---
print(f"📦 Loading data from '{train_file_path}'...")
train_data = load_split(train_file_path, 'train', 'training')

# --- Load TEST data ---
print(f"\n📦 Loading data from '{test_file_path}'...")
test_data = load_split(test_file_path, 'test', 'test')


In [2]:
#Per vedere i dati in dataframe, divisi in 4 blocchi dove squadra==p1 e pokemon==p2
import pandas as pd

# Per-Pokémon fields copied verbatim into the squad / lead frames.
_POKEMON_STAT_KEYS = ("level", "types", "base_hp", "base_atk", "base_def",
                      "base_spa", "base_spd", "base_spe")
# (output column suffix, source key) pairs for the per-turn Pokémon state.
_STATE_FIELDS = (("pokemon", "name"), ("hp", "hp_pct"), ("status", "status"),
                 ("effects", "effects"), ("boosts", "boosts"))
# (output column suffix, source key) pairs for the per-turn move details.
_MOVE_FIELDS = (("name", "name"), ("type", "type"), ("cat", "category"),
                ("basepow", "base_power"), ("acc", "accuracy"),
                ("priority", "priority"))


def _pokemon_row(battle_id, pokemon, name_column):
    """Flatten one Pokémon dict into a row keyed by ``battle_id``."""
    row = {"battle_id": battle_id, name_column: pokemon["name"]}
    for key in _POKEMON_STAT_KEYS:
        row[key] = pokemon[key]
    return row


def _timeline_row(battle_id, turn):
    """Flatten one timeline entry; move columns are None when no move was made."""
    row = {"battle_id": battle_id, "turn": turn["turn"]}
    for side in ("p1", "p2"):
        state = turn[f"{side}_pokemon_state"]
        for suffix, key in _STATE_FIELDS:
            row[f"{side}_{suffix}"] = state[key]
    for side in ("p1", "p2"):
        move = turn[f"{side}_move_details"]
        for suffix, key in _MOVE_FIELDS:
            row[f"{side}_move_{suffix}"] = move[key] if move else None
    return row


def create_dataframe(data) -> list:
    """Split a list of battle records into four flat DataFrames.

    Args:
        data: Iterable of battle dicts (as loaded from the JSONL files).
            ``player_won`` is optional so unlabelled data can be processed.

    Returns:
        list: ``[df_battle, df_squad, df_pokemon, df_battle_timeline]`` where
            * ``df_battle``  has one row per battle (id and outcome),
            * ``df_squad``   has one row per Pokémon in ``p1_team_details``,
            * ``df_pokemon`` has one row per battle with ``p2_lead_details``,
            * ``df_battle_timeline`` has one row per turn of each battle.
        Every frame keeps its full column set even when ``data`` is empty.
    """
    # Materialise once: ``data`` is traversed four times below.
    battles = list(data)

    # General frame with the basic battle info. A missing label becomes None
    # instead of raising, so unlabelled battles are supported.
    df_battle = pd.DataFrame(
        [{"battle_id": b["battle_id"], "player_won": b.get("player_won")}
         for b in battles],
        columns=["battle_id", "player_won"],
    )

    stat_columns = list(_POKEMON_STAT_KEYS)

    # Pokémon of the player's team (p1_team_details).
    df_squad = pd.DataFrame(
        [_pokemon_row(b["battle_id"], p, "pokemon_name")
         for b in battles
         for p in b["p1_team_details"]],
        columns=["battle_id", "pokemon_name"] + stat_columns,
    )

    # Details of the opponent's lead Pokémon (p2_lead_details).
    df_pokemon = pd.DataFrame(
        [_pokemon_row(b["battle_id"], b["p2_lead_details"], "name")
         for b in battles],
        columns=["battle_id", "name"] + stat_columns,
    )

    # Turn-by-turn timeline.
    timeline_columns = (
        ["battle_id", "turn"]
        + [f"{side}_{suffix}" for side in ("p1", "p2") for suffix, _ in _STATE_FIELDS]
        + [f"{side}_move_{suffix}" for side in ("p1", "p2") for suffix, _ in _MOVE_FIELDS]
    )
    df_battle_timeline = pd.DataFrame(
        [_timeline_row(b["battle_id"], t)
         for b in battles
         for t in b["battle_timeline"]],
        columns=timeline_columns,
    )

    return [df_battle, df_squad, df_pokemon, df_battle_timeline]
# Build the four frames for each split (train_data and test_data).
# Order inside each list: battle, squad, pokemon, timeline.
train_list = create_dataframe(train_data)
test_list = create_dataframe(test_data)

# DEBUG: each list is expected to hold four DataFrames.
for frames in (train_list, test_list):
    print(len(frames))


4
4


In [3]:
# All Pokémon types present in the training dataset.
df_pokemon = train_list[2]
df_squad = train_list[1]

# Type lists of every opponent lead followed by every team member.
all_type_lists = pd.concat([df_pokemon["types"], df_squad["types"]])

# Distinct type combinations, kept in order of first appearance.
unique_comb_types = list(dict.fromkeys(all_type_lists.apply(tuple)))

print(unique_comb_types)
print("len combo: ",len(unique_comb_types))

# Distinct single type names, alphabetically sorted.
unique_types = sorted({t for types_list in all_type_lists for t in types_list})

print(unique_types)
print("len types: ",len(unique_types))

#print("Effect\n",df_battle_timeline["p1_effects"].apply(tuple).unique())
#print("Move cat\n",df_battle_timeline["p1_move_cat"].unique())
#print("Move type\n",df_battle_timeline["p1_move_type"].unique())

[('psychic', 'water'), ('notype', 'psychic'), ('normal', 'notype'), ('electric', 'notype'), ('ice', 'psychic'), ('ghost', 'poison'), ('grass', 'psychic'), ('grass', 'poison'), ('ice', 'water'), ('ground', 'rock'), ('electric', 'flying'), ('dragon', 'flying'), ('fire', 'flying'), ('flying', 'ice')]
len combo:  14
['dragon', 'electric', 'fire', 'flying', 'ghost', 'grass', 'ground', 'ice', 'normal', 'notype', 'poison', 'psychic', 'rock', 'water']
len types:  14


In [None]:
# Check that all battles reach turn 30: count the timeline rows recorded at turn 30.
df_battle_timeline = train_list[3]
turn_30_rows = df_battle_timeline.loc[df_battle_timeline["turn"] == 30]
print(turn_30_rows.count())
# Every match lasts 30 turns.

In [5]:
# Class-balance check: make sure the dataset does not contain too many winners.
df_battle = train_list[0]
df_battle["player_won"] = df_battle["player_won"].astype(int)
n_battles = len(df_battle["player_won"])
n_wins = df_battle["player_won"].sum()
perc = n_wins*100/n_battles
print("percentuale di vincitori nel nostro dataset: ",perc)

percentuale di vincitori nel nostro dataset:  50.0


FUNZIONI

In [None]:
#funzione per ottenere tutti i tipi di status e effetti
import pandas as pd
def unique_se(data_list):
    """Collect the distinct status and effect values seen in the timeline.

    Args:
        data_list: List of frames as returned by ``create_dataframe``; only
            index 3 (the battle timeline) is read.

    Returns:
        tuple: ``(unique_status, unique_effects)`` where ``unique_status`` is
        the list of distinct non-null statuses (as strings, in order of first
        appearance, p1 rows before p2 rows) and ``unique_effects`` is the
        alphabetically sorted list of distinct effects.

    Side effects:
        Prints both lists.
    """
    df_battle_timeline = data_list[3]

    # Stack the two status columns and keep the distinct values (computed once).
    all_status = pd.concat(
        [df_battle_timeline['p1_status'], df_battle_timeline['p2_status']],
        ignore_index=True,
    )
    unique_status = all_status.dropna().astype(str).unique().tolist()
    print("status unici:", unique_status)

    # An effects cell may hold a list of effects or a single string.
    seen_effects = set()
    for col in ("p1_effects", "p2_effects"):
        for cell in df_battle_timeline[col].dropna():
            if isinstance(cell, list):
                seen_effects.update(cell)
            elif isinstance(cell, str):
                seen_effects.add(cell)

    unique_effects = sorted(seen_effects)
    print("EFFECTS unici:", unique_effects)
    return unique_status, unique_effects

#unique_status,unique_effects=unique_se(train_list)


status unici: ['nostatus', 'par', 'slp', 'fnt', 'frz', 'tox', 'psn', 'brn']
EFFECTS unici: ['clamp', 'confusion', 'firespin', 'noeffect', 'reflect', 'substitute', 'typechange', 'wrap']


In [4]:
#Funzione per ottenere i tipi dei pokemon
def unique_t(data_list):
    """Return the alphabetically sorted distinct Pokémon type names.

    Reads the ``types`` column of the opponent-lead frame (index 2) and of the
    team frame (index 1) of ``data_list`` and flattens both into one set.
    """
    lead_types = data_list[2]["types"]
    team_types = data_list[1]["types"]

    seen = set()
    for types_list in pd.concat([lead_types, team_types]):
        seen.update(types_list)
    return sorted(seen)

PRIMO MODELLO STATICO

In [36]:
#Creazione del modello da allenare(Statico, prima del match)
import pandas as pd
def extract_feature(data_list, unique_types=None):
    """Build the static (pre-battle) feature table, one row per battle.

    Args:
        data_list: Frames as returned by ``create_dataframe``; indices 0
            (battle), 1 (p1 team) and 2 (p2 lead) are read. None of them is
            modified.
        unique_types: Optional list of type names to create count columns for.
            Defaults to the types found in ``data_list`` (via ``unique_t``).
            Pass the training types explicitly to give another split exactly
            the same columns.

    Returns:
        pd.DataFrame: ``battle_id``, the per-team means of the p1 base stats
        and level (``p1_*``), the p1 per-type counts, the p2 lead stats and
        type counts (``p2_*``) and ``player_won``. Missing feature values are
        filled with 0. ``player_won`` is cast to int when every battle is
        labelled; otherwise it is left untouched (labels are never invented).
    """
    df_battle = data_list[0]
    df_squad = data_list[1]
    df_pokemon = data_list[2]

    if unique_types is None:
        unique_types = unique_t(data_list)  # all distinct type names in this split

    def strip_notype(types):
        # "notype" is a placeholder for a missing second type, not a real type.
        return [t for t in types if t != "notype"]

    # --- PLAYER 1 ---
    # ``assign`` returns a new frame, so the caller's df_squad is not mutated.
    squad = df_squad.assign(types_clean=df_squad["types"].apply(strip_notype))
    agg_squad1 = squad.groupby("battle_id").agg({
        "base_hp": "mean",
        "base_atk": "mean",
        "base_def": "mean",
        "base_spa": "mean",
        "base_spd": "mean",
        "base_spe": "mean",
        "level": "mean",
        # Flatten the per-Pokémon type lists into one list per team.
        "types_clean": lambda lst: [t for sub in lst for t in sub]
    }).reset_index()
    for t in unique_types:
        agg_squad1[t] = agg_squad1["types_clean"].apply(lambda lst, t=t: lst.count(t))
    agg_squad1 = (
        agg_squad1
        .drop(columns=["types_clean"])
        .add_prefix("p1_")
        .rename(columns={"p1_battle_id": "battle_id"})
    )

    # --- PLAYER 2 ---
    df_squad2 = df_pokemon.copy()
    df_squad2["types_clean"] = df_squad2["types"].apply(strip_notype)
    for t in unique_types:
        df_squad2[t] = df_squad2["types_clean"].apply(lambda lst, t=t: lst.count(t))
    df_squad2 = df_squad2.drop(columns=["types_clean", "types", "name"], errors="ignore")
    agg_squad2 = df_squad2.add_prefix("p2_").rename(columns={"p2_battle_id": "battle_id"})

    # --- FINAL MERGE ---
    agg_full = agg_squad1.merge(agg_squad2, on="battle_id", how="inner")
    agg_full = agg_full.merge(df_battle[["battle_id", "player_won"]], on="battle_id", how="left")

    if agg_full["player_won"].notna().all():
        agg_full["player_won"] = agg_full["player_won"].astype(int)
        return agg_full.fillna(0)

    # Unlabelled (or partially labelled) data: zero-fill the features only and
    # keep the missing labels as they are.
    feature_cols = [c for c in agg_full.columns if c != "player_won"]
    agg_full[feature_cols] = agg_full[feature_cols].fillna(0)
    return agg_full


#fin_model=extract_feature(train_list)
#fin_test_model=extract_feature(test_list)
#print(fin_model.columns)

SCALER

In [7]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

def scale_features(df: pd.DataFrame, exclude_cols: list = None):
    """Standardise the columns of a DataFrame with a freshly fitted StandardScaler.

    Columns listed in ``exclude_cols`` (e.g. 'battle_id', 'player_won') are
    copied through unchanged; every other column is scaled.

    Args:
        df (pd.DataFrame): the DataFrame to scale (left unmodified).
        exclude_cols (list, optional): columns to leave out of the scaling.

    Returns:
        df_scaled (pd.DataFrame): copy of ``df`` with the scaled features.
        scaler (StandardScaler): the scaler fitted on those features.
    """
    skipped = [] if exclude_cols is None else exclude_cols

    # Every column that is not explicitly skipped gets scaled.
    feature_cols = []
    for col in df.columns:
        if col not in skipped:
            feature_cols.append(col)

    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[feature_cols])

    # Write the scaled values into a copy so the input frame stays untouched.
    df_scaled = df.copy()
    df_scaled[feature_cols] = scaled_values

    return df_scaled, scaler

BASIC TRAINING DEL MODELLO BASE (INDICATIVO)

In [8]:
#TRAIN MODEL(base)

from sklearn.linear_model import LogisticRegression

# Static feature tables for both splits.
train_df = extract_feature(train_list)  # our model
test_df = extract_feature(test_list)

# Columns that must NOT be standardised: identifier, target and type counts.
exclude_cols=['battle_id', 'p1_dragon', 'p1_electric',
       'p1_fire', 'p1_flying', 'p1_ghost', 'p1_grass', 'p1_ground', 'p1_ice',
       'p1_normal', 'p1_notype', 'p1_poison', 'p1_psychic', 'p1_rock',
       'p1_water', 'p2_dragon', 'p2_electric','p2_fire', 'p2_flying',
        'p2_ghost', 'p2_grass', 'p2_ground', 'p2_ice','p2_normal',
        'p2_notype', 'p2_poison', 'p2_psychic', 'p2_rock',
       'p2_water', 'player_won']
# Continuous columns: these are the ones that get standardised.
features = [col for col in train_df.columns if col not in exclude_cols]

# The identifier and the target must never reach the model: feeding
# 'player_won' as an input leaks the label into the features.
non_predictors = ['battle_id', 'player_won']
model_cols = [col for col in train_df.columns if col not in non_predictors]

# Define our features (X) and target (y)
train_scaled, scaler = scale_features(train_df, exclude_cols=exclude_cols)
X_train = train_scaled[model_cols]
y_train = train_df['player_won']

# Apply the scaler fitted on train to the test set. ``reindex`` guarantees the
# same columns in the same order as X_train (a column absent from test is 0).
X_test_scaled = test_df.reindex(columns=model_cols, fill_value=0)
X_test_scaled[features] = scaler.transform(X_test_scaled[features])

# Initialize and train the model
print("Training a simple Logistic Regression model...")
model = LogisticRegression(random_state=42, max_iter=5000)
model.fit(X_train, y_train)
print("Model training complete.")

Training a simple Logistic Regression model...
Model training complete.


In [None]:
# Make predictions on the test data
print("Generating predictions on the test set...")
X_test = X_test_scaled

# Hard class predictions, plus the probability of a win
# (useful for metrics such as ROC AUC).
y_test_pred = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)[:, 1]

# Inspect the first predictions
for preview in (y_test_pred, y_test_prob):
    print(preview[:10])

MODELLO STATICO BASE (ACC=0.51), BASELINE

In [None]:
#CROSS-VALIDATION( K-FOLD )
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# cross_validate scores several metrics from a single set of fold fits.
from sklearn.model_selection import cross_validate

train_df = extract_feature(train_list)

# Type-count columns: passed through without scaling.
type_cols=[ 'p1_dragon', 'p1_electric','p1_fire', 'p1_flying',
            'p1_ghost', 'p1_grass', 'p1_ground', 'p1_ice',
       'p1_normal', 'p1_notype', 'p1_poison', 'p1_psychic', 'p1_rock',
       'p1_water', 'p2_dragon', 'p2_electric','p2_fire', 'p2_flying',
        'p2_ghost', 'p2_grass', 'p2_ground', 'p2_ice','p2_normal',
        'p2_notype', 'p2_poison', 'p2_psychic', 'p2_rock','p2_water']
exclude = ['battle_id', 'player_won'] + type_cols

# Columns to standardise. Built from ``exclude`` defined just above, not from
# a variable left over from another cell.
features = [col for col in train_df.columns if col not in exclude]

# ColumnTransformer that scales the continuous features only.
preprocessor = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), features),
        ('pass_types', 'passthrough', type_cols)
    ],
    remainder='drop'
)

# Model pipeline: the scaler is re-fitted inside every fold.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=5000, random_state=42))
])

# Cross validation inputs
X = train_df.drop(columns=['player_won', 'battle_id'])
y = train_df['player_won']

# Stratified K-Fold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One cross-validation run for all metrics instead of one run per metric.
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_scores = cross_validate(pipe, X, y, cv=cv, scoring=metrics)
for metric in metrics:
    s = cv_scores[f"test_{metric}"]
    print(f"{metric.capitalize():<10}: mean={s.mean():.3f} ± {s.std():.3f}")

y_pred = cross_val_predict(pipe, X, y, cv=cv)
print(confusion_matrix(y, y_pred))
print(classification_report(y, y_pred, digits=3))

In [None]:
#FEATURE IMPORTANCE , dopo aver allenato il modello 
import pandas as pd
pipe.fit(X, y)

# Pull the fitted model and the transformed feature names out of the pipeline.
model = pipe.named_steps['model']
feature_names = pipe.named_steps['preprocessor'].get_feature_names_out()
coefs = model.coef_[0]

# Pair every coefficient with its feature name and rank by absolute value.
importance = (
    pd.DataFrame({'feature': feature_names, 'coefficient': coefs})
    .assign(abs_coeff=lambda frame: frame['coefficient'].abs())
    .sort_values('abs_coeff', ascending=False)
)

print("\n🔝 Top 15 feature più influenti:")
print(importance.head(15))

BASELINE CON DATI DINAMICI (OTTENUTI DURANTE IL MATCH)

funzione feature extraction

In [None]:
#Creazione del modello da allenare(Dinamico , primi 30 round)
def extract_all(train_list):
    """Build the static + dynamic feature table, one row per battle.

    The static part comes from ``extract_feature``; the dynamic part
    aggregates the battle timeline (index 3 of ``train_list``) per battle:
    mean and last HP of both players, and mean base power and accuracy of
    the moves used. Prints the resulting column index and returns the merged
    frame with missing values filled with 0.
    """
    # The returned lists are not used here; the call prints the value summary.
    _statuses, _effects = unique_se(train_list)
    static_features = extract_feature(train_list)
    per_battle = train_list[3].groupby('battle_id')

    # Dynamic features, named <column>_<aggregation>.
    dynamic_features = per_battle.agg(
        p1_hp_mean=('p1_hp', 'mean'),
        p1_hp_last=('p1_hp', 'last'),
        p2_hp_mean=('p2_hp', 'mean'),
        p2_hp_last=('p2_hp', 'last'),
        p1_move_basepow_mean=('p1_move_basepow', 'mean'),
        p2_move_basepow_mean=('p2_move_basepow', 'mean'),
        p1_move_acc_mean=('p1_move_acc', 'mean'),
        p2_move_acc_mean=('p2_move_acc', 'mean'),
    ).reset_index()
    #dynamic_features['hp_diff_mean'] = dynamic_features['p1_hp_mean'] - dynamic_features['p2_hp_mean']

    # Attach the dynamic features to every battle of the static table.
    train_df = static_features.merge(dynamic_features, on='battle_id', how='left')
    print(train_df.columns)
    return train_df.fillna(0)

training con ACC=0.71 

In [37]:
#####################  Train new model-dynamics ##########################
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Import locally what this cell uses beyond its own import list, so it does not
# depend on the earlier cross-validation cell having been run.
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report

train_df = extract_all(train_list)

X = train_df.drop(columns=['player_won', 'battle_id'])
y = train_df['player_won']

# Type-count columns: passed through without scaling.
type_cols=[ 'p1_dragon', 'p1_electric','p1_fire', 'p1_flying',
            'p1_ghost', 'p1_grass', 'p1_ground', 'p1_ice',
       'p1_normal', 'p1_notype', 'p1_poison', 'p1_psychic', 'p1_rock',
       'p1_water', 'p2_dragon', 'p2_electric','p2_fire', 'p2_flying',
        'p2_ghost', 'p2_grass', 'p2_ground', 'p2_ice','p2_normal',
        'p2_notype', 'p2_poison', 'p2_psychic', 'p2_rock','p2_water']
exclude = ['battle_id', 'player_won'] + type_cols

# Continuous columns to standardise.
features = [col for col in train_df.columns if col not in exclude]
# debug
print(features)


preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), features),
    ('pass_types', 'passthrough', type_cols)
], remainder='drop')

# The scaler lives inside the pipeline, so it is re-fitted within every fold.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=5000, random_state=42))
])

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One cross-validation run for all metrics instead of one run per metric.
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_scores = cross_validate(pipe, X, y, cv=cv, scoring=metrics)
for metric in metrics:
    s = cv_scores[f"test_{metric}"]
    print(f"{metric.capitalize():<10}: mean={s.mean():.3f} ± {s.std():.3f}")

y_pred = cross_val_predict(pipe, X, y, cv=cv)
print(confusion_matrix(y, y_pred))
print(classification_report(y, y_pred, digits=3))


Index(['battle_id', 'p1_base_hp', 'p1_base_atk', 'p1_base_def', 'p1_base_spa',
       'p1_base_spd', 'p1_base_spe', 'p1_level', 'p1_dragon', 'p1_electric',
       'p1_fire', 'p1_flying', 'p1_ghost', 'p1_grass', 'p1_ground', 'p1_ice',
       'p1_normal', 'p1_notype', 'p1_poison', 'p1_psychic', 'p1_rock',
       'p1_water', 'p2_level', 'p2_base_hp', 'p2_base_atk', 'p2_base_def',
       'p2_base_spa', 'p2_base_spd', 'p2_base_spe', 'p2_dragon', 'p2_electric',
       'p2_fire', 'p2_flying', 'p2_ghost', 'p2_grass', 'p2_ground', 'p2_ice',
       'p2_normal', 'p2_notype', 'p2_poison', 'p2_psychic', 'p2_rock',
       'p2_water', 'player_won', 'p1_hp_mean', 'p1_hp_last', 'p2_hp_mean',
       'p2_hp_last', 'p1_move_basepow_mean', 'p2_move_basepow_mean',
       'p1_move_acc_mean', 'p2_move_acc_mean'],
      dtype='object')
['p1_base_hp', 'p1_base_atk', 'p1_base_def', 'p1_base_spa', 'p1_base_spd', 'p1_base_spe', 'p1_level', 'p2_level', 'p2_base_hp', 'p2_base_atk', 'p2_base_def', 'p2_base_spa', 'p2_

In [32]:
#Analisi feature per modello dinamico
from sklearn.inspection import permutation_importance

# Refit on the full training data, then measure how much the score drops when
# each input column is shuffled (5 repeats per column).
pipe.fit(X, y)
r = permutation_importance(pipe, X, y, n_repeats=5, random_state=42)

# Ten columns with the largest mean importance, highest first.
ranked_importances = sorted(zip(r.importances_mean, X.columns), reverse=True)
ranked_importances[:10]

[(np.float64(0.14378000000000002), 'p1_hp_mean'),
 (np.float64(0.05406000000000004), 'p1_base_hp'),
 (np.float64(0.04044000000000005), 'p2_hp_mean'),
 (np.float64(0.03526000000000005), 'p1_psychic'),
 (np.float64(0.028360000000000073), 'p2_psychic'),
 (np.float64(0.02110000000000003), 'p1_ice'),
 (np.float64(0.018140000000000045), 'p1_base_spe'),
 (np.float64(0.015220000000000056), 'p1_base_def'),
 (np.float64(0.012560000000000038), 'p2_hp_last'),
 (np.float64(0.010540000000000039), 'p1_base_atk')]