In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import networkx as nx
import pandas as pd
import os
from tqdm import tqdm
from multiprocessing import Pool
import pickle
import random
import torch 
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

pd.set_option('display.max_columns', None)

# 1. Chargement et préparation

df = pd.read_csv("data/test2.csv", encoding="utf-8")
df[df["j1"] == df["j2"]].index
df.drop(index=df[df["j1"] == df["j2"]].index,inplace=True)
def fonction_un_nom(df):
    # Paramètres Elo
    starting_elo = 1500
    K = 32  # Facteur de sensibilité

    # Nettoyer et normaliser la colonne 'surface'
    df['surface'] = df['surface'].str.strip().str.upper()

    # Mapping pour les surfaces
    surface_mapping = {
        "DUR": 1,
        "TERRE BATTUE": 2,
        "DUR (INDOOR)": 3,
        "GAZON": 4
    }
    df['surface_encoded'] = df['surface'].map(surface_mapping)
    all_tournois = df["tournament"].unique()
    tourn_encoder = {tournois: idx for idx, tournois in enumerate(all_tournois)}

    df["tournament_enc"] = df["tournament"].map(tourn_encoder)

    # Dictionnaire global d'Elo
    elo_ratings = {}
    # Dictionnaires pour l'Elo par surface : 
    # Pour chaque surface, on initialise un dictionnaire pour stocker les Elo des joueurs.
    elo_surface_ratings = {surf: {} for surf in surface_mapping.keys()}
    elo_tourna_ratings = {tourn: {} for tourn in tourn_encoder.keys()}

    # Listes pour stocker les valeurs d'Elo AVANT mise à jour pour chaque match
    elo_j1 = []
    elo_j2 = []
    elo_j1_surface = []
    elo_j2_surface = []
    elo_j1_tournoi = []
    elo_j2_tournoi = []
    gain_j1 = []
    gain_j2 = []

    # Fonctions d'accès aux ratings (en cas d'absence, retourne starting_elo)
    def get_elo(player):
        return elo_ratings.get(player, starting_elo)
    
    def get_elo_surface(player, surf):
        return elo_surface_ratings[surf].get(player, starting_elo)
    def get_elo_tournois(player, tourn):
        return elo_tourna_ratings[tourn].get(player, starting_elo)

    # Parcours du DataFrame match par match
    for idx, row in df.iterrows():
        # Récupération des identifiants et de la surface
        player1 = row["j1"]
        player2 = row["j2"]
        winner = row["winner"]
        surf = row["surface"]  # par exemple "DUR", "TERRE BATTUE", etc.
        tourn = row["tournament"]
        # ---------------------------
        # Mise à jour de l'Elo global
        # ---------------------------
        current_R1 = get_elo(player1)
        current_R2 = get_elo(player2)
        # Stocker les ratings avant mise à jour pour ce match
        elo_j1.append(current_R1)
        elo_j2.append(current_R2)
        # Calcul des scores attendus
        E1 = 1 / (1 + 10 ** ((current_R2 - current_R1) / 400))
        E2 = 1 / (1 + 10 ** ((current_R1 - current_R2) / 400))
        # Scores réels : 1 pour la victoire, 0 pour la défaite
        S1 = 1 if winner == player1 else 0
        S2 = 1 if winner == player2 else 0
        gain1 = K * (S1 - E1)
        gain2 = K * (S2 - E2)
        gain_j1.append(gain1)
        gain_j2.append(gain2)

        # Mise à jour globale
        new_R1 = current_R1 + K * (S1 - E1)
        new_R2 = current_R2 + K * (S2 - E2)
        # Actualiser le dictionnaire
        elo_ratings[player1] = new_R1
        elo_ratings[player2] = new_R2

        # ---------------------------
        # Mise à jour de l'Elo par surface
        # ---------------------------
        current_R1_surf = get_elo_surface(player1, surf)
        current_R2_surf = get_elo_surface(player2, surf)
        elo_j1_surface.append(current_R1_surf)
        elo_j2_surface.append(current_R2_surf)
        # Calcul des scores attendus pour la surface
        E1_surf = 1 / (1 + 10 ** ((current_R2_surf - current_R1_surf) / 400))
        E2_surf = 1 / (1 + 10 ** ((current_R1_surf - current_R2_surf) / 400))
        # Mise à jour par surface
        new_R1_surf = current_R1_surf + K * (S1 - E1_surf)
        new_R2_surf = current_R2_surf + K * (S2 - E2_surf)
        # Actualiser le dictionnaire pour la surface correspondante
        elo_surface_ratings[surf][player1] = new_R1_surf
        elo_surface_ratings[surf][player2] = new_R2_surf


        current_R1_tourn = get_elo_tournois(player1, tourn)
        current_R2_tourn = get_elo_tournois(player2, tourn)
        elo_j1_tournoi.append(current_R1_tourn)
        elo_j2_tournoi.append(current_R2_tourn)
        # Calcul des scores attendus pour le tournoi
        E1_tourn = 1 / (1 + 10 ** ((current_R2_tourn - current_R1_tourn) / 400))
        E2_tourn = 1 / (1 + 10 ** ((current_R1_tourn - current_R2_tourn) / 400))
        new_R1_tourn = current_R1_tourn + K * (S1 - E1_tourn)
        new_R2_tourn = current_R2_tourn + K * (S2 - E2_tourn)
        elo_tourna_ratings[tourn][player1] = new_R1_tourn
        elo_tourna_ratings[tourn][player2] = new_R2_tourn
    # Ajout des colonnes d'Elo au DataFrame
    df["elo_j1"] = elo_j1
    df["elo_j2"] = elo_j2
    df["elo_j1_surface"] = elo_j1_surface
    df["elo_j2_surface"] = elo_j2_surface
    df["elo_j1_tourn"] = elo_j1_tournoi
    df["elo_j2_tourn"] = elo_j2_tournoi
    df["gain_j1"] = gain_j1
    df["gain_j2"] = gain_j2

    return df


#############################################
# 2. Préparation des données
#############################################
df['target'] = (df['winner'] == df['j1']).astype(int)
print(df["target"].value_counts())
# 2) Comptage
# 2) Comptage initial
n_pos = (df["target"] == 1).sum()
n_neg = (df["target"] == 0).sum()

# 3) Calcule combien inverser pour équilibrer
#    Chaque inversion retire 1 de la classe majoritaire et ajoute 1 à la minoritaire.
#    Il en faut donc k = (|n_pos - n_neg|) // 2 inversions.
if n_pos > n_neg:
    maj_cls     = 1
    inv_to_cls  = 0
    diff        = n_pos - n_neg
else:
    maj_cls     = 0
    inv_to_cls  = 1
    diff        = n_neg - n_pos

k = diff // 2

# 4) Tire au hasard k index de la classe majoritaire
idx_to_invert = df[df["target"] == maj_cls].sample(n=k, random_state=42).index

# 5) Pour ces indices, réalise l’inversion en place
#    - swap j1/j2 et toutes les colonnes “signées”
#    - flip target
swap_pairs = [
    ("j1", "j2"),
    ("score_j1", "score_j2"),
    ("Doubles_fautes_j1", "Doubles_fautes_j2"),
    ("%_1er_Service_j1", "%_1er_Service_j2"),
    ("Jeux_de_Serv._Gagnés_j1", "Jeux_de_Serv._Gagnés_j2"),
    ("rank1", "rank2"),
    ("age1", "age2"),
    ("point1", "point2"),
    ("elo_j1", "elo_j2"),
    ("%_1er_Service_j1_perc", "%_1er_Service_j2_perc"),
]

# On travaille sur une copie des lignes à inverser
df_inv = df.loc[idx_to_invert].copy()

# Swap de chaque paire
for c1, c2 in swap_pairs:
    df_inv[c1], df_inv[c2] = df_inv[c2].values, df_inv[c1].values

# Et inversion du target
df_inv["target"] = inv_to_cls

# 6) Remplace en place les lignes originales par leur version inversée
df.loc[idx_to_invert, :] = df_inv

# 7) Vérifie la nouvelle distribution
print(df["target"].value_counts())
# Doit afficher quasiment 50/50 (éventuellement 1 de différence si diff était impair)

target
1    39350
0    33845
Name: count, dtype: int64
target
1    36598
0    36597
Name: count, dtype: int64


  df.loc[idx_to_invert, :] = df_inv


In [3]:
df = fonction_un_nom(df)
df.drop(columns=[col for col in df.columns if "Unnamed:" in col], inplace=True)
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.sort_values('date').reset_index(drop=True)


# 2. Encodage joueurs
all_players = pd.concat([df["j1"], df["j2"]]).unique()
player_encoder = {player: idx for idx, player in enumerate(all_players)}
df["j1_enc"] = df["j1"].map(player_encoder)
df["j2_enc"] = df["j2"].map(player_encoder)

df["%_1er_Service_j1"] = df["%_1er_Service_j1"].str.replace('%', '').astype(float) / 100
df["%_1er_Service_j2"] = df["%_1er_Service_j2"].str.replace('%', '').astype(float) / 100
import re
import numpy as np
import pandas as pd

def extract_stats(x):
    if isinstance(x, str):
        match = re.match(r"\(?(\d+)\s*/\s*(\d+)\)?", x)
        if match:
            num = int(match.group(1))
            denom = int(match.group(2))
            ratio = num / denom if denom != 0 else np.nan
            return pd.Series([num, ratio])
    return pd.Series([np.nan, np.nan])

df[["Nb_Jeux_Gagnés_j1", "Ratio_Jeux_Gagnés_j1"]] = df["Jeux_de_Serv._Gagnés_j1"].apply(extract_stats)
df[["Nb_Jeux_Gagnés_j2", "Ratio_Jeux_Gagnés_j2"]] = df["Jeux_de_Serv._Gagnés_j2"].apply(extract_stats)

df


Unnamed: 0,href,j1,j2,time,score_j1,score_j2,date,tour,surface,Doubles_fautes_j1,%_1er_Service_j1,Jeux_de_Serv._Gagnés_j1,Doubles_fautes_j2,%_1er_Service_j2,Jeux_de_Serv._Gagnés_j2,rank1,rank2,age1,age2,point1,point2,tournament,%_1er_Service_j1_perc,%_1er_Service_j2_perc,winner,elo_j1,elo_j2,surface_encoded,tour_encoded,target,tournament_enc,elo_j1_surface,elo_j2_surface,elo_j1_tourn,elo_j2_tourn,gain_j1,gain_j2,j1_enc,j2_enc,Nb_Jeux_Gagnés_j1,Ratio_Jeux_Gagnés_j1,Nb_Jeux_Gagnés_j2,Ratio_Jeux_Gagnés_j2
0,https://www.flashscore.fr/match/tennis/f5tUgyP...,Gambill Jan Michael,Kratochvil Michel,,2,0.0,2003-01-06,1/16 DE FINALE,DUR,,,,,,,901.0,70.0,26.116996,23.0,0.0,580.0,auckland,,,Gambill Jan Michael,1500.000000,1500.000000,1,1,1,0,1500.000000,1500.000000,1500.000000,1500.000000,16.000000,-16.000000,0,54,,,,
1,https://www.flashscore.fr/match/tennis/IyDIHaY...,Chela Juan Ignacio,Costa Albert,,2,0.0,2003-01-06,1/16 DE FINALE,DUR,,,,,,,23.0,8.0,23.000000,27.0,1240.0,2090.0,sydney,,,Chela Juan Ignacio,1500.000000,1500.000000,1,1,1,1,1500.000000,1500.000000,1500.000000,1500.000000,16.000000,-16.000000,1,70,,,,
2,https://www.flashscore.fr/match/tennis/d0Z6liB...,Srichaphan Paradorn,Kucera Karol,,2,0.0,2003-01-06,FINALE,DUR,,,,,,,14.0,75.0,23.000000,28.0,1701.0,528.0,chennai,,,Srichaphan Paradorn,1516.000000,1500.000000,1,7,1,4,1516.000000,1500.000000,1500.000000,1500.000000,15.263693,-15.263693,2,56,,,,
3,https://www.flashscore.fr/match/tennis/dp3hBcB...,Ferrero Juan Carlos,Crabb Jaymon,,2,1.0,2003-01-06,1/16 DE FINALE,DUR,,,,,,,4.0,217.0,22.000000,24.0,2740.0,144.0,sydney,,,Ferrero Juan Carlos,1500.000000,1500.000000,1,1,1,1,1500.000000,1500.000000,1500.000000,1500.000000,16.000000,-16.000000,3,71,,,,
4,https://www.flashscore.fr/match/tennis/zy2dAHQ...,Fish Mardy,Krajicek Richard,,2,0.0,2003-01-06,1/16 DE FINALE,DUR,,,,,,,83.0,92.0,21.000000,31.0,481.0,425.0,sydney,,,Fish Mardy,1500.000000,1500.000000,1,1,1,1,1500.000000,1500.000000,1500.000000,1500.000000,16.000000,-16.000000,4,53,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73190,https://www.flashscore.fr/match/tennis/phWkfwl...,Fritz Taylor,Walton Adam,95.0,2,0.0,2025-03-26,1/8 DE FINALE,DUR,,,,,,,901.0,89.0,26.116996,25.0,0.0,666.0,miami,,,Fritz Taylor,1963.742798,1619.617605,1,4,1,19,1891.432835,1588.156022,1580.243905,1546.973825,3.878984,-3.878984,1614,2170,,,,
73191,https://www.flashscore.fr/match/tennis/Qqez08K...,Machac Tomas,Mensik Jakub,,-,,2025-03-26,1/8 DE FINALE,DUR,,,,,,,21.0,54.0,24.000000,19.0,2310.0,1042.0,miami,,,Mensik Jakub,1971.470888,1863.987126,1,4,0,19,1922.496290,1809.830750,1580.181492,1552.774802,-20.797722,20.797722,1959,2054,,,,
73192,https://www.flashscore.fr/match/tennis/hpjLmie...,Zverev Alexander,Arthur Fils,120.0,1,2.0,2025-03-26,1/8 DE FINALE,DUR,0.0,0.69,(11/14),2.0,0.68,(12/14),2.0,18.0,27.000000,20.0,7945.0,2480.0,miami,69.0,68.0,Arthur Fils,1987.291008,1863.996674,1,4,0,19,1969.080642,1787.065875,1650.705851,1520.757984,-21.450993,21.450993,1345,2027,11.0,0.785714,12.0,0.857143
73193,https://www.flashscore.fr/match/tennis/xIyRBlU...,Cerundolo Francisco,Dimitrov Grigor,170.0,1,2.0,2025-03-26,QUARTS DE FINALE,DUR,1.0,0.63,(14/17),3.0,0.61,(15/17),24.0,15.0,26.000000,33.0,1925.0,2745.0,miami,63.0,61.0,Dimitrov Grigor,1877.160675,1945.924840,1,3,0,19,1753.871053,1898.571384,1606.642521,1633.100244,-12.874004,12.874004,1893,596,14.0,0.823529,15.0,0.882353


In [4]:
df['target'].value_counts()

target
1    36598
0    36597
Name: count, dtype: int64

In [5]:






# On suppose que df est déjà chargé et préparé avec les colonnes de base :
# date, j1, j2, target (1 si j1 gagne, 0 sinon), etc.

# Liste des fenêtres que l'on souhaite pour calculer le rolling winrate
import pandas as pd
import numpy as np

# Liste des fenêtres des rolling winrates
windows = [3, 5, 10, 25, 50]

def compute_history_features_with_streak(df, windows):
    # Initialisation comme avant
    j1_winrate_features = {f'j1_winrate_{w}m': [] for w in windows}
    j2_winrate_features = {f'j2_winrate_{w}m': [] for w in windows}
    
    j1_dfaults_features = {f'j1_dfaults_avg_{w}m': [] for w in windows}
    j2_dfaults_features = {f'j2_dfaults_avg_{w}m': [] for w in windows}

    j1_nb_prev = []
    j2_nb_prev = []
    j1_streak = []
    j2_streak = []

    history = {}
    double_faults_hist = {}  # nouveau dictionnaire


    def get_streak(hist):
        if not hist:
            return 0
        streak = 0
        last_result = hist[-1]
        for r in reversed(hist):
            if r == last_result:
                streak += 1
            else:
                break
        return streak if last_result == 1 else -streak

    for idx, row in df.iterrows():
        player1 = row["j1"]
        player2 = row["j2"]

        hist1 = history.get(player1, [])
        hist2 = history.get(player2, [])

        dfault1_hist = double_faults_hist.get(player1, [])
        dfault2_hist = double_faults_hist.get(player2, [])

        nb_prev1 = len(hist1)
        nb_prev2 = len(hist2)
        j1_nb_prev.append(nb_prev1)
        j2_nb_prev.append(nb_prev2)

        for w in windows:
            # Winrate
            winrate1 = np.mean(hist1[-w:]) if hist1 else -1
            winrate2 = np.mean(hist2[-w:]) if hist2 else -1
            j1_winrate_features[f'j1_winrate_{w}m'].append(winrate1)
            j2_winrate_features[f'j2_winrate_{w}m'].append(winrate2)

            # Moyenne des double fautes (en excluant les -1)
            def compute_dfault_avg(hist):
                valid = [x for x in hist[-w:] if x != -1]
                return np.mean(valid) if valid else -1  # ou np.nan selon préférence

            j1_dfaults_features[f'j1_dfaults_avg_{w}m'].append(compute_dfault_avg(dfault1_hist))
            j2_dfaults_features[f'j2_dfaults_avg_{w}m'].append(compute_dfault_avg(dfault2_hist))

        current_streak1 = get_streak(hist1)
        current_streak2 = get_streak(hist2)
        j1_streak.append(current_streak1)
        j2_streak.append(current_streak2)

        result_j1 = row["target"]
        result_j2 = 1 - result_j1
        history[player1] = hist1 + [result_j1]
        history[player2] = hist2 + [result_j2]

        # Mettre à jour les historiques de double fautes
        df1 = row.get("Double_Fautes_j1", -1)
        df2 = row.get("Double_Fautes_j2", -1)
        double_faults_hist[player1] = dfault1_hist + [df1]
        double_faults_hist[player2] = dfault2_hist + [df2]

    # Ajout des features
    for w in windows:
        df[f'j1_winrate_{w}m'] = j1_winrate_features[f'j1_winrate_{w}m']
        df[f'j2_winrate_{w}m'] = j2_winrate_features[f'j2_winrate_{w}m']
        df[f'j1_dfaults_avg_{w}m'] = j1_dfaults_features[f'j1_dfaults_avg_{w}m']
        df[f'j2_dfaults_avg_{w}m'] = j2_dfaults_features[f'j2_dfaults_avg_{w}m']

    df['j1_nb_prev_matches'] = j1_nb_prev
    df['j2_nb_prev_matches'] = j2_nb_prev
    df['j1_streak'] = j1_streak
    df['j2_streak'] = j2_streak

    return df


# Exemple d'utilisation

# Conversion de la colonne "date" en datetime et tri par date
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').reset_index(drop=True)

# Appliquer la fonction pour ajouter les features de forme (winrate, nb_prev_matches et streak)
df = compute_history_features_with_streak(df, windows)

# Affichage des colonnes d'intérêt pour vérifier

def compute_h2h_features(df):
    h2h_counts = {}
    h2h_winrate_j1 = []
    h2h_total_prev = []  # Liste pour le nombre de matchs précédents pour le matchup

    # Parcourir chaque match dans l'ordre chronologique
    # (Assurez-vous que df est trié par date avant d'appeler cette fonction)
    for idx, row in df.iterrows():
        p1, p2 = row["j1"], row["j2"]
        # Créer une clé unique et ordonnée pour la confrontation
        matchup = tuple(sorted([p1, p2]))
        
        # Initialisation si le matchup n'existe pas encore
        if matchup not in h2h_counts:
            h2h_counts[matchup] = {"total": 0, p1: 0, p2: 0}

        # Récupérer le nombre de matchs déjà joués entre ces deux joueurs
        total = h2h_counts[matchup]["total"]
        # Stocker ce total avant la mise à jour pour ce match
        h2h_total_prev.append(total)
        
        # Récupérer le nombre de victoires de j1 dans ces confrontations
        p1_wins = h2h_counts[matchup][p1]
        # Calculer le winrate précédent de j1 (avant ce match)
        winrate = p1_wins / total if total > 0 else np.nan
        h2h_winrate_j1.append(winrate)

        # Mise à jour après le match courant
        winner = row["winner"]
        h2h_counts[matchup]["total"] += 1
        if winner not in h2h_counts[matchup]:
            raise ValueError(f"Le nom '{winner}' du winner n'est pas dans le matchup {matchup}")
        h2h_counts[matchup][winner] += 1

    # Ajout des features dans le DataFrame
    df["h2h_winrate_j1"] = h2h_winrate_j1
    df["h2h_total_prev"] = h2h_total_prev

    return df



df = compute_h2h_features(df)
df["score_j1"].replace('-',"0",inplace=True)
df["score_j2"].replace('-',"0",inplace=True)
df["score_j1"] = pd.to_numeric(df["score_j1"], errors="coerce")
df["score_j2"] = pd.to_numeric(df["score_j2"], errors="coerce")
df["set_diff"] = df["score_j1"] - df["score_j2"]
df["h2h_winrate_j1"].fillna(-1,inplace=True)
def best_of(df):
    best = []
    for idx, row in df.iterrows():
        s1, s2 = float(row["score_j1"]), float(row["score_j2"])
        if max(s1,s2) == 2:
            best.append(3)
        elif max(s1,s2) == 3:
            best.append(5)
        elif max(s1,s2) == 4:
            best.append(7)
        else:
            best.append(-1)
    df["best_of"] = best
    return df


df = best_of(df)
df.drop(index=df[df["best_of"] ==-1].index, inplace=True)
from sklearn.preprocessing import StandardScaler, MinMaxScaler

df = df.copy()

# Colonnes par paires
paired_cols = [
    ('age1', 'age2'),
    ('point1', 'point2'),
    ('elo_j1', 'elo_j2'),
    ('elo_j1_surface', 'elo_j2_surface'),
    ('elo_j1_tourn', 'elo_j2_tourn'),
    ('rank1', 'rank2'),
    ('gain_j1','gain_j2'),
    ("j1_nb_prev_matches","j2_nb_prev_matches")
]

# Normalisation paire par paire
for col1, col2 in paired_cols:
    stacked = np.hstack([df[col1].values.reshape(-1, 1), df[col2].values.reshape(-1, 1)])
    scaler = StandardScaler()
    scaled = scaler.fit_transform(stacked)
    df[col1] = scaled[:, 0]
    df[col2] = scaled[:, 1]

df['diff_rank'] = df['rank1'] - df['rank2']
df['diff_elo']  = df['elo_j1'] - df['elo_j2']
df['diff_age']  = df['age1'] - df['age2']
df['diff_points'] = df['point1'] - df['point2']
df['diff_streak'] = df['j1_streak'] - df['j2_streak']
df['diff_elo_surf']  = df['elo_j1_surface'] - df['elo_j2_surface']
df['diff_elo_tourn']  = df['elo_j1_tourn'] - df['elo_j2_tourn']
# Vous pouvez aussi ajouter d'autres ratios ou interactions
df['ratio_rank'] = df['rank1'] / (df['rank2'] + 1e-6)  # évite la division par zéro


df



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["score_j1"].replace('-',"0",inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["score_j2"].replace('-',"0",inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

Unnamed: 0,href,j1,j2,time,score_j1,score_j2,date,tour,surface,Doubles_fautes_j1,%_1er_Service_j1,Jeux_de_Serv._Gagnés_j1,Doubles_fautes_j2,%_1er_Service_j2,Jeux_de_Serv._Gagnés_j2,rank1,rank2,age1,age2,point1,point2,tournament,%_1er_Service_j1_perc,%_1er_Service_j2_perc,winner,elo_j1,elo_j2,surface_encoded,tour_encoded,target,tournament_enc,elo_j1_surface,elo_j2_surface,elo_j1_tourn,elo_j2_tourn,gain_j1,gain_j2,j1_enc,j2_enc,Nb_Jeux_Gagnés_j1,Ratio_Jeux_Gagnés_j1,Nb_Jeux_Gagnés_j2,Ratio_Jeux_Gagnés_j2,j1_winrate_3m,j2_winrate_3m,j1_dfaults_avg_3m,j2_dfaults_avg_3m,j1_winrate_5m,j2_winrate_5m,j1_dfaults_avg_5m,j2_dfaults_avg_5m,j1_winrate_10m,j2_winrate_10m,j1_dfaults_avg_10m,j2_dfaults_avg_10m,j1_winrate_25m,j2_winrate_25m,j1_dfaults_avg_25m,j2_dfaults_avg_25m,j1_winrate_50m,j2_winrate_50m,j1_dfaults_avg_50m,j2_dfaults_avg_50m,j1_nb_prev_matches,j2_nb_prev_matches,j1_streak,j2_streak,h2h_winrate_j1,h2h_total_prev,set_diff,best_of,diff_rank,diff_elo,diff_age,diff_points,diff_streak,diff_elo_surf,diff_elo_tourn,ratio_rank
0,https://www.flashscore.fr/match/tennis/f5tUgyP...,Gambill Jan Michael,Kratochvil Michel,,2.0,0.0,2003-01-06,1/16 DE FINALE,DUR,,,,,,,2.664051,-0.425207,-0.014184,-0.810997,-0.647076,-0.324446,auckland,,,Gambill Jan Michael,-0.996602,-0.994111,1,1,1,0,-0.814535,-0.817937,-0.475487,-0.477416,1.107542,-1.107542,0,54,,,,,-1.000000,-1.000000,-1,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1.00,-1.00,-1,-1,-1.00,-1.000000,-1,-1,-0.943799,-0.941649,0,0,-1.000,0,2.0,3,3.089259,-0.002490,0.796814,-0.322630,0,0.003402,0.001929,-6.265312
1,https://www.flashscore.fr/match/tennis/CtyZhev...,Ginepri Robby,Calleri Agustin,,2.0,0.0,2003-01-06,1/16 DE FINALE,DUR,,,,,,,-0.295991,-0.500113,-1.607876,-0.033530,-0.425828,-0.220233,auckland,,,Ginepri Robby,-0.996602,-0.994111,1,1,1,0,-0.814535,-0.817937,-0.475487,-0.477416,1.107542,-1.107542,20,110,,,,,-1.000000,-1.000000,-1,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1.00,-1.00,-1,-1,-1.00,-1.000000,-1,-1,-0.943799,-0.941649,0,0,-1.000,0,2.0,3,0.204122,-0.002490,-1.574346,-0.205595,0,0.003402,0.001929,0.591849
2,https://www.flashscore.fr/match/tennis/29lkCwd...,Ferreira Wayne,Enqvist Thomas,,2.0,0.0,2003-01-06,1/16 DE FINALE,DUR,,,,,,,-0.521976,-0.515094,1.258010,0.484782,-0.153219,-0.188969,sydney,,,Ferreira Wayne,-0.996602,-0.994111,1,1,1,1,-0.814535,-0.817937,-0.475487,-0.477416,1.107542,-1.107542,21,48,,,,,-1.000000,-1.000000,-1,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1.00,-1.00,-1,-1,-1.00,-1.000000,-1,-1,-0.943799,-0.941649,0,0,-1.000,0,2.0,3,-0.006882,-0.002490,0.773228,0.035750,0,0.003402,0.001929,1.013363
3,https://www.flashscore.fr/match/tennis/OIkoDJt...,Davydenko Nikolay,Durek Raphael,,2.0,0.0,2003-01-06,1/16 DE FINALE,DUR,,,,,,,-0.447883,2.687111,-1.347341,-0.003210,-0.287548,-0.727402,sydney,,,Davydenko Nikolay,-0.996602,-0.994111,1,1,1,1,-0.814535,-0.817937,-0.475487,-0.477416,1.107542,-1.107542,22,2290,,,,,-1.000000,-1.000000,-1,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1.00,-1.00,-1,-1,-1.00,-1.000000,-1,-1,-0.943799,-0.941649,0,0,-1.000,0,2.0,3,-3.134993,-0.002490,-1.344131,0.439853,0,0.003402,0.001929,-0.166678
4,https://www.flashscore.fr/match/tennis/MirMecf...,Coria Guillermo,Sanchez Munoz David,,2.0,1.0,2003-01-06,1/16 DE FINALE,DUR,,,,,,,-0.507157,2.687111,-1.607876,-0.003210,-0.207967,-0.727402,auckland,,,Coria Guillermo,-0.996602,-0.994111,1,1,1,0,-0.814535,-0.817937,-0.475487,-0.477416,1.107542,-1.107542,23,90,,,,,-1.000000,-1.000000,-1,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1.00,-1.00,-1,-1,-1.00,-1.000000,-1,-1,-0.943799,-0.941649,0,0,-1.000,0,1.0,3,-3.194268,-0.002490,-1.604666,0.519435,0,0.003402,0.001929,-0.188737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73189,https://www.flashscore.fr/match/tennis/OhGugwd...,Musetti Lorenzo,Djokovic Novak,83.0,0.0,2.0,2025-03-25,1/8 DE FINALE,DUR,3.0,0.70,(3/8),1.0,0.70,(7/8),-0.614593,-0.668650,-0.826270,2.817185,0.848604,1.954339,miami,70.0,70.0,Djokovic Novak,1.206802,2.863781,1,4,0,19,0.909823,3.558285,0.281546,5.806314,-0.351965,0.351965,1903,388,3.0,0.375000,7.0,0.875000,0.666667,0.666667,-1,-1,0.6,0.6,-1,-1,0.6,0.7,-1,-1,0.60,0.80,-1,-1,0.68,0.760000,-1,-1,0.296305,5.328785,2,2,0.125,8,-2.0,3,0.054057,-1.656979,-3.643455,-1.105735,0,-2.648462,-5.524768,0.919157
73190,https://www.flashscore.fr/match/tennis/hpjLmie...,Zverev Alexander,Arthur Fils,120.0,1.0,2.0,2025-03-26,1/8 DE FINALE,DUR,0.0,0.69,(11/14),2.0,0.68,(12/14),-0.666459,-0.619961,0.215870,-1.588465,3.837141,0.995582,miami,69.0,68.0,Arthur Fils,1.767352,1.137503,1,4,0,19,2.263737,1.141871,2.159815,-0.101596,-1.425960,1.425960,1345,2027,11.0,0.785714,12.0,0.857143,0.666667,0.666667,-1,-1,0.6,0.8,-1,-1,0.6,0.6,-1,-1,0.72,0.68,-1,-1,0.72,0.600000,-1,-1,2.237983,-0.306030,2,2,0.750,4,-1.0,3,-0.046497,0.629849,1.804335,2.841559,0,1.121865,2.261411,1.075002
73191,https://www.flashscore.fr/match/tennis/xIyRBlU...,Cerundolo Francisco,Dimitrov Grigor,170.0,1.0,2.0,2025-03-26,QUARTS DE FINALE,DUR,1.0,0.63,(14/17),3.0,0.61,(15/17),-0.584956,-0.631197,-0.044665,1.780561,0.439408,1.179691,miami,63.0,61.0,Dimitrov Grigor,1.142684,1.617285,1,3,0,19,0.851456,1.903123,1.389306,1.932345,-0.845740,0.845740,1893,596,14.0,0.823529,15.0,0.882353,1.000000,1.000000,-1,-1,0.8,0.8,-1,-1,0.8,0.5,-1,-1,0.68,0.64,-1,-1,0.62,0.660000,-1,-1,0.103620,2.887200,3,3,0.000,1,-1.0,3,0.046241,-0.474602,-1.825226,-0.740283,0,-1.051667,-0.543039,0.926742
73192,https://www.flashscore.fr/match/tennis/phWkfwl...,Fritz Taylor,Walton Adam,95.0,2.0,0.0,2025-03-26,1/8 DE FINALE,DUR,,,,,,,2.664051,-0.354047,-0.014184,-0.292686,-0.647076,-0.264697,miami,,,Fritz Taylor,1.633784,-0.293614,1,4,1,19,1.754184,-0.216093,0.927690,0.373038,0.287574,-0.287574,1614,2170,,,,,0.666667,0.666667,-1,-1,0.8,0.6,-1,-1,0.6,0.7,-1,-1,0.64,0.44,-1,-1,0.70,0.466667,-1,-1,1.417834,-0.714642,2,2,-1.000,0,2.0,3,3.018099,1.927399,0.278502,-0.382379,0,1.970277,0.554651,-7.524583


In [6]:
import torch
from torch_geometric.data import TemporalData
from torch_geometric.loader import TemporalDataLoader
cols_to_drop = [
    "surface", "href", "j1", "j2", "time", "score_j1", "score_j2", "tour",
    "Doubles_fautes_j1", "%_1er_Service_j1", "Jeux_de_Serv._Gagnés_j1",
    "Doubles_fautes_j2", "%_1er_Service_j2", "Jeux_de_Serv._Gagnés_j2",
    "tournament", "%_1er_Service_j1_perc", "%_1er_Service_j2_perc", "winner"
]

# option 1 – get a new DataFrame
df_clean = df.drop(columns=cols_to_drop)
df["timestamp"] = pd.to_datetime(df["date"]).astype(int) // 10**9
print(len(df[df["j1"]==df["j2"]]))



# 3. Définir les colonnes utilisées comme features (msg)
features_cols = [
    "diff_rank", "diff_elo", "diff_age", "diff_points", "diff_elo_surf","diff_streak","surface_encoded","tour_encoded","tournament_enc",
    "h2h_winrate_j1", "h2h_total_prev", "best_of",
]
# 1) Définissez un mapping entre les noms “1” et les noms “2”
swap_map = {
    'rank1': 'rank2', 'rank2': 'rank1',
    "j1_enc" : "j2_enc","j2_enc" : "j1_enc",
    'age1': 'age2',   'age2': 'age1',
    'point1': 'point2','point2': 'point1',
    'elo_j1': 'elo_j2', 'elo_j2': 'elo_j1',
    'elo_j1_surface': 'elo_j2_surface','elo_j2_surface': 'elo_j1_surface',
    'elo_j1_tourn': 'elo_j2_tourn','elo_j2_tourn': 'elo_j1_tourn',
    'j1_winrate_3m':'j2_winrate_3m','j2_winrate_3m':'j1_winrate_3m',
    'j1_winrate_5m':'j2_winrate_5m','j2_winrate_5m':'j1_winrate_5m',
    'j1_winrate_10m':'j2_winrate_10m','j2_winrate_10m':'j1_winrate_10m',
    'j1_winrate_25m':'j2_winrate_25m','j2_winrate_25m':'j1_winrate_25m',
    'j1_winrate_50m':'j2_winrate_50m','j2_winrate_50m':'j1_winrate_50m',
    'j1_nb_prev_matches':'j2_nb_prev_matches','j2_nb_prev_matches':'j1_nb_prev_matches',
    'j1_streak':'j2_streak','j2_streak':'j1_streak',
}

# 2) Renommez vos colonnes d'après ce mapping
df_swapped = df.rename(columns=swap_map).copy()
df_swapped['target'] = 1 - df_swapped['target']
df_swapped['h2h_winrate_j1'] = 1- df_swapped['h2h_winrate_j1']
df_swapped["diff_elo"] = df_swapped["diff_elo"]*(-1)
df_swapped["diff_age"] = df_swapped["diff_age"]*(-1)
df_swapped["diff_points"] = df_swapped["diff_points"]*(-1)
df_swapped["diff_elo_surf"] = df_swapped["diff_elo_surf"]*(-1)
df_swapped["diff_elo_tourn"] = df_swapped["diff_elo_tourn"]*(-1)
df_swapped["diff_rank"] = df_swapped["diff_rank"]*(-1)
df_swapped["diff_streak"] = df_swapped["diff_streak"]*(-1)
df_swapped["set_diff"] = df_swapped["set_diff"]*(-1)
df_swapped["ratio_rank"] = 1/df_swapped["ratio_rank"]
df_swapped["gain_j1"] = df_swapped["gain_j1"]*(-1)
df_swapped

df_aug = pd.concat([df, df_swapped], ignore_index=True)
df_aug = df_aug.sort_values('date').reset_index(drop=True)




df[features_cols]



0


Unnamed: 0,diff_rank,diff_elo,diff_age,diff_points,diff_elo_surf,diff_streak,surface_encoded,tour_encoded,tournament_enc,h2h_winrate_j1,h2h_total_prev,best_of
0,3.089259,-0.002490,0.796814,-0.322630,0.003402,0,1,1,0,-1.000,0,3
1,0.204122,-0.002490,-1.574346,-0.205595,0.003402,0,1,1,0,-1.000,0,3
2,-0.006882,-0.002490,0.773228,0.035750,0.003402,0,1,1,1,-1.000,0,3
3,-3.134993,-0.002490,-1.344131,0.439853,0.003402,0,1,1,1,-1.000,0,3
4,-3.194268,-0.002490,-1.604666,0.519435,0.003402,0,1,1,0,-1.000,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...
73189,0.054057,-1.656979,-3.643455,-1.105735,-2.648462,0,1,4,19,0.125,8,3
73190,-0.046497,0.629849,1.804335,2.841559,1.121865,0,1,4,19,0.750,4,3
73191,0.046241,-0.474602,-1.825226,-0.740283,-1.051667,0,1,3,19,0.000,1,3
73192,3.018099,1.927399,0.278502,-0.382379,1.970277,0,1,4,19,-1.000,0,3


In [7]:
df_swapped["h2h_winrate_j1"][df_swapped["h2h_winrate_j1"] == 2] = -1
df_swapped[features_cols]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_swapped["h2h_winrate_j1"][df_swapped["h2h_winrate_j1"] == 2] = -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

Unnamed: 0,diff_rank,diff_elo,diff_age,diff_points,diff_elo_surf,diff_streak,surface_encoded,tour_encoded,tournament_enc,h2h_winrate_j1,h2h_total_prev,best_of
0,-3.089259,0.002490,-0.796814,0.322630,-0.003402,0,1,1,0,-1.000,0,3
1,-0.204122,0.002490,1.574346,0.205595,-0.003402,0,1,1,0,-1.000,0,3
2,0.006882,0.002490,-0.773228,-0.035750,-0.003402,0,1,1,1,-1.000,0,3
3,3.134993,0.002490,1.344131,-0.439853,-0.003402,0,1,1,1,-1.000,0,3
4,3.194268,0.002490,1.604666,-0.519435,-0.003402,0,1,1,0,-1.000,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...
73189,-0.054057,1.656979,3.643455,1.105735,2.648462,0,1,4,19,0.875,8,3
73190,0.046497,-0.629849,-1.804335,-2.841559,-1.121865,0,1,4,19,0.250,4,3
73191,-0.046241,0.474602,1.825226,0.740283,1.051667,0,1,3,19,1.000,1,3
73192,-3.018099,-1.927399,-0.278502,0.382379,-1.970277,0,1,4,19,-1.000,0,3


In [8]:
from torch_geometric.data import TemporalData
from collections import defaultdict
from datetime import datetime

import torch
from collections import defaultdict
from datetime import datetime
from torch_geometric.data import TemporalData

class DateGroupedTemporalLoader:
    def __init__(self, data_list, timestamp_attr='t'):
        self.grouped = defaultdict(list)

        for data in data_list:
            ts = float(getattr(data, timestamp_attr).item())
            date = datetime.fromtimestamp(ts).date().isoformat()
            self.grouped[date].append(data)

        self.sorted_dates = sorted(self.grouped.keys())

    def __iter__(self):
        for date in self.sorted_dates:
            batch_list = self.grouped[date]

            # Concatène tous les attributs nécessaires
            src = torch.cat([d.src for d in batch_list], dim=0)
            dst = torch.cat([d.dst for d in batch_list], dim=0)
            t   = torch.cat([d.t for d in batch_list], dim=0)
            msg = torch.cat([d.msg for d in batch_list], dim=0)
            y   = torch.cat([d.y for d in batch_list], dim=0)
            set_diff = torch.cat([d.set_diff for d in batch_list], dim=0)
            elo_gain = torch.cat([d.elo_gain for d in batch_list], dim=0)
            batch = TemporalData(src=src, dst=dst, t=t, msg=msg, y=y,set_diff = set_diff,elo_gain=elo_gain)
            batch.date = date  # Ajout facultatif pour le suivi
            yield batch

    def __len__(self):
        return len(self.sorted_dates)


import torch
from torch_geometric.data import TemporalData

class UniquePlayerBatchLoader:
    def __init__(self, data_list, timestamp_attr='t'):
        # Tri des événements par timestamp croissant
        self.data_list = sorted(data_list, key=lambda d: float(getattr(d, timestamp_attr).item()))

    def __iter__(self):
        seen_players = set()
        batch_events = []

        for data in self.data_list:
            src = int(data.src.item())
            dst = int(data.dst.item())

            # Si ni src ni dst n'ont déjà été vus, on ajoute
            if src not in seen_players and dst not in seen_players:
                batch_events.append(data)
                seen_players.update([src, dst])
            else:
                # Sinon, on yield le batch courant, on réinitialise, et on reprend avec ce data
                if batch_events:
                    yield self._make_batch(batch_events)
                batch_events = [data]
                seen_players = {src, dst}

        # N'oublie pas de yield le dernier petit batch
        if batch_events:
            yield self._make_batch(batch_events)

    def __len__(self):
        # Impossible de connaître à l'avance le nombre exact de batches sans exécuter l'itérateur
        raise NotImplementedError("La longueur exacte dépend de la structure des données et doit être calculée à la volée.")

    def _make_batch(self, event_list):
        # Concatène manuellement les champs en un TemporalData
        src = torch.cat([d.src for d in event_list], dim=0)
        dst = torch.cat([d.dst for d in event_list], dim=0)
        t   = torch.cat([d.t   for d in event_list], dim=0)
        msg = torch.cat([d.msg for d in event_list], dim=0)
        y   = torch.cat([d.y   for d in event_list], dim=0)
        set_diff = torch.cat([d.set_diff for d in event_list], dim=0)
        elo_gain = torch.cat([d.elo_gain for d in event_list], dim=0)
        batch = TemporalData(src=src, dst=dst, t=t, msg=msg, y=y,set_diff = set_diff,elo_gain=elo_gain)
        return batch

# 4. Split temporel
train_df = df[df["date"] < "2024-01-01"].copy()
train_df2 = df_swapped[df_swapped["date"] < "2024-01-01"].copy()
test_df  = df[df["date"] >= "2024-01-01"].copy()
test_df2  = df_swapped[df_swapped["date"] >= "2024-01-01"].copy()
print(len(train_df))
print(len(test_df))
# Concaténer j1_enc et j2_enc, puis compter les occurrences



# 5. Création des TemporalData
def build_temporal_data(df_subset):
    return TemporalData(
        src = torch.tensor(df_subset["j1_enc"].values, dtype=torch.long),
        dst = torch.tensor(df_subset["j2_enc"].values, dtype=torch.long),
        t   = torch.tensor(df_subset["timestamp"].values, dtype=torch.long),
        msg = torch.tensor(df_subset[features_cols].values, dtype=torch.float),
        y   = torch.tensor(df_subset["target"].values, dtype=torch.float),
        set_diff  = torch.tensor(df_subset["set_diff"].values,  dtype=torch.float),
        elo_gain  = torch.tensor(df_subset["gain_j1"].values,  dtype=torch.float),
    )
full_data = build_temporal_data(df)
full_data2 = build_temporal_data(df_swapped)
train_data = build_temporal_data(train_df)
train_data2 = build_temporal_data(train_df2)
test_data  = build_temporal_data(test_df)
test_data2  = build_temporal_data(test_df2)


for data in (full_data,full_data2,train_data,train_data2, test_data,test_data2):
    data.src = data.src.long()    # src en ints longs
    data.dst = data.dst.long()    # dst en ints longs
    data.t   = data.t.long()     # timestamps en floats
    data.msg = data.msg.float()   # features en floats
    data.y   = data.y.float()     # labels en floats
    data.set_diff = data.set_diff.float()
    data.elo_gain = data.elo_gain.float()
# 4) Déplacer t et msg sur GPU
device = "cuda"
for data in (full_data,full_data2,train_data,train_data2, test_data,test_data2):
    data.t   = data.t.to(device)
    data.msg = data.msg.to(device)
# 6. DataLoaders pour entraînement
train_loader = TemporalDataLoader(train_data, batch_size=32, neg_sampling_ratio=0)
test_loader  = TemporalDataLoader(test_data, batch_size=32, neg_sampling_ratio=0)
train_loader2 = TemporalDataLoader(train_data2, batch_size=32, neg_sampling_ratio=0)
test_loader2  = TemporalDataLoader(test_data2, batch_size=32, neg_sampling_ratio=0)

67349
4461


In [9]:

total_events = 0
num_batches = 0

for batch in train_loader:
    print("=== Nouveau batch ===")

    print("src:", batch.src)
    print("dst:", batch.dst)
    print("t:", batch.t)
    print("msg:", batch.msg)
    print("y:", batch.y)
    
    # Si tu as d'autres attributs comme "set_diff", "elo_gain", etc.
    if hasattr(batch, 'set_diff'):
        print("set_diff:", batch.set_diff)
    if hasattr(batch, 'elo_gain'):
        print("elo_gain:", batch.elo_gain)
    if hasattr(batch, 'n_id'):
        print("n_id:", batch.n_id)

    break  # Supprime ce break si tu veux afficher tous les batches

for batch in train_loader2:
    print("=== Nouveau batch ===")

    print("src:", batch.src)
    print("dst:", batch.dst)
    print("t:", batch.t)
    print("msg:", batch.msg)
    print("y:", batch.y)
    
    # Si tu as d'autres attributs comme "set_diff", "elo_gain", etc.
    if hasattr(batch, 'set_diff'):
        print("set_diff:", batch.set_diff)
    if hasattr(batch, 'elo_gain'):
        print("elo_gain:", batch.elo_gain)
    if hasattr(batch, 'n_id'):
        print("n_id:", batch.n_id)

    break  # Supprime ce break si tu veux afficher tous les batches



avg_batch_size = total_events / num_batches if num_batches > 0 else 0
print(f"Moyenne des batchs : {avg_batch_size:.2f} événements par date")


=== Nouveau batch ===
src: tensor([ 0, 20, 21, 22, 23, 24, 25, 19,  0,  2, 27, 28, 29, 31, 26, 18, 22, 16,
         1,  2,  3,  5,  6,  7,  8,  4, 10, 11, 12, 13, 14, 15])
dst: tensor([  54,  110,   48, 2290,   90,  140,   76,   91,   31,   66,   77,   78,
          49,  263,   98,  262,  213,   95,   70,   56,   71,   63,   89,  104,
         141,   53,   74,   50,   69,  123,  179,   96])
t: tensor([1041811200, 1041811200, 1041811200, 1041811200, 1041811200, 1041811200,
        1041811200, 1041811200, 1041811200, 1041811200, 1041811200, 1041811200,
        1041811200, 1041811200, 1041811200, 1041811200, 1041811200, 1041811200,
        1041811200, 1041811200, 1041811200, 1041811200, 1041811200, 1041811200,
        1041811200, 1041811200, 1041811200, 1041811200, 1041811200, 1041811200,
        1041811200, 1041811200], device='cuda:0')
msg: tensor([[ 3.0893e+00, -2.4904e-03,  7.9681e-01, -3.2263e-01,  3.4020e-03,
          0.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00, -1.0000e+00,


In [None]:
import matplotlib.pyplot as plt

from torch_geometric.nn import TGNMemory
from torch_geometric.nn.models.tgn import (
    LastAggregator,
    LastNeighborLoader,
    IdentityMessage
)
from tgn.model import MultiLayerTimeAwareGNN,MessageMLP,WinPredictorMLP,WinPredictor
from tgn.utils import train,evaluate,compute_alpha,train_debug
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Paramètres
memory_dim = 32
time_dim   = 32
embedding_dim = 256
in_channels = 256
hidden_channels = 32
num_layers = 3
heads = 8
dropout= 0.2
hidden_dim = 32
learning_rate_list = [4e-4,5e-4,6e-4,7e-4]
bestval = 0

for learning_rate in learning_rate_list:

    print(f"LR = {learning_rate}")

    num_nodes = max(df[["j1_enc", "j2_enc"]].max()) + 1
    msg_dim = full_data.msg.size(-1)

    memory = TGNMemory(
        num_nodes=num_nodes,
        raw_msg_dim=msg_dim+1,
        memory_dim=memory_dim,
        time_dim=time_dim,
        message_module=MessageMLP(msg_dim+1, memory_dim, time_dim,2*memory_dim),
        aggregator_module=LastAggregator(),
    ).to(device)

    gnn = MultiLayerTimeAwareGNN(in_channels,memory_dim,hidden_channels, embedding_dim, msg_dim, memory.time_enc,num_layers,heads,dropout).to(device)
    win_pred = WinPredictor(
        embed_dim=embedding_dim,
        hidden_dim=hidden_dim,   
        context_dim=msg_dim
    ).to(device)

    total_params = 0
    for model in [memory, gnn, win_pred]:
        model_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"{model.__class__.__name__} params: {model_params:,}")
        total_params += model_params

    print(f"Total parameters: {total_params:,}")



    optimizer = torch.optim.AdamW(
        list(memory.parameters()) + list(gnn.parameters()) + list(win_pred.parameters()),
        lr=learning_rate,weight_decay=5e-4
    )
    criterion = torch.nn.BCEWithLogitsLoss()

    # === Loaders ===


    train_loader_ngh = LastNeighborLoader(num_nodes=num_nodes, size=25, device=device)
    eval_loader_ngh  = LastNeighborLoader(num_nodes=num_nodes, size=25, device=device)

    assoc = torch.empty(num_nodes, dtype=torch.long, device=device)


    # === Entraînement ===
    train_losses = []
    train_aps  = []
    val_losses = []
    val_metrics = []
    val_tresh_06 =  []
    val_tresh_065 =  []
    val_tresh_07 =  []
    val_tresh_075 =  []
    val_tresh_08 =  []
    threshold = [0.6,0.65,0.7,0.75,0.8]
    num_epochs = 100

    import random

    train_variants = [
        (train_loader, full_data, train_data),
        (train_loader2, full_data2, train_data2)
    ]
    best_val_ap = 0
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)


    for epoch in range(1, num_epochs + 1):
        alpha = compute_alpha(epoch,num_epochs)
        loader, full, train_data_split = random.choice(train_variants)
        loss,ap,prec = train_debug(loader, memory, gnn, win_pred, full, train_loader_ngh,eval_loader_ngh, optimizer, device, assoc, train_data_split, alpha)
        train_losses.append(loss)
        print(ap)
        print(prec)
        train_aps.append(ap)
        
        
        val_ap, val_loss,prec_at,well_predicted_dates, badly_predicted_dates,prec = evaluate(test_loader,memory,gnn,win_pred,full_data,eval_loader_ngh,assoc,device,threshold,alpha)
        val_metrics.append(val_ap)
        val_losses.append(val_loss)
        val_tresh_06.append(prec_at[f'Prec@{threshold[0]}'])
        val_tresh_065.append(prec_at[f'Prec@{threshold[1]}'])
        val_tresh_07.append(prec_at[f'Prec@{threshold[2]}'])
        val_tresh_075.append(prec_at[f'Prec@{threshold[3]}'])
        val_tresh_08.append(prec_at[f'Prec@{threshold[4]}'])
        if val_ap > best_val_ap:
            best_val_ap = val_ap
            if best_val_ap > bestval:
                bestval = best_val_ap
                lrate = learning_rate
            print(
            f"[epoch {epoch}]\n"
            f" Test loss: {val_loss:.4f} | Test AP: {val_ap:.4f} | | Test Prec: {prec:.4f} |\n"
            f"    Precision {threshold[0]} : {prec_at[f'Prec@{threshold[0]}']:.4f} "
            f"avec {prec_at[f'Num@{threshold[0]}']*100:.2f}% des exemples |\n"
            f"    Precision {threshold[1]} : {prec_at[f'Prec@{threshold[1]}']:.4f} "
            f"avec {prec_at[f'Num@{threshold[1]}']*100:.2f}% des exemples |\n"
            f"    Precision {threshold[2]} : {prec_at[f'Prec@{threshold[2]}']:.4f} "
            f"avec {prec_at[f'Num@{threshold[2]}']*100:.2f}% des exemples |\n"
            f"    Precision {threshold[3]} : {prec_at[f'Prec@{threshold[3]}']:.4f} "
            f"avec {prec_at[f'Num@{threshold[3]}']*100:.2f}% des exemples |\n"
            f"    Precision {threshold[4]} : {prec_at[f'Prec@{threshold[4]}']:.4f} "
            f"avec {prec_at[f'Num@{threshold[4]}']*100:.2f}% des exemples"
        )
        eval_loader_ngh.reset_state()
        if epoch > 1 :
            epochs = list(range(1, len(train_losses) + 1))
            plt.figure()
            plt.plot(epochs, train_losses, label="Train Loss")
            plt.plot(epochs, val_losses, label="Val Loss")
            plt.xlabel("Epoch")
            plt.ylabel("Loss")
            plt.title("Courbe de loss")
            plt.legend()
            plt.tight_layout()
            plt.savefig("loss.png", dpi=300)
            plt.close()

            # Accuracy / AP
            plt.figure()
            plt.plot(epochs, val_metrics,   label="Validation AP")
            plt.plot(epochs, val_tresh_06,   label="Validation AP 60%")
            plt.plot(epochs, val_tresh_065,   label="Validation AP 65%")
            plt.plot(epochs, val_tresh_07,   label="Validation AP 70%")
            plt.plot(epochs, val_tresh_075,   label="Validation AP 75%")
            plt.plot(epochs, val_tresh_08,   label="Validation AP 80%")
            plt.xlabel("Epoch")
            plt.ylabel("Average Precision")
            plt.title("Courbe d'AP")
            plt.legend()
            plt.tight_layout()
            plt.savefig("acc.png", dpi=300)
            plt.close()


            plt.figure()
            plt.plot(epochs, train_aps, label="Train AP")
            plt.plot(epochs, val_metrics,   label="Validation AP")
            plt.xlabel("Epoch")
            plt.ylabel("Average Precision")
            plt.title("Courbe d'AP")
            plt.legend()
            plt.tight_layout()
            plt.savefig("acc_diff.png", dpi=300)
            plt.close()

            well_dates = pd.to_datetime(well_predicted_dates, unit='s')
            bad_dates = pd.to_datetime(badly_predicted_dates, unit='s')

            # Créer un dataframe pour l'histogramme
            df_hist = pd.DataFrame({
                'date': pd.concat([pd.Series(well_dates), pd.Series(bad_dates)]),
                'result': ['correct'] * len(well_dates) + ['incorrect'] * len(bad_dates)
            })
            # Grouper par mois par exemple
            df_hist['month'] = df_hist['date'].dt.to_period('M')

            # Plot
            plt.figure(figsize=(12,6))
            for result, group in df_hist.groupby('result'):
                group['month'].value_counts().sort_index().plot(kind='bar', label=result, alpha=0.7)

            plt.title('Distribution des prédictions correctes et incorrectes par mois')
            plt.xlabel('Mois')
            plt.ylabel('Nombre de matchs')
            plt.legend()
            plt.tight_layout()
            plt.savefig('prediction_histogram.png')
            plt.close()


        scheduler.step()
        with torch.no_grad():
            # On récupère les embeddings mémoire actuels (shape : num_nodes x memory_dim)
            current_memory = memory.memory.clone().detach().cpu()  # shape: (num_nodes, memory_dim)
            
            # Norme L2 de chaque embedding
            norms = torch.norm(current_memory, dim=1).numpy()  # shape: (num_nodes,)

            # Statistiques utiles
            mean_norm = norms.mean()
            std_norm = norms.std()
            max_norm = norms.max()
            min_norm = norms.min()

            # On stocke pour visualiser après
            if epoch == 1:
                norms_mean_hist = []
                norms_std_hist = []
                norms_max_hist = []
                norms_min_hist = []

            norms_mean_hist.append(mean_norm)
            norms_std_hist.append(std_norm)
            norms_max_hist.append(max_norm)
            norms_min_hist.append(min_norm)

            print(f"[Epoch {epoch}] Memory norms - Mean: {mean_norm:.4f} | Std: {std_norm:.4f} | Max: {max_norm:.4f} | Min: {min_norm:.4f}")

    print(
    f" Best val: {bestval:.4f} | LR: {lrate:.4f} |\n"
    )





  return disable_fn(*args, **kwargs)


LR = 0.0004
TGNMemory params: 32,384
MultiLayerTimeAwareGNN params: 1,383,232
WinPredictor params: 33,219
Total parameters: 1,448,835


Training:  34%|███▍      | 726/2105 [00:16<00:34, 39.58batch/s]