In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

print("‚öΩ FEATURE ENGINEERING")
print("="*60)

# Charger les donn√©es
df = pd.read_csv('../data/raw/premier_league_matches.csv')
df['date'] = pd.to_datetime(df['date'])
print(f"‚úÖ Loaded {len(df)} matches")
print(f"üìÖ From {df['date'].min().date()} to {df['date'].max().date()}")
print(f"‚öΩ {df['home_team'].nunique()} unique teams")
print(f"\nüéØ TARGET: Create features to predict match outcomes!")

‚öΩ FEATURE ENGINEERING
‚úÖ Loaded 760 matches
üìÖ From 2023-08-11 to 2025-05-25
‚öΩ 23 unique teams

üéØ TARGET: Create features to predict match outcomes!


In [2]:
# ============================================
# FEATURE 1: RECENT FORM (Forme des 5 derniers matchs)
# ============================================

print("üîÑ Calculating recent form (last 5 matches)...")

# On va calculer les points des 5 derniers matchs pour chaque √©quipe
# Victoire = 3 points, Nul = 1 point, D√©faite = 0 points

def calculate_form(team, date, n_matches=5):
    """
    Calcule les points des N derniers matchs d'une √©quipe AVANT une date donn√©e
    """
    # Matchs de cette √©quipe (home ou away) AVANT cette date
    team_matches = df[
        ((df['home_team'] == team) | (df['away_team'] == team)) & 
        (df['date'] < date)
    ].sort_values('date', ascending=False).head(n_matches)
    
    if len(team_matches) == 0:
        return 0  # D√©but de saison, pas d'historique
    
    points = 0
    for _, match in team_matches.iterrows():
        if match['home_team'] == team:
            # Cette √©quipe jouait √† domicile
            if match['home_score'] > match['away_score']:
                points += 3  # Victoire
            elif match['home_score'] == match['away_score']:
                points += 1  # Nul
            # D√©faite = 0 points
        else:
            # Cette √©quipe jouait √† l'ext√©rieur
            if match['away_score'] > match['home_score']:
                points += 3  # Victoire
            elif match['away_score'] == match['home_score']:
                points += 1  # Nul
    
    return points

# Tester sur le premier match
test_match = df.iloc[50]  # Match random
print(f"\nüß™ TEST sur: {test_match['home_team']} vs {test_match['away_team']}")
print(f"Date: {test_match['date']}")

home_form = calculate_form(test_match['home_team'], test_match['date'])
away_form = calculate_form(test_match['away_team'], test_match['date'])

print(f"Home form (last 5): {home_form} points")
print(f"Away form (last 5): {away_form} points")
print(f"\nüí° Form difference: {home_form - away_form} (positive = home advantage)")

üîÑ Calculating recent form (last 5 matches)...

üß™ TEST sur: Luton Town FC vs Wolverhampton Wanderers FC
Date: 2023-09-23 14:00:00+00:00
Home form (last 5): 0 points
Away form (last 5): 3 points

üí° Form difference: -3 (positive = home advantage)


In [3]:
# ============================================
# APPLIQUER LA FORME √Ä TOUS LES MATCHS
# ============================================

print("‚è≥ Calculating form for all 760 matches...")
print("(This might take 30-60 seconds...)\n")

# Cr√©er les colonnes
df['home_form'] = 0
df['away_form'] = 0

# Calculer pour chaque match
for idx, match in df.iterrows():
    df.loc[idx, 'home_form'] = calculate_form(match['home_team'], match['date'])
    df.loc[idx, 'away_form'] = calculate_form(match['away_team'], match['date'])
    
    # Progress bar simple
    if (idx + 1) % 100 == 0:
        print(f"‚úÖ Processed {idx + 1}/760 matches...")

# Cr√©er la feature "form difference"
df['form_diff'] = df['home_form'] - df['away_form']

print(f"\nüéâ DONE! Form calculated for all matches!")
print(f"\nüìä FORM STATISTICS:")
print(df[['home_form', 'away_form', 'form_diff']].describe())

print(f"\nüëÄ Sample with form:")
print(df[['home_team', 'away_team', 'home_form', 'away_form', 'form_diff']].head(10))

‚è≥ Calculating form for all 760 matches...
(This might take 30-60 seconds...)

‚úÖ Processed 100/760 matches...
‚úÖ Processed 200/760 matches...
‚úÖ Processed 300/760 matches...
‚úÖ Processed 400/760 matches...
‚úÖ Processed 500/760 matches...
‚úÖ Processed 600/760 matches...
‚úÖ Processed 700/760 matches...

üéâ DONE! Form calculated for all matches!

üìä FORM STATISTICS:
        home_form   away_form   form_diff
count  760.000000  760.000000  760.000000
mean     6.540789    6.743421   -0.202632
std      3.721308    3.676828    5.003924
min      0.000000    0.000000  -12.000000
25%      4.000000    4.000000   -3.250000
50%      6.000000    7.000000    0.000000
75%      9.000000   10.000000    3.000000
max     15.000000   15.000000   15.000000

üëÄ Sample with form:
                   home_team                   away_team  home_form  \
0                 Burnley FC          Manchester City FC          0   
1                 Arsenal FC        Nottingham Forest FC          0   
2     

In [4]:
# V√©rifier des matchs plus tard dans la saison
print("üîç Checking form for matches later in the season:\n")

# Match 100 (environ journ√©e 13)
print("Match 100:")
print(df.iloc[100][['date', 'home_team', 'away_team', 'home_form', 'away_form', 'form_diff']])

print("\n" + "="*50 + "\n")

# Match 300 (milieu de saison)
print("Match 300:")
print(df.iloc[300][['date', 'home_team', 'away_team', 'home_form', 'away_form', 'form_diff']])

print("\n" + "="*50 + "\n")

# Match 500 (fin de saison)
print("Match 500:")
print(df.iloc[500][['date', 'home_team', 'away_team', 'home_form', 'away_form', 'form_diff']])

print("\n" + "="*50 + "\n")

# Statistiques: combien de matchs ont form = 0?
zero_form = df[(df['home_form'] == 0) & (df['away_form'] == 0)]
print(f"üìä Matchs avec form=0 pour les deux √©quipes: {len(zero_form)}/760")
print(f"üí° C'est normal au d√©but de saison!")

# Moyenne de la forme (excluant les 0)
non_zero = df[(df['home_form'] > 0) | (df['away_form'] > 0)]
print(f"\nüìà Stats pour matchs avec historique ({len(non_zero)} matchs):")
print(non_zero[['home_form', 'away_form', 'form_diff']].describe())

üîç Checking form for matches later in the season:

Match 100:
date         2023-11-04 12:30:00+00:00
home_team                    Fulham FC
away_team         Manchester United FC
home_form                            5
away_form                            9
form_diff                           -4
Name: 100, dtype: object


Match 300:
date         2024-04-03 19:15:00+00:00
home_team           Manchester City FC
away_team               Aston Villa FC
home_form                           11
away_form                           10
form_diff                            1
Name: 300, dtype: object


Match 500:
date         2024-11-29 20:00:00+00:00
home_team    Brighton & Hove Albion FC
away_team               Southampton FC
home_form                           10
away_form                            3
form_diff                            7
Name: 500, dtype: object


üìä Matchs avec form=0 pour les deux √©quipes: 14/760
üí° C'est normal au d√©but de saison!

üìà Stats pour matchs avec historiq

In [5]:
# ============================================
# FEATURE 2: GOALS TRENDS (Derniers 5 matchs)
# ============================================

print("‚öΩ Calculating goals scored & conceded trends...\n")

def calculate_goals_stats(team, date, is_home, n_matches=5):
    """
    Calcule les buts marqu√©s et encaiss√©s des N derniers matchs
    
    Returns: (goals_scored, goals_conceded)
    """
    # Matchs AVANT cette date
    if is_home:
        team_matches = df[
            (df['home_team'] == team) & (df['date'] < date)
        ].sort_values('date', ascending=False).head(n_matches)
    else:
        team_matches = df[
            (df['away_team'] == team) & (df['date'] < date)
        ].sort_values('date', ascending=False).head(n_matches)
    
    if len(team_matches) == 0:
        return 0, 0
    
    goals_scored = 0
    goals_conceded = 0
    
    for _, match in team_matches.iterrows():
        if is_home:
            goals_scored += match['home_score']
            goals_conceded += match['away_score']
        else:
            goals_scored += match['away_score']
            goals_conceded += match['home_score']
    
    # Moyenne par match
    avg_scored = goals_scored / len(team_matches)
    avg_conceded = goals_conceded / len(team_matches)
    
    return avg_scored, avg_conceded

# Test rapide
test_match = df.iloc[300]  # Man City vs Aston Villa
print(f"üß™ TEST: {test_match['home_team']} vs {test_match['away_team']}")
print(f"Date: {test_match['date']}\n")

home_scored, home_conceded = calculate_goals_stats(
    test_match['home_team'], 
    test_match['date'], 
    is_home=True
)
away_scored, away_conceded = calculate_goals_stats(
    test_match['away_team'], 
    test_match['date'], 
    is_home=False
)

print(f"Man City (home) last 5 home matches:")
print(f"  ‚öΩ Scored: {home_scored:.2f} goals/match")
print(f"  üõ°Ô∏è  Conceded: {home_conceded:.2f} goals/match")

print(f"\nAston Villa (away) last 5 away matches:")
print(f"  ‚öΩ Scored: {away_scored:.2f} goals/match")
print(f"  üõ°Ô∏è  Conceded: {away_conceded:.2f} goals/match")

print(f"\nüí° Offensive advantage: {home_scored - away_scored:.2f}")
print(f"üí° Defensive advantage: {away_conceded - home_conceded:.2f}")

‚öΩ Calculating goals scored & conceded trends...

üß™ TEST: Manchester City FC vs Aston Villa FC
Date: 2024-04-03 19:15:00+00:00

Man City (home) last 5 home matches:
  ‚öΩ Scored: 1.40 goals/match
  üõ°Ô∏è  Conceded: 0.40 goals/match

Aston Villa (away) last 5 away matches:
  ‚öΩ Scored: 2.20 goals/match
  üõ°Ô∏è  Conceded: 0.80 goals/match

üí° Offensive advantage: -0.80
üí° Defensive advantage: 0.40


In [6]:
# ============================================
# APPLIQUER GOALS STATS √Ä TOUS LES MATCHS
# ============================================

print("‚è≥ Calculating goals stats for all matches...")
print("(This will be FASTER - ~30 seconds)\n")

# Cr√©er les colonnes
df['home_goals_scored_avg'] = 0.0
df['home_goals_conceded_avg'] = 0.0
df['away_goals_scored_avg'] = 0.0
df['away_goals_conceded_avg'] = 0.0

# Calculer pour chaque match
for idx, match in df.iterrows():
    # Home team
    h_scored, h_conceded = calculate_goals_stats(
        match['home_team'], 
        match['date'], 
        is_home=True
    )
    df.loc[idx, 'home_goals_scored_avg'] = h_scored
    df.loc[idx, 'home_goals_conceded_avg'] = h_conceded
    
    # Away team
    a_scored, a_conceded = calculate_goals_stats(
        match['away_team'], 
        match['date'], 
        is_home=False
    )
    df.loc[idx, 'away_goals_scored_avg'] = a_scored
    df.loc[idx, 'away_goals_conceded_avg'] = a_conceded
    
    if (idx + 1) % 100 == 0:
        print(f"‚úÖ Processed {idx + 1}/760...")

# Cr√©er features d√©riv√©es
df['offensive_strength_diff'] = df['home_goals_scored_avg'] - df['away_goals_scored_avg']
df['defensive_strength_diff'] = df['away_goals_conceded_avg'] - df['home_goals_conceded_avg']

print(f"\nüéâ DONE!\n")
print("üìä GOALS FEATURES STATISTICS:")
print(df[['home_goals_scored_avg', 'away_goals_scored_avg', 
          'offensive_strength_diff', 'defensive_strength_diff']].describe())

print(f"\nüëÄ Sample:")
print(df[['home_team', 'away_team', 
          'home_goals_scored_avg', 'away_goals_scored_avg',
          'offensive_strength_diff']].iloc[300:305])

‚è≥ Calculating goals stats for all matches...
(This will be FASTER - ~30 seconds)

‚úÖ Processed 100/760...
‚úÖ Processed 200/760...
‚úÖ Processed 300/760...
‚úÖ Processed 400/760...
‚úÖ Processed 500/760...
‚úÖ Processed 600/760...
‚úÖ Processed 700/760...

üéâ DONE!

üìä GOALS FEATURES STATISTICS:
       home_goals_scored_avg  away_goals_scored_avg  offensive_strength_diff  \
count             760.000000             760.000000                760.00000   
mean                1.618575               1.413004                  0.20557   
std                 0.820154               0.693515                  0.98485   
min                 0.000000               0.000000                 -3.00000   
25%                 1.000000               1.000000                 -0.40000   
50%                 1.600000               1.400000                  0.20000   
75%                 2.200000               1.800000                  0.80000   
max                 5.000000               4.000000     

In [7]:
# ============================================
# TARGET VARIABLE: Match Result
# ============================================

print("üéØ Creating TARGET variable (match result)...\n")

def get_result(row):
    """
    H = Home Win
    D = Draw
    A = Away Win
    """
    if row['home_score'] > row['away_score']:
        return 'H'
    elif row['home_score'] < row['away_score']:
        return 'A'
    else:
        return 'D'

df['result'] = df.apply(get_result, axis=1)

print("‚úÖ Target created!\n")
print("üìä DISTRIBUTION DES R√âSULTATS:")
result_counts = df['result'].value_counts()
print(result_counts)
print(f"\nPourcentages:")
print((result_counts / len(df) * 100).round(2))

print(f"\nüëÄ Sample avec toutes les features:")
print(df[['home_team', 'away_team', 
          'home_form', 'away_form', 'form_diff',
          'offensive_strength_diff', 'defensive_strength_diff',
          'home_score', 'away_score', 'result']].iloc[300:305])

üéØ Creating TARGET variable (match result)...

‚úÖ Target created!

üìä DISTRIBUTION DES R√âSULTATS:
result
H    330
A    255
D    175
Name: count, dtype: int64

Pourcentages:
result
H    43.42
A    33.55
D    23.03
Name: count, dtype: float64

üëÄ Sample avec toutes les features:
              home_team             away_team  home_form  away_form  \
300  Manchester City FC        Aston Villa FC         11         10   
301        Liverpool FC   Sheffield United FC         13          2   
302          Chelsea FC  Manchester United FC          9          7   
303   Crystal Palace FC    Manchester City FC          5         11   
304      Aston Villa FC          Brentford FC          7          3   

     form_diff  offensive_strength_diff  defensive_strength_diff  home_score  \
300          1                     -0.8                      0.4           4   
301         11                      1.4                      0.8           3   
302          2                      0.0        

In [8]:
# ============================================
# FEATURE 3: MARKET VALUE (GAME CHANGER!)
# ============================================

print("üí∞ Adding Market Value Feature \n")

# Valeurs marchandes des √©quipes (en millions ‚Ç¨)
# Source: Transfermarkt Ao√ªt 2023
team_values = {
    # Top teams
    'Manchester City FC': 1280,
    'Arsenal FC': 1120,
    'Liverpool FC': 1000,
    'Chelsea FC': 960,
    'Manchester United FC': 880,
    'Tottenham Hotspur FC': 820,
    'Newcastle United FC': 650,
    
    # Mid-table
    'Aston Villa FC': 490,
    'Brighton & Hove Albion FC': 420,
    'Leeds United FC': 420,          # ‚Üê 2022-23
    'West Ham United FC': 380,
    'Leicester City FC': 360,        # ‚Üê 2022-23
    'Crystal Palace FC': 280,
    'Southampton FC': 280,           # ‚Üê 2022-23
    'Wolverhampton Wanderers FC': 270,
    'Fulham FC': 265,
    'AFC Bournemouth': 230,
    'Brentford FC': 225,
    'Nottingham Forest FC': 220,
    'Everton FC': 210,
    
    # Newly promoted 2023-24
    'Luton Town FC': 85,
    'Ipswich Town FC': 95,
    'Sheffield United FC': 82,
    'Burnley FC': 78
}

print("üìä Team Values (Top 5):")
sorted_values = sorted(team_values.items(), key=lambda x: x[1], reverse=True)
for team, value in sorted_values[:5]:
    print(f"  {team}: ‚Ç¨{value}M")

print(f"\nüìä Team Values (Bottom 5):")
for team, value in sorted_values[-5:]:
    print(f"  {team}: ‚Ç¨{value}M")

# Ajouter au DataFrame
df['home_team_value'] = df['home_team'].map(team_values)
df['away_team_value'] = df['away_team'].map(team_values)
df['value_diff'] = df['home_team_value'] - df['away_team_value']

print(f"\n‚úÖ Market value features added!")
print(f"\nüìä Value Difference Stats:")
print(df['value_diff'].describe())

# Exemples
print(f"\nüëÄ Examples:")
examples = [
    ('Liverpool FC', 'Sheffield United FC'),
    ('Manchester City FC', 'Burnley FC'),
    ('Luton Town FC', 'Arsenal FC')
]

for home, away in examples:
    match = df[(df['home_team'] == home) & (df['away_team'] == away)]
    if len(match) > 0:
        match = match.iloc[0]
        print(f"\n{home} vs {away}")
        print(f"  Value diff: ‚Ç¨{match['value_diff']}M")
        print(f"  Result: {match['result']}")

üí∞ Adding Market Value Feature 

üìä Team Values (Top 5):
  Manchester City FC: ‚Ç¨1280M
  Arsenal FC: ‚Ç¨1120M
  Liverpool FC: ‚Ç¨1000M
  Chelsea FC: ‚Ç¨960M
  Manchester United FC: ‚Ç¨880M

üìä Team Values (Bottom 5):
  Everton FC: ‚Ç¨210M
  Ipswich Town FC: ‚Ç¨95M
  Luton Town FC: ‚Ç¨85M
  Sheffield United FC: ‚Ç¨82M
  Burnley FC: ‚Ç¨78M

‚úÖ Market value features added!

üìä Value Difference Stats:
count     760.000000
mean        0.000000
std       522.068729
min     -1202.000000
25%      -331.250000
50%         0.000000
75%       331.250000
max      1202.000000
Name: value_diff, dtype: float64

üëÄ Examples:

Liverpool FC vs Sheffield United FC
  Value diff: ‚Ç¨918M
  Result: H

Manchester City FC vs Burnley FC
  Value diff: ‚Ç¨1202M
  Result: H

Luton Town FC vs Arsenal FC
  Value diff: ‚Ç¨-1035M
  Result: A


In [9]:
# ============================================
# FEATURE 4: WIN RATES HOME/AWAY
# ============================================

print("üèÜ Calculating Win Rates (Home/Away split)...\n")

def calculate_win_rate_home(team, date, n_matches=10):
    """
    Calcule le % de victoires √† DOMICILE des N derniers matchs home
    """
    matches = df[
        (df['home_team'] == team) & 
        (df['date'] < date)
    ].sort_values('date', ascending=False).head(n_matches)
    
    if len(matches) == 0:
        return 0.5  # Default: 50% (neutre)
    
    wins = (matches['home_score'] > matches['away_score']).sum()
    return wins / len(matches)

def calculate_win_rate_away(team, date, n_matches=10):
    """
    Calcule le % de victoires √† L'EXT√âRIEUR des N derniers matchs away
    """
    matches = df[
        (df['away_team'] == team) & 
        (df['date'] < date)
    ].sort_values('date', ascending=False).head(n_matches)
    
    if len(matches) == 0:
        return 0.5  # Default: 50% (neutre)
    
    wins = (matches['away_score'] > matches['home_score']).sum()
    return wins / len(matches)

# Test
test_match = df.iloc[300]
print(f"üß™ TEST: {test_match['home_team']} vs {test_match['away_team']}")
print(f"Date: {test_match['date']}\n")

home_wr = calculate_win_rate_home(test_match['home_team'], test_match['date'])
away_wr = calculate_win_rate_away(test_match['away_team'], test_match['date'])

print(f"{test_match['home_team']} (home):")
print(f"  Win rate at home: {home_wr*100:.1f}%")

print(f"\n{test_match['away_team']} (away):")
print(f"  Win rate away: {away_wr*100:.1f}%")

print(f"\nüí° Home advantage: {(home_wr - away_wr)*100:+.1f}%")

üèÜ Calculating Win Rates (Home/Away split)...

üß™ TEST: Manchester City FC vs Aston Villa FC
Date: 2024-04-03 19:15:00+00:00

Manchester City FC (home):
  Win rate at home: 50.0%

Aston Villa FC (away):
  Win rate away: 50.0%

üí° Home advantage: +0.0%


In [10]:
# ============================================
# TEST WIN RATES - PLUSIEURS MATCHS
# ============================================

print("üß™ TESTING WIN RATES ON MULTIPLE MATCHES\n")
print("="*70)

# Test sur 5 matchs diff√©rents
test_indices = [100, 200, 300, 400, 500]

for idx in test_indices:
    match = df.iloc[idx]
    
    home_wr = calculate_win_rate_home(match['home_team'], match['date'])
    away_wr = calculate_win_rate_away(match['away_team'], match['date'])
    
    print(f"\nüìÖ Match {idx}: {match['home_team']} vs {match['away_team']}")
    print(f"Date: {match['date'].date()}")
    print(f"Actual result: {match['result']}")
    print(f"\n  {match['home_team']} (home):")
    print(f"    Win rate at home: {home_wr*100:.1f}%")
    print(f"  {match['away_team']} (away):")
    print(f"    Win rate away: {away_wr*100:.1f}%")
    print(f"  üí° Difference: {(home_wr - away_wr)*100:+.1f}%")
    
    # Pr√©diction bas√©e sur win rate
    if home_wr > away_wr + 0.2:
        prediction = "H (strong)"
    elif home_wr > away_wr:
        prediction = "H (slight)"
    elif away_wr > home_wr + 0.2:
        prediction = "A (strong)"
    elif away_wr > home_wr:
        prediction = "A (slight)"
    else:
        prediction = "D (balanced)"
    
    correct = "‚úÖ" if prediction[0] == match['result'] else "‚ùå"
    print(f"  üéØ Prediction: {prediction} {correct}")
    print("-"*70)

üß™ TESTING WIN RATES ON MULTIPLE MATCHES


üìÖ Match 100: Fulham FC vs Manchester United FC
Date: 2023-11-04
Actual result: A

  Fulham FC (home):
    Win rate at home: 50.0%
  Manchester United FC (away):
    Win rate away: 50.0%
  üí° Difference: +0.0%
  üéØ Prediction: D (balanced) ‚ùå
----------------------------------------------------------------------

üìÖ Match 200: Newcastle United FC vs Manchester City FC
Date: 2024-01-13
Actual result: A

  Newcastle United FC (home):
    Win rate at home: 80.0%
  Manchester City FC (away):
    Win rate away: 60.0%
  üí° Difference: +20.0%
  üéØ Prediction: H (slight) ‚ùå
----------------------------------------------------------------------

üìÖ Match 300: Manchester City FC vs Aston Villa FC
Date: 2024-04-03
Actual result: H

  Manchester City FC (home):
    Win rate at home: 50.0%
  Aston Villa FC (away):
    Win rate away: 50.0%
  üí° Difference: +0.0%
  üéØ Prediction: D (balanced) ‚ùå
----------------------------------------

In [11]:
# ============================================
# APPLIQUER WIN RATES √Ä TOUS LES MATCHS
# ============================================

print("‚è≥ Calculating win rates for all 760 matches...")
print("(This will take ~1-2 minutes)\n")

df['home_win_rate'] = 0.0
df['away_win_rate'] = 0.0

for idx, match in df.iterrows():
    df.loc[idx, 'home_win_rate'] = calculate_win_rate_home(match['home_team'], match['date'])
    df.loc[idx, 'away_win_rate'] = calculate_win_rate_away(match['away_team'], match['date'])
    
    if (idx + 1) % 100 == 0:
        print(f"‚úÖ Processed {idx + 1}/760...")

df['win_rate_diff'] = df['home_win_rate'] - df['away_win_rate']

print(f"\nüéâ DONE!\n")
print("üìä WIN RATE STATISTICS:")
print(df[['home_win_rate', 'away_win_rate', 'win_rate_diff']].describe())

print(f"\nüëÄ Sample:")
print(df[['home_team', 'away_team', 'home_win_rate', 'away_win_rate', 'win_rate_diff']].iloc[300:305])

‚è≥ Calculating win rates for all 760 matches...
(This will take ~1-2 minutes)

‚úÖ Processed 100/760...
‚úÖ Processed 200/760...
‚úÖ Processed 300/760...
‚úÖ Processed 400/760...
‚úÖ Processed 500/760...
‚úÖ Processed 600/760...
‚úÖ Processed 700/760...

üéâ DONE!

üìä WIN RATE STATISTICS:
       home_win_rate  away_win_rate  win_rate_diff
count     760.000000      760.00000     760.000000
mean        0.441937        0.34031       0.101626
std         0.251861        0.21174       0.313198
min         0.000000        0.00000      -1.000000
25%         0.296429        0.20000      -0.100000
50%         0.400000        0.30000       0.100000
75%         0.600000        0.50000       0.300000
max         1.000000        1.00000       1.000000

üëÄ Sample:
              home_team             away_team  home_win_rate  away_win_rate  \
300  Manchester City FC        Aston Villa FC            0.5            0.5   
301        Liverpool FC   Sheffield United FC            0.7            0.1

In [12]:
import numpy as np

print("‚ûï Adding extra league features (points, goal diff, recent form, rest days)...")

# Assure-toi que la date est bien en datetime
df['date'] = pd.to_datetime(df['date'])

# Trier par date pour que les cumsum/rolling soient logiques
df = df.sort_values('date').reset_index(drop=True)

# ============================================
# 1) POINTS PAR MATCH (3/1/0) POUR HOME / AWAY
# ============================================

df['home_points_match'] = np.select(
    [
        df['home_score'] > df['away_score'],   # victoire domicile
        df['home_score'] == df['away_score']   # nul
    ],
    [3, 1],
    default=0
)

df['away_points_match'] = np.select(
    [
        df['away_score'] > df['home_score'],   # victoire ext√©rieur
        df['away_score'] == df['home_score']   # nul
    ],
    [3, 1],
    default=0
)

# ============================================
# 2) POINTS CUMUL√âS AVANT LE MATCH
#    (par √©quipe en jouant √† domicile / ext√©rieur)
# ============================================

df['home_points_cum'] = (
    df.groupby('home_team')['home_points_match']
      .cumsum()
      .shift(1)
      .fillna(0)
)

df['away_points_cum'] = (
    df.groupby('away_team')['away_points_match']
      .cumsum()
      .shift(1)
      .fillna(0)
)

# ============================================
# 3) GOAL DIFFERENCE CUMUL√â AVANT LE MATCH
# ============================================

df['home_goal_diff_match'] = df['home_score'] - df['away_score']
df['away_goal_diff_match'] = df['away_score'] - df['home_score']

df['home_goal_diff_cum'] = (
    df.groupby('home_team')['home_goal_diff_match']
      .cumsum()
      .shift(1)
      .fillna(0)
)

df['away_goal_diff_cum'] = (
    df.groupby('away_team')['away_goal_diff_match']
      .cumsum()
      .shift(1)
      .fillna(0)
)

# ============================================
# 4) RECENT FORM COURTE : POINTS SUR LES 5 DERNIERS MATCHS
#    (mais r√¥le s√©par√© domicile / ext√©rieur)
# ============================================

df['home_recent_points5'] = (
    df.groupby('home_team')['home_points_match']
      .rolling(window=5, min_periods=1)
      .sum()
      .shift(1)
      .reset_index(level=0, drop=True)
)

df['away_recent_points5'] = (
    df.groupby('away_team')['away_points_match']
      .rolling(window=5, min_periods=1)
      .sum()
      .shift(1)
      .reset_index(level=0, drop=True)
)

# ============================================
# 5) REST DAYS : JOURS DE REPOS AVANT LE MATCH
# ============================================

df['prev_home_date'] = df.groupby('home_team')['date'].shift(1)
df['prev_away_date'] = df.groupby('away_team')['date'].shift(1)

df['rest_days_home'] = (df['date'] - df['prev_home_date']).dt.days
df['rest_days_away'] = (df['date'] - df['prev_away_date']).dt.days

# Pour le tout premier match d'une √©quipe, on met une valeur par d√©faut (7 jours)
df['rest_days_home'] = df['rest_days_home'].fillna(7)
df['rest_days_away'] = df['rest_days_away'].fillna(7)

# ============================================
# 6) BIG SIX 
# ============================================
print("‚ûï Adding Big 6 features...")

# ‚ö†Ô∏è Adapter ces noms √† ceux de ton df si besoin
big6_teams = {
    'Arsenal FC',
    'Chelsea FC',
    'Liverpool FC',
    'Manchester City FC',
    'Manchester United FC',
    'Tottenham Hotspur FC'
}

df['home_big6'] = df['home_team'].isin(big6_teams).astype(int)
df['away_big6'] = df['away_team'].isin(big6_teams).astype(int)

print("‚úÖ Big 6 features added: ['home_big6', 'away_big6']")
print(df[['home_team', 'away_team', 'home_big6', 'away_big6']].head(10))



# Nettoyer les colonnes temporaires
df.drop(columns=['prev_home_date', 'prev_away_date'], inplace=True)

print("‚úÖ Extra features added:")
print([
    'home_points_cum', 'away_points_cum',
    'home_goal_diff_cum', 'away_goal_diff_cum',
    'home_recent_points5', 'away_recent_points5',
    'rest_days_home', 'rest_days_away'
])


‚ûï Adding extra league features (points, goal diff, recent form, rest days)...
‚ûï Adding Big 6 features...
‚úÖ Big 6 features added: ['home_big6', 'away_big6']
                   home_team                   away_team  home_big6  away_big6
0                 Burnley FC          Manchester City FC          0          1
1                 Arsenal FC        Nottingham Forest FC          1          0
2            AFC Bournemouth          West Ham United FC          0          0
3  Brighton & Hove Albion FC               Luton Town FC          0          0
4                 Everton FC                   Fulham FC          0          0
5        Sheffield United FC           Crystal Palace FC          0          0
6        Newcastle United FC              Aston Villa FC          0          0
7               Brentford FC        Tottenham Hotspur FC          0          1
8                 Chelsea FC                Liverpool FC          1          1
9       Manchester United FC  Wolverhampton Wand

In [13]:
print(df.columns.tolist())


['date', 'home_team', 'away_team', 'home_score', 'away_score', 'matchday', 'home_form', 'away_form', 'form_diff', 'home_goals_scored_avg', 'home_goals_conceded_avg', 'away_goals_scored_avg', 'away_goals_conceded_avg', 'offensive_strength_diff', 'defensive_strength_diff', 'result', 'home_team_value', 'away_team_value', 'value_diff', 'home_win_rate', 'away_win_rate', 'win_rate_diff', 'home_points_match', 'away_points_match', 'home_points_cum', 'away_points_cum', 'home_goal_diff_match', 'away_goal_diff_match', 'home_goal_diff_cum', 'away_goal_diff_cum', 'home_recent_points5', 'away_recent_points5', 'rest_days_home', 'rest_days_away', 'home_big6', 'away_big6']


In [14]:
print("üíæ Saving dataset with features...\n")

# Colonnes de "metadata" (qu'on ne veut pas comme features)
metadata_cols = [
    'date', 'home_team', 'away_team',
    'home_score', 'away_score', 'matchday', 'result'
]

# Toutes les autres colonnes = features (y compris home_big6, away_big6, etc.)
feature_cols = [c for c in df.columns if c not in metadata_cols]

columns_to_save = metadata_cols + feature_cols

print("‚úÖ Columns to save:")
print(columns_to_save)

df_features = df[columns_to_save].copy()

output_path = '../data/processed/matches_with_features.csv'
df_features.to_csv(output_path, index=False)

print(f"\n‚úÖ Dataset saved to: {output_path}")
print(f"üì¶ Shape: {df_features.shape}")
print(f"üìä Features created: {len(feature_cols)} (excluding metadata)")


üíæ Saving dataset with features...

‚úÖ Columns to save:
['date', 'home_team', 'away_team', 'home_score', 'away_score', 'matchday', 'result', 'home_form', 'away_form', 'form_diff', 'home_goals_scored_avg', 'home_goals_conceded_avg', 'away_goals_scored_avg', 'away_goals_conceded_avg', 'offensive_strength_diff', 'defensive_strength_diff', 'home_team_value', 'away_team_value', 'value_diff', 'home_win_rate', 'away_win_rate', 'win_rate_diff', 'home_points_match', 'away_points_match', 'home_points_cum', 'away_points_cum', 'home_goal_diff_match', 'away_goal_diff_match', 'home_goal_diff_cum', 'away_goal_diff_cum', 'home_recent_points5', 'away_recent_points5', 'rest_days_home', 'rest_days_away', 'home_big6', 'away_big6']

‚úÖ Dataset saved to: ../data/processed/matches_with_features.csv
üì¶ Shape: (760, 36)
üìä Features created: 29 (excluding metadata)
