# 02. Extract team statistics

### Imports

In [2]:
import pandas as pd
import numpy as np

### Load processed training dataset

In [3]:
training_set = pd.read_parquet("datasets/processed/training_set_processed.parquet")

### Extract all teams in training set

In [4]:
home_teams_list = training_set["HT"].unique().tolist()
away_teams_list = training_set["AT"].unique().tolist()
teams_list = list(set(home_teams_list + away_teams_list))
print("Number of teams in training set:",len(teams_list))

Number of teams in training set: 1637


In [5]:
teams_list

['Assyriska',
 'Sagamihara',
 'Halmstads BK',
 'Espanyol Barcelona',
 'Kuban Krasnodar',
 'Ponte Preta',
 'Forfar Athletic',
 'Vallecano',
 'Yaracuyanos FC',
 'Zenit St Petersburg',
 'FC Thun',
 'TSV Hartberg',
 'Werder Bremen',
 'Olympique Marseille',
 'FC Lugano',
 'Brisbane Roar',
 'Chengdu Blades',
 'Atletico Celaya',
 'Sanfrecce',
 'Strasbourg',
 'Celta de Vigo',
 'Detroit City FC',
 'AS Lyon-Duchere',
 'Grimsby',
 'Dordrecht',
 'Sandefjord Fotball',
 'Gyeongnam FC',
 'Gillingham',
 'Viktoria Koln',
 'Venezia',
 'Lobos',
 'SCR Altach',
 'CD Provincial Osorno',
 'Blaublitz',
 'Rochester Rhinos',
 'Pacos Ferreira',
 'Neuchatel Xamax',
 'Cordoba',
 'USM Bel Abbes',
 'Universidad Central FC',
 'Eastern Pride',
 'FC Basel',
 'Queensland Roar',
 'YoungHeart Manawatu',
 'Sheffield Wednesday',
 'Manchester City',
 'Pontevedra',
 'Tacoma Defiance',
 'Maritzburg United',
 'HJK Helsinki',
 'Bath City',
 'Mainz 05 II',
 'Chiapas FC',
 'Wolverhampton',
 'Nizhny Novgorod',
 'New England Revolut

### Functions to gather each team statistics 

In [6]:
def extract_team_df(team_name):
    ''' Extracts games of a specific team from whole training set.
    '''
    
    home_team_df = training_set[training_set["HT"] == team_name]
    away_team_df = training_set[training_set["AT"] == team_name]
        
    team_df = pd.concat([home_team_df, away_team_df]).sort_values(by="Date")
    
    return team_df, home_team_df, away_team_df

In [7]:
def get_stats(df, local=True):
    '''Fills in team statistics dictionary.
    '''
    team_stats = dict()
    if local is True:
        prefix = "home"
        goals_scored_col = "HS"
        goals_against_col = "AS"
        win, loss = "W", "L"
        goals_difference = df["GD"].sum()
        try:
            avg_goals_difference = round(df["GD"].sum() / df.shape[0], 3)
        except ZeroDivisionError:
            avg_goals_difference = np.nan
    else:
        prefix = "away"
        goals_scored_col = "AS"
        goals_against_col = "HS"
        win, loss = "L", "W"
        goals_difference = - df["GD"].sum()
        try:
            avg_goals_difference = - round(df["GD"].sum() / df.shape[0], 3)
        except ZeroDivisionError:
            avg_goals_difference = np.nan

    team_stats[f"{prefix}_games"] = df.shape[0]
    
    team_stats[f"{prefix}_wins"] = df["WDL"].value_counts().get(win, 0)
    team_stats[f"{prefix}_draws"] = df["WDL"].value_counts().get("D", 0)
    team_stats[f"{prefix}_losses"] = df["WDL"].value_counts().get(loss, 0)
    
    try:
        team_stats[f"{prefix}_win_percentage"] = round(df["WDL"].value_counts().get(win, 0) / df.shape[0], 3)
    except ZeroDivisionError:
        team_stats[f"{prefix}_win_percentage"] = np.nan
    try:
        team_stats[f"{prefix}_draw_percentage"] = round(df["WDL"].value_counts().get("D", 0) / df.shape[0], 3)
    except ZeroDivisionError:
        team_stats[f"{prefix}_draw_percentage"] = np.nan
    try:
        team_stats[f"{prefix}_loss_percentage"] = round(df["WDL"].value_counts().get(loss, 0) / df.shape[0], 3)
    except ZeroDivisionError:
        team_stats[f"{prefix}_loss_percentage"] = np.nan
    team_stats[f"{prefix}_goals_scored"] = df[goals_scored_col].sum()
    team_stats[f"{prefix}_goals_against"] = df[goals_against_col].sum()
    team_stats[f"{prefix}_goals_difference"] = goals_difference
    try:
        team_stats[f"{prefix}_avg_goals_scored"] = round(df[goals_scored_col].sum() / df.shape[0], 3)
    except ZeroDivisionError:
        team_stats[f"{prefix}_avg_goals_scored"] = np.nan
    try:
        team_stats[f"{prefix}_avg_goals_against"] = round(df[goals_against_col].sum() / df.shape[0], 3)
    except ZeroDivisionError:
        team_stats[f"{prefix}_avg_goals_against"] = np.nan
    team_stats[f"{prefix}_avg_goals_difference"] = avg_goals_difference

    return team_stats

### Basic function to fill in each team dict with specific data

In [8]:
def complete_team_info(team_name):

    # Filter games only of specified team
    team_df, home_team_df, away_team_df = extract_team_df(team_name)
        
    # Create team dict
    team_stats = dict()

    # Recover home and away teams separately
    home_stats = get_stats(home_team_df, local=True)
    away_stats = get_stats(away_team_df, local=False)

    # Add home and away stats
    team_stats_temp = home_stats | away_stats

    # Add general stats
    team_stats["name"] = team_name
    team_stats["total_games"] = team_df.shape[0]
    team_stats["wins"] = team_stats_temp["home_wins"] + team_stats_temp["away_wins"]
    team_stats["draws"] = team_stats_temp["home_draws"] + team_stats_temp["away_draws"]
    team_stats["losses"] = team_stats_temp["home_losses"] + team_stats_temp["away_losses"]
    team_stats["win_percentage"] = round(team_stats["wins"] / team_df.shape[0], 3)
    team_stats["draw_percentage"] = round(team_stats["draws"] / team_df.shape[0], 3)
    team_stats["loss_percentage"] = round(team_stats["losses"] / team_df.shape[0], 3)
    team_stats["goals_scored"] = team_stats_temp["home_goals_scored"] + team_stats_temp["away_goals_scored"]
    team_stats["goals_against"] = team_stats_temp["home_goals_against"] + team_stats_temp["away_goals_against"]
    team_stats["goals_difference"] = team_stats["goals_scored"] - team_stats["goals_against"]
    team_stats["avg_goals_scored"] = round(team_stats["goals_scored"] / team_df.shape[0], 3)
    team_stats["avg_goals_against"] = round(team_stats["goals_against"] / team_df.shape[0], 3)
    team_stats["avg_goals_difference"] = round(team_stats["goals_difference"] / team_df.shape[0], 3)
    
    team_stats = team_stats | home_stats | away_stats
       
    return team_stats

### Complete each team info 

In [9]:
import warnings
warnings.filterwarnings("ignore")

In [10]:
teams = list()

# For each team, complete its info...
for team in teams_list:
    team_stats = complete_team_info(team_name=team)
    teams.append(team_stats)

### Check list with all teams dicts info isolated

In [11]:
teams

# with open("teams.pickle") as teams_file:
#    pickle.dump(teams, teams_file)

[{'name': 'Assyriska',
  'total_games': 26,
  'wins': 4,
  'draws': 2,
  'losses': 20,
  'win_percentage': 0.154,
  'draw_percentage': 0.077,
  'loss_percentage': 0.769,
  'goals_scored': 17,
  'goals_against': 52,
  'goals_difference': -35,
  'avg_goals_scored': 0.654,
  'avg_goals_against': 2.0,
  'avg_goals_difference': -1.346,
  'home_games': 13,
  'home_wins': 1,
  'home_draws': 1,
  'home_losses': 11,
  'home_win_percentage': 0.077,
  'home_draw_percentage': 0.077,
  'home_loss_percentage': 0.846,
  'home_goals_scored': 5,
  'home_goals_against': 25,
  'home_goals_difference': -20,
  'home_avg_goals_scored': 0.385,
  'home_avg_goals_against': 1.923,
  'home_avg_goals_difference': -1.538,
  'away_games': 13,
  'away_wins': 3,
  'away_draws': 1,
  'away_losses': 9,
  'away_win_percentage': 0.231,
  'away_draw_percentage': 0.077,
  'away_loss_percentage': 0.692,
  'away_goals_scored': 12,
  'away_goals_against': 27,
  'away_goals_difference': -15,
  'away_avg_goals_scored': 0.923,
 

### Function to search H2H games (team1 vs team2)

In [12]:
def search_H2H(team1, team2):
    
    condition_a = ((training_set["HT"] == team1) & (training_set["AT"] == team2))
    condition_b = ((training_set["HT"] == team2) & (training_set["AT"] == team1))
    
    h2h = training_set.loc[condition_a | condition_b]
    
    return h2h

In [13]:
search_H2H("Real Madrid", "Eibar")

Unnamed: 0,Sea,Lge,Date,HT,AT,HS,AS,GD,WDL,Goals
174450,14-15,SPA1,2014-11-22,Eibar,Real Madrid,0,4,-4,L,4
174640,14-15,SPA1,2015-04-11,Real Madrid,Eibar,3,0,3,W,3
189935,15-16,SPA1,2015-11-29,Eibar,Real Madrid,0,2,-2,L,2
190122,15-16,SPA1,2016-04-09,Real Madrid,Eibar,4,0,4,W,4
212361,16-17,SPA1,2016-10-02,Real Madrid,Eibar,1,1,0,D,2
212545,16-17,SPA1,2017-03-04,Eibar,Real Madrid,1,4,-3,L,5
219409,17-18,SPA1,2017-10-22,Real Madrid,Eibar,3,0,3,W,3
219593,17-18,SPA1,2018-03-10,Eibar,Real Madrid,1,2,-1,L,3
224852,18-19,SPA1,2018-11-24,Eibar,Real Madrid,3,0,3,W,3
225032,18-19,SPA1,2019-04-06,Real Madrid,Eibar,2,1,1,W,3


### Function to get last N games of specific team (streak)

In [69]:
def get_last_matches_info(team_name, n_previous_matches=5):
    ''' Retrieves info of last N games of a team: last games,
    WDL streak of the team in that games and points (5 last games by default)
    '''

    team_df, _, _= extract_team_df(team_name)
    last_n_games_df = team_df[-n_previous_matches:]
    
    # Print team streak in last n games
    streak = list()
    for i, game in last_n_games_df.iterrows():
        if (game["WDL"] == "W" and game["HT"] == team_name) or (game["WDL"] == "L" and game["AT"] == team_name):
            streak.append("W")
        elif game["WDL"] == "D":
            streak.append("D")
        else:
            streak.append("L")
    
    # Get points in that streak
    points = streak.count("W") * 3 + streak.count("D") 

    return last_n_games_df, streak, points

In [70]:
last_n_matches, streak, points = get_last_matches_info("Real Madrid", 5)
print(f"Streak: {streak}. Points: {points}")
last_n_matches

Streak: ['L', 'W', 'D', 'W', 'L']. Points: 7


Unnamed: 0,Sea,Lge,Date,HT,AT,HS,AS,GD,WDL,Goals
298599,22-23,SPA1,2023-01-07,Villarreal,Real Madrid,2,1,1,W,3
298623,22-23,SPA1,2023-01-22,Athletic Bilbao,Real Madrid,0,2,-2,L,2
298633,22-23,SPA1,2023-01-29,Real Madrid,Real Sociedad,0,0,0,D,0
298636,22-23,SPA1,2023-02-02,Real Madrid,Valencia,2,0,2,W,2
298642,22-23,SPA1,2023-02-05,Mallorca,Real Madrid,1,0,1,W,1


### Get WDL in the current season for a team

In [75]:
def get_season_points(team_name, season="22-23"):
    ''' Computes points of the team in a specific season. If season
    is not provided, computes points from current season (22-23).
    '''
    
    seasons = training_set["Sea"].value_counts().index.tolist()
    
    if season in seasons:
        team_df, _, _ = extract_team_df(team_name)   
        season_team_df = team_df[team_df["Sea"] == season]
        season_games = season_team_df.shape[0]
        
        wins = len(season_team_df[(season_team_df['HT'] == team) & (season_team_df['WDL']=="W")]) + \
            len(season_team_df[(season_team_df['AT'] == team) & (season_team_df['WDL']=="L")])
        draws = len(season_team_df[season_team_df['WDL'] == "D"])

        season_points = 3 * wins + draws
        avg_points_season = round(season_points / season_games, 3)
        
        return season_points, avg_points_season
    
    else:
        raise ValueError("Invalid season.")

In [80]:
season_points, avg_points_season = get_season_points("Real Madrid", "22-23")
print(f"Total points of the team in season: {season_points} (average points: {avg_points_season})")

Total points of the team in season: 45 (average points: 2.25)


------------------

### **Load teams file from pickle**

In [None]:
with open("teams.pickle", "rb") as teams_file:
    teams = pickle.load(teams_file)