In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Load the match results, keeping only the columns used below: the two team
# names, the home/away goal counts (FTHG/FTAG) and the result code (FTR),
# which later cells treat as 'H', 'D' or 'A'.
data = pd.read_csv('pl22.csv', usecols=['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', ])

In [3]:
# Summary statistics for the numeric columns (the goal counts).
data.describe()

Unnamed: 0,FTHG,FTAG
count,380.0,380.0
mean,1.513158,1.305263
std,1.326545,1.258836
min,0.0,0.0
25%,1.0,0.0
50%,1.0,1.0
75%,2.0,2.0
max,7.0,6.0


In [4]:
# Peek at the first few matches to confirm the columns loaded as expected.
data.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,Brentford,Arsenal,2,0,H
1,Man United,Leeds,5,1,H
2,Burnley,Brighton,1,2,A
3,Chelsea,Crystal Palace,3,0,H
4,Everton,Southampton,3,1,H


In [5]:
# Collect every club that appears as either the home or the away side, in
# order of first appearance, and give each one a 1-based integer id.
all_team_names = pd.concat([data['HomeTeam'], data['AwayTeam']])
teams = pd.DataFrame({'Team': all_team_names.unique()})
teams_dict = dict(zip(teams['Team'], range(1, len(teams) + 1)))
teams

Unnamed: 0,Team
0,Brentford
1,Man United
2,Burnley
3,Chelsea
4,Everton
5,Leicester
6,Watford
7,Norwich
8,Newcastle
9,Tottenham


In [6]:
# Swap the club names in both fixture columns for their integer ids.
# NOTE: this overwrites the name columns, so the cell that builds
# `teams_dict` must not be re-run after this one.
for side in ('HomeTeam', 'AwayTeam'):
    data[side] = data[side].map(teams_dict)

In [8]:
# Re-encode the result code in place: a draw becomes 0, otherwise the id of
# the winning side is stored. Any value other than 'H'/'D'/'A' is left as is.
winner_column = {'H': 'HomeTeam', 'A': 'AwayTeam'}

for match_idx, match_row in data.iterrows():
    outcome = match_row['FTR']
    if outcome == 'D':
        data.at[match_idx, 'FTR'] = 0
    elif outcome in winner_column:
        data.at[match_idx, 'FTR'] = match_row[winner_column[outcome]]

In [9]:
# Give the abbreviated columns descriptive names; FTR now holds the winner id
# (or 0 for a draw), hence "Winner".
data = data.rename(
    columns={
        "FTHG": "HomeGoals",
        "FTAG": "AwayGoals",
        "FTR": "Winner",
    }
)

In [10]:
# Build one season-summary record per team.
#
# `data` must already carry the renamed columns: HomeTeam/AwayTeam (team ids),
# HomeGoals/AwayGoals, and Winner (id of the winning side, or 0 for a draw).
# The matches are walked exactly once and each match updates the two teams
# involved, instead of rescanning the whole frame for every team.
venue_counters = [
    f"{stat}{venue}"
    for venue in ("Home", "Away")
    for stat in ("GoalsFor", "GoalsAgainst", "Wins", "Draws", "Losses")
]
tallies = {
    teams_dict[name]: dict.fromkeys(venue_counters, 0)
    for name in teams['Team']
}

for match in data.itertuples(index=False):
    # (venue suffix, team id, goals scored by that team, goals conceded)
    sides = (
        ("Home", match.HomeTeam, match.HomeGoals, match.AwayGoals),
        ("Away", match.AwayTeam, match.AwayGoals, match.HomeGoals),
    )
    for venue, side_id, scored, conceded in sides:
        tally = tallies.get(side_id)
        if tally is None:
            # Id that is not in teams_dict (e.g. an unmapped name): ignore it,
            # as a per-team scan would never have matched it either.
            continue

        tally[f"GoalsFor{venue}"] += scored
        tally[f"GoalsAgainst{venue}"] += conceded

        if match.Winner == side_id:
            tally[f"Wins{venue}"] += 1
        elif match.Winner == 0:
            tally[f"Draws{venue}"] += 1
        else:
            tally[f"Losses{venue}"] += 1

# Emit the records in the order of `teams`, with the key order that defines
# the column order of the resulting DataFrame.
team_stats_list = []

for team_name in teams['Team']:
    team_id = teams_dict[team_name]
    tally = tallies[team_id]

    goals_for = tally["GoalsForHome"] + tally["GoalsForAway"]
    goals_against = tally["GoalsAgainstHome"] + tally["GoalsAgainstAway"]

    team_stats_list.append({
        "Team": team_name,
        "TeamID": team_id,
        "GoalsForHome": tally["GoalsForHome"],
        "GoalsAgainstHome": tally["GoalsAgainstHome"],
        "GoalsForAway": tally["GoalsForAway"],
        "GoalsAgainstAway": tally["GoalsAgainstAway"],
        "WinsHome": tally["WinsHome"],
        "DrawsHome": tally["DrawsHome"],
        "LossesHome": tally["LossesHome"],
        "WinsAway": tally["WinsAway"],
        "DrawsAway": tally["DrawsAway"],
        "LossesAway": tally["LossesAway"],
        "GoalsFor": goals_for,
        "GoalsAgainst": goals_against,
        "GoalsDifference": goals_for - goals_against,
        "Wins": tally["WinsHome"] + tally["WinsAway"],
        "Draws": tally["DrawsHome"] + tally["DrawsAway"],
        "Losses": tally["LossesHome"] + tally["LossesAway"],
    })

In [11]:
# One row per team; columns follow the key order of the records built above.
team_stats = pd.DataFrame(team_stats_list)

In [12]:
# Load the league table. Later cells read the columns Squad, Rk, Pts, Pts/MP,
# xG, xGA, xGD and xGD/90 from it.
standings = pd.read_csv('pl22-table.csv')
standings

Unnamed: 0,Rk,Squad,Pts,Pts/MP,xG,xGA,xGD,xGD/90
0,1,Manchester City,93,2.45,88.7,24.6,64.0,1.68
1,2,Liverpool,92,2.42,88.7,33.8,54.9,1.45
2,3,Chelsea,74,1.95,63.4,33.2,30.1,0.79
3,4,Tottenham,71,1.87,61.2,39.3,21.9,0.58
4,5,Arsenal,69,1.82,60.5,45.7,14.9,0.39
5,6,Manchester Utd,58,1.53,55.8,53.0,2.8,0.07
6,7,West Ham,56,1.47,51.4,53.5,-2.1,-0.06
7,8,Leicester City,52,1.37,47.8,59.3,-11.5,-0.3
8,9,Brighton,51,1.34,46.2,42.9,3.3,0.09
9,10,Wolves,51,1.34,37.5,56.9,-19.4,-0.51


In [13]:
# Show the name -> id mapping; these are the spellings the league-table squad
# names have to be translated to in the next cell.
teams_dict

{'Brentford': 1,
 'Man United': 2,
 'Burnley': 3,
 'Chelsea': 4,
 'Everton': 5,
 'Leicester': 6,
 'Watford': 7,
 'Norwich': 8,
 'Newcastle': 9,
 'Tottenham': 10,
 'Liverpool': 11,
 'Aston Villa': 12,
 'Crystal Palace': 13,
 'Leeds': 14,
 'Man City': 15,
 'Brighton': 16,
 'Southampton': 17,
 'Wolves': 18,
 'Arsenal': 19,
 'West Ham': 20}

In [14]:
# League-table squad name -> spelling used in the match data (`teams_dict`).
squad_dict = {
    "Liverpool": "Liverpool",
    "Manchester City": "Man City",
    "Arsenal": "Arsenal",
    "Chelsea": "Chelsea",
    "Newcastle Utd": "Newcastle",
    "Aston Villa": "Aston Villa",
    "Burnley": "Burnley",
    "Brighton": "Brighton",
    "Norwich City": "Norwich",
    "Brentford": "Brentford",
    "Watford": "Watford",
    "Crystal Palace": "Crystal Palace",
    "Everton": "Everton",
    "West Ham": "West Ham",
    "Manchester Utd": "Man United",
    "Wolves": "Wolves",
    "Tottenham": "Tottenham",
    "Southampton": "Southampton",
    "Leeds United": "Leeds",
    "Leicester City": "Leicester",
}

# Translate each squad name; names without an entry are kept unchanged.
standings['Squad'] = standings['Squad'].map(
    lambda squad_name: squad_dict.get(squad_name, squad_name)
)

In [15]:
# Check that the squad names now use the match-data spellings.
standings

Unnamed: 0,Rk,Squad,Pts,Pts/MP,xG,xGA,xGD,xGD/90
0,1,Man City,93,2.45,88.7,24.6,64.0,1.68
1,2,Liverpool,92,2.42,88.7,33.8,54.9,1.45
2,3,Chelsea,74,1.95,63.4,33.2,30.1,0.79
3,4,Tottenham,71,1.87,61.2,39.3,21.9,0.58
4,5,Arsenal,69,1.82,60.5,45.7,14.9,0.39
5,6,Man United,58,1.53,55.8,53.0,2.8,0.07
6,7,West Ham,56,1.47,51.4,53.5,-2.1,-0.06
7,8,Leicester,52,1.37,47.8,59.3,-11.5,-0.3
8,9,Brighton,51,1.34,46.2,42.9,3.3,0.09
9,10,Wolves,51,1.34,37.5,56.9,-19.4,-0.51


In [16]:
# Attach the league-table metrics to each team with a single left join on the
# team name, instead of comparing every team row against every standings row.
standings_columns = ["Pts", "Pts/MP", "xG", "xGA", "xGD", "xGD/90", "Rank"]

standings_by_team = (
    standings
    .rename(columns={"Squad": "Team", "Rk": "Rank"})
    # If a squad is listed more than once, the last row wins.
    .drop_duplicates(subset="Team", keep="last")
    .loc[:, ["Team"] + standings_columns]
)

team_stats = (
    team_stats
    # Drop columns left over from a previous run so re-executing this cell
    # does not produce suffixed duplicates (Pts_x / Pts_y).
    .drop(columns=standings_columns, errors="ignore")
    # Left join: teams with no standings row keep their record and get
    # missing values, rather than the columns never being created.
    .merge(standings_by_team, on="Team", how="left")
    # Nullable integers so points and rank stay integral even with gaps.
    .astype({"Pts": "Int64", "Rank": "Int64"})
)

In [18]:
# Read the per-team average ratings (presumably a SofaScore export, going by
# the variable name) and index them by team name, rounded to two decimals.
with open('data/performances2022.json', 'r', encoding='utf-8') as json_file:
    sofascore_json = json.load(json_file)

teams_rates = {
    entry['team']["name"]: round(entry['statistics']['avgRating'], 2)
    for entry in sofascore_json["topTeams"]["avgRating"]
}

teams_rates

{'Man City': 7.18,
 'Liverpool': 7.16,
 'Chelsea': 7.06,
 'Tottenham': 7.02,
 'Brighton': 6.92,
 'West Ham': 6.91,
 'Man United': 6.9,
 'Arsenal': 6.9,
 'Wolves': 6.89,
 'Crystal Palace': 6.87,
 'Leicester': 6.86,
 'Aston Villa': 6.86,
 'Burnley': 6.86,
 'Brentford': 6.84,
 'Southampton': 6.8,
 'Newcastle': 6.79,
 'Everton': 6.78,
 'Leeds': 6.75,
 'Watford': 6.71,
 'Norwich': 6.68}

In [19]:
# Look up each team's average rating by name; a team with no entry in
# `teams_rates` ends up with NaN.
team_stats['Rating'] = team_stats['Team'].map(teams_rates)

In [20]:
# Inspect the combined per-team table (match aggregates, standings, rating).
team_stats

Unnamed: 0,Team,TeamID,GoalsForHome,GoalsAgainstHome,GoalsForAway,GoalsAgainstAway,WinsHome,DrawsHome,LossesHome,WinsAway,...,Draws,Losses,Pts,Pts/MP,xG,xGA,xGD,xGD/90,Rank,Rating
0,Brentford,1,22,21,26,35,7,3,9,6,...,7,18,46,1.21,45.8,48.5,-2.7,-0.07,13,6.84
1,Man United,2,32,22,25,35,10,5,4,6,...,10,12,58,1.53,55.8,53.0,2.8,0.07,6,6.9
2,Burnley,3,18,25,16,28,5,6,8,2,...,14,17,35,0.92,39.7,57.1,-17.4,-0.46,18,6.86
3,Chelsea,4,37,22,39,11,9,7,3,12,...,11,6,74,1.95,63.4,33.2,30.1,0.79,3,7.06
4,Everton,5,27,25,16,41,9,2,8,2,...,6,21,39,1.03,41.2,55.4,-14.2,-0.37,16,6.78
5,Leicester,6,34,23,28,36,10,4,5,4,...,10,14,52,1.37,47.8,59.3,-11.5,-0.3,8,6.86
6,Watford,7,17,46,17,31,2,2,15,4,...,5,27,23,0.61,40.0,64.5,-24.5,-0.64,19,6.71
7,Norwich,8,12,43,11,41,3,3,13,2,...,7,26,22,0.58,32.3,75.5,-43.2,-1.14,20,6.68
8,Newcastle,9,26,27,18,35,8,6,5,5,...,10,15,49,1.29,38.1,57.1,-19.0,-0.5,11,6.79
9,Tottenham,10,38,19,31,21,13,1,5,9,...,5,11,71,1.87,61.2,39.3,21.9,0.58,4,7.02


In [21]:
# List the final column set that gets written to disk.
team_stats.columns

Index(['Team', 'TeamID', 'GoalsForHome', 'GoalsAgainstHome', 'GoalsForAway',
       'GoalsAgainstAway', 'WinsHome', 'DrawsHome', 'LossesHome', 'WinsAway',
       'DrawsAway', 'LossesAway', 'GoalsFor', 'GoalsAgainst',
       'GoalsDifference', 'Wins', 'Draws', 'Losses', 'Pts', 'Pts/MP', 'xG',
       'xGA', 'xGD', 'xGD/90', 'Rank', 'Rating'],
      dtype='object')

In [22]:
# Save the combined table; index=False leaves the row index out of the file.
# NOTE(review): the inputs are read from the working directory ('pl22.csv',
# 'pl22-table.csv') but this writes under 'data/' — confirm the directory
# exists and that the layout is intended.
team_stats.to_csv("data/pl21-22.csv", index=False)