In [1]:
import json
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import datetime as dt
from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_points(data):
    data['df']['home_score'] = pd.to_numeric(data['df']['result'].str.split('-', expand =  True)[0])
    data['df']['away_score'] = pd.to_numeric(data['df']['result'].str.split('-', expand =  True)[1])

    data['df'].loc[data['df']['home_score'] > data['df']['away_score'], ['home_points', 'away_points']] = [3,0]
    data['df'].loc[data['df']['home_score'] < data['df']['away_score'], ['home_points', 'away_points']] = [0,3]
    data['df'].loc[data['df']['home_score'] == data['df']['away_score'], ['home_points', 'away_points']] = [1,1]
    return(data)

def get_rank_difficulty(data):
    data['clubs_list'] = list(set(data['df']['home_team']) | set(data['df']['away_team']))
    data['points_df'] = pd.DataFrame()


    for i, club in enumerate(data['clubs_list']):
        data['temp_df'] = pd.DataFrame()
        club_home_points = data['df'].loc[data['df']['home_team' ] == club, 'home_points'].sum()
        club_away_points = data['df'].loc[data['df']['away_team' ] == club, 'away_points'].sum()

        club_home_games = data['df'].loc[data['df']['home_team' ] == club, 'home_points'].count()
        club_away_games = data['df'].loc[data['df']['away_team' ] == club, 'away_points'].count()


        data['temp_df']['club'] = [club]
        data['temp_df']['games'] = [club_home_games + club_away_games]
        data['temp_df']['points'] = [club_home_points + club_away_points]    

        data['points_df'] = data['points_df'].append(data['temp_df'])

        data['points_df'] = data['points_df'].sort_values(by=['points', 'games'], ascending = [False, True])    
        data['points_df']['difficulty'] = data['points_df']['points'].rank(ascending = True)
        data['points_df']['rank'] = data['points_df']['points'].rank(ascending = False)
        data['points_df'] = data['points_df'].reset_index(drop = True)    
    return(data)

def get_remining_difficulty(data):
    for i, club in enumerate(data['clubs_list']):
        yet_to_play = list(set(data['df'][data['df']['home_team'] == club]['away_team']) | set(data['df'][data['df']['away_team'] == club]['home_team']))
        yet_to_play = [x for x in data['clubs_list'] if x not in yet_to_play]
        yet_to_play.remove(club)

        remaining_difficulty = sum([int(data['points_df'][data['points_df']['club'] == x]['difficulty']) for x in yet_to_play])    
        yet_to_play_num  = [float(data['points_df'][data['points_df']['club'] == x]['rank']) for x in yet_to_play]   
        matches_remaining = len(yet_to_play_num)
        yet_to_play_num.sort()
        
        data['points_df'].loc[data['points_df']['club'] == club, 'remaining_difficulty'] = [remaining_difficulty]    
        data['points_df'].loc[data['points_df']['club'] == club, 'yet_to_play_rank'] = str(yet_to_play_num)
        data['points_df'].loc[data['points_df']['club'] == club, 'avg_rem_difficulty'] = int(remaining_difficulty/matches_remaining)
        data['points_df'].loc[data['points_df']['club'] == club, 'yet_to_play'] = str(yet_to_play)


    return(data)

In [3]:
data = {}

In [4]:
data['df'] = pd.read_csv('data/epl-2022-UTC.csv')
data['df'] = data['df'].dropna()
data['df'].columns = map(str.lower, data['df'].columns)
data['df'].columns = data['df'].columns.str.replace(' ', '_')

In [5]:
data['df'].head(2)

Unnamed: 0,match_number,round_number,date,location,home_team,away_team,result
0,1,1,05/08/2022 19:00,Selhurst Park,Crystal Palace,Arsenal,0 - 2
1,2,1,06/08/2022 11:30,Craven Cottage,Fulham,Liverpool,2 - 2


In [6]:
data = get_points(data)
data['df'].head(2)

Unnamed: 0,match_number,round_number,date,location,home_team,away_team,result,home_score,away_score,home_points,away_points
0,1,1,05/08/2022 19:00,Selhurst Park,Crystal Palace,Arsenal,0 - 2,0,2,0.0,3.0
1,2,1,06/08/2022 11:30,Craven Cottage,Fulham,Liverpool,2 - 2,2,2,1.0,1.0


In [7]:
data = get_rank_difficulty(data)
data['points_df'].head(2)

Unnamed: 0,club,games,points,difficulty,rank
0,Arsenal,20,50.0,20.0,1.0
1,Man City,21,45.0,19.0,2.0


In [8]:
data['points_df']

Unnamed: 0,club,games,points,difficulty,rank
0,Arsenal,20,50.0,20.0,1.0
1,Man City,21,45.0,19.0,2.0
2,Man Utd,21,42.0,18.0,3.0
3,Newcastle,21,40.0,17.0,4.0
4,Spurs,22,39.0,16.0,5.0
5,Brighton,20,34.0,15.0,6.0
6,Brentford,21,33.0,14.0,7.0
7,Fulham,22,32.0,13.0,8.0
8,Chelsea,21,30.0,12.0,9.0
9,Liverpool,20,29.0,11.0,10.0


In [None]:
list(set(data['df'][data['df']['home_team'] == club]['away_team']) | set(data['df'][data['df']['away_team'] == club]['home_team']))

In [9]:
for i, club in enumerate(data['clubs_list']):
    yet_to_play = list(set(data['df'][data['df']['home_team'] == club]['away_team']) | set(data['df'][data['df']['away_team'] == club]['home_team']))
    print(club, len(yet_to_play), yet_to_play)
    yet_to_play = [x for x in data['clubs_list'] if x not in yet_to_play]
    print(club, yet_to_play)
#     yet_to_play.remove(club)
#     print(club, yet_to_play)


    remaining_difficulty = sum([int(data['points_df'][data['points_df']['club'] == x]['difficulty']) for x in yet_to_play])    
    yet_to_play_num  = [float(data['points_df'][data['points_df']['club'] == x]['rank']) for x in yet_to_play]   
    matches_remaining = len(yet_to_play_num)
    yet_to_play_num.sort()

    data['points_df'].loc[data['points_df']['club'] == club, 'remaining_difficulty'] = [remaining_difficulty]    
    data['points_df'].loc[data['points_df']['club'] == club, 'yet_to_play_rank'] = str(yet_to_play_num)
    data['points_df'].loc[data['points_df']['club'] == club, 'avg_rem_difficulty'] = int(remaining_difficulty/matches_remaining)
    data['points_df'].loc[data['points_df']['club'] == club, 'yet_to_play'] = str(yet_to_play)

Everton 19 ['Bournemouth', 'Aston Villa', 'Arsenal', 'Brighton', 'Southampton', 'Liverpool', 'Newcastle', 'Brentford', 'West Ham', 'Fulham', 'Wolves', 'Spurs', 'Man Utd', 'Leicester', 'Leeds', 'Chelsea', 'Nottingham Forest', 'Man City', 'Crystal Palace']
Everton ['Everton']
Bournemouth 19 ['Everton', 'Aston Villa', 'Arsenal', 'Brighton', 'Southampton', 'Liverpool', 'Newcastle', 'Brentford', 'West Ham', 'Fulham', 'Wolves', 'Spurs', 'Man Utd', 'Leicester', 'Leeds', 'Chelsea', 'Nottingham Forest', 'Man City', 'Crystal Palace']
Bournemouth ['Bournemouth']
Aston Villa 19 ['Everton', 'Bournemouth', 'Arsenal', 'Brighton', 'Southampton', 'Liverpool', 'Newcastle', 'Brentford', 'West Ham', 'Fulham', 'Wolves', 'Spurs', 'Man Utd', 'Leicester', 'Leeds', 'Chelsea', 'Nottingham Forest', 'Man City', 'Crystal Palace']
Aston Villa ['Aston Villa']
Arsenal 18 ['Everton', 'Spurs', 'Bournemouth', 'Aston Villa', 'Man Utd', 'Leicester', 'Leeds', 'Chelsea', 'Brighton', 'Southampton', 'Liverpool', 'Nottingham F

data = get_remining_difficulty(data)
data['points_df'].head(2)

In [None]:
data['points_df'] = data['points_df'].sort_values(by = ['remaining_difficulty'])
data['points_df'].to_csv('data/results.csv')

In [None]:
data['points_df'] 