# Get points of team before date

The idea of this is to create a table, where rows are dates which cover every day of the season and columns represent each team. The values will represent the number of league points each team has at each date.

In [1]:
# import packages
import pandas as pd
import numpy as np
import datetime
import math
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.3f}'.format

In [2]:
# Read fixtures
fixtures = pd.read_csv('../data/csv/fixtures.csv')
fixtures['kickoff_time'] = pd.to_datetime(fixtures['kickoff_time'], utc=True)
fixtures.head()

Unnamed: 0,event,kickoff_time,team_h,team_a,team_h_score,team_a_score,team_h_points,team_a_points
0,1,2018-08-12 15:00:00+00:00,0,12,0,2,0,3
1,1,2018-08-11 14:00:00+00:00,1,4,2,0,3,0
2,1,2018-08-11 14:00:00+00:00,8,6,0,2,0,3
3,1,2018-08-11 14:00:00+00:00,9,5,0,3,0,3
4,1,2018-08-12 12:30:00+00:00,11,18,4,0,3,0


In [3]:
# Read teams
teams = pd.read_csv('../data/csv/teams.csv')
teams.head()

Unnamed: 0,name,short_name,strength,strength_attack_home,strength_attack_away,strength_defence_home,strength_defence_away,strength_overall_home,strength_overall_away
0,Arsenal,ARS,4,1240,1270,1310,1340,1260,1320
1,Bournemouth,BOU,3,1040,1100,1120,1130,1030,1130
2,Brighton,BHA,2,1040,1140,1010,1070,1030,1050
3,Burnley,BUR,3,990,1030,1000,1040,1070,1100
4,Cardiff,CAR,2,1030,1060,1020,1090,1030,1080


In [4]:
# Find dates of first and last days of the season

# Subtract one day from start of the season
first_day = np.min(fixtures['kickoff_time']).replace(hour=0, minute=0, second=0) - datetime.timedelta(days=1)

# Add one day to end of season
last_day = np.max(fixtures['kickoff_time']).replace(hour=0, minute=0, second=0) + datetime.timedelta(days=1)

season_length = (last_day - first_day).days

print('First day of the season:       ' + first_day.strftime('%Y-%m-%d'))
print('Last day of the season:        ' + last_day.strftime('%Y-%m-%d'))
print('Days between first and last:   ' + str(season_length))


First day of the season:       2018-08-09
Last day of the season:        2019-05-13
Days between first and last:   277


In [5]:
# Create array of dates
date_table = pd.DataFrame(pd.date_range(first_day, last_day, season_length+1), columns=['date'])
date_dict = dict(date_table)
date_table.head()

Unnamed: 0,date
0,2018-08-09 00:00:00+00:00
1,2018-08-10 00:00:00+00:00
2,2018-08-11 00:00:00+00:00
3,2018-08-12 00:00:00+00:00
4,2018-08-13 00:00:00+00:00


In total, there will be three tables which represent
1. Total number of points
2. Total number of HOME points
3. Total number of AWAY points

as a function of date.

In [6]:
# Calculate the response of the calculation by weighted more recent games heavier than older ones.
# The standard deviation pf the gaussian can be tuned.

def weightedMeanByDate(point_store, date_store, use_weights, sigma, plotBool):
    if all(v is math.nan for v in point_store):
        # If all points are nan, just return 0
        y = 0
    else:
        if use_weights:
            # more recent results are weighted heavier according to gaussian distribution
            date_store = pd.to_datetime(date_store)
            new_dates = (date_store - date_store.iloc[-1]).dt.days
            response = (1/np.sqrt(2*math.pi*(sigma**2)))*np.exp(-(new_dates**2)/(2*(sigma**2)))
            response = response/np.sum(response)
            if plotBool:
                plt.plot(new_dates, response/np.max(response), 'o-')
                plt.xlabel('day since last game')
                plt.ylabel('relative weighting')
                plt.show()
            y = np.sum(point_store*response)
        else:
            y = np.nanmean(point_store)
    return y


# Demo of weighted mean by date.
sigma = 7
date_store_demo = pd.Series([\
                             datetime(year=2018, month=1, day=1), \
                             datetime(year=2018, month=1, day=6), \
                             datetime(year=2018, month=1, day=11)])
point_store_demo = pd.Series([1,2,3])
l = weightedMeanByDate(point_store_demo, date_store_demo, True, sigma, True)
print(l)

TypeError: 'module' object is not callable

In [None]:
# Show response as a function of days since previous fixture
# How would a full response curve of this sigma look?
days = np.linspace(-14,0,15)
days_response = (1/np.sqrt(2*math.pi*(sigma**2)))*np.exp(-(days**2)/(2*(sigma**2)))
days_response = days_response/np.max(days_response)
plt.plot(days, days_response, '-o')
plt.xlabel('days since previous fixtures')
plt.ylabel('relative weighting')
plt.xticks(days)
plt.show()

In [None]:
# Calculate loop parameters

# Get number of teams
n_teams = len(teams)

# Get number of fixtures
n_fixtures = len(fixtures)

# Running window size
wsz = 3

# window response
sigma = 7 # Setting standard deviation of weighting to one week
# This means the relative weighting of points compared to the most recent fixture is about 61% after 7 days

# Empty array for dates
date_table_running_points = date_table
date_table_running_points_h = date_table
date_table_running_points_a = date_table
date_table_last_n_points = date_table
date_table_last_n_points_h = date_table
date_table_last_n_points_a = date_table


In [None]:
# Loop through teams
for i in range(0,20):
    # 20 teams (fixed)
    cTeam = str(teams['short_name'][i])
    
    # Points from game on this date
    team_points_per_game = pd.Series(np.zeros(38))
    
    # Running totals
    team_points_running = pd.Series(np.zeros(38))
    team_h_points_running = pd.Series(np.zeros(38))
    team_a_points_running = pd.Series(np.zeros(38))
    
    # Running totals
    team_points_mean_last_n = pd.Series(np.zeros(38))
    team_h_points_mean_last_n = pd.Series(np.zeros(38))
    team_a_points_mean_last_n = pd.Series(np.zeros(38))
    
    # Series to store points and dates of all/home/away games
    team_points_store = pd.Series([np.nan]*wsz)
    team_h_points_store = pd.Series([np.nan]*wsz)
    team_a_points_store = pd.Series([np.nan]*wsz)
    team_dates_store = pd.Series([np.nan]*wsz)
    team_h_dates_store = pd.Series([np.nan]*wsz)
    team_a_dates_store = pd.Series([np.nan]*wsz)
    
    # Find all fixtures for this team (home and away)
    team_fixtures = fixtures[np.logical_or(fixtures['team_h']==i, fixtures['team_a']==i)]
    team_fixtures = team_fixtures.reset_index(drop=True)
    
    # Loop through fixtures for this team
    for j in range(0,38):
        # 38 fixtures (fixed)
        
        # Calculate weighted mean from previously stored results
        team_points_mean_last_n[j] = weightedMeanByDate(team_points_store, team_dates_store, True, sigma, False)
        team_h_points_mean_last_n[j] = weightedMeanByDate(team_h_points_store, team_h_dates_store, True, sigma, False)
        team_a_points_mean_last_n[j] = weightedMeanByDate(team_a_points_store, team_a_dates_store, True, sigma, False)
        
        
        # Two cases: Home and Away
        if team_fixtures['team_h'][j]==i:
            # if team is at home
            
            # manipulate store of home game points
            # push older points and dates to the start of array, insert new points/date at the end of array
            for l in range(0,(wsz-1)):
                team_h_points_store[l] = team_h_points_store[l+1]
                team_h_dates_store[l] = team_h_dates_store[l+1]
            team_h_points_store[(wsz-1)] = team_fixtures['team_h_points'][j]
            team_h_dates_store[(wsz-1)] = team_fixtures['kickoff_time'][j]
            
            # Add to running points total
            team_points_per_game[j] = team_fixtures['team_h_points'][j]
            
            # Calculate running home points total
            if j==0:
                # if first game of the season
                team_h_points_running[j] = team_fixtures['team_h_points'][j]
            else:
                # if NOT first game of the season
                team_a_points_running[j] = team_a_points_running[j-1]
                team_h_points_running[j] = team_h_points_running[j-1] + team_fixtures['team_h_points'][j]
                
        else:
            # if team is away
            
            # manipulate store of home game points
            # push older points and dates to the start of array, insert new points/date at the end of array
            for l in range(0,(wsz-1)):
                team_a_points_store[l] = team_a_points_store[l+1]
                team_a_dates_store[l] = team_a_dates_store[l+1]
            team_a_points_store[(wsz-1)] = team_fixtures['team_a_points'][j]
            team_a_dates_store[(wsz-1)] = team_fixtures['kickoff_time'][j]
            
            # Add to running points total
            team_points_per_game[j] = team_fixtures['team_a_points'][j]
            
            if j==0:
                # if first game of the season
                team_a_points_running[j] = team_fixtures['team_a_points'][j]
            else:
                # if NOT first game of the season
                team_a_points_running[j] = team_a_points_running[j-1] + team_fixtures['team_a_points'][j]
                team_h_points_running[j] = team_h_points_running[j-1]
                
        
        # manipulate store of home game points
        # push older points and dates to the start of array, insert new points/date at the end of array
        for l in range(0,(wsz-1)):
            team_points_store[l] = team_points_store[l+1]
            team_dates_store[l] = team_dates_store[l+1]
        team_points_store[(wsz-1)] = team_points_per_game[j]
        team_dates_store[(wsz-1)] = team_fixtures['kickoff_time'][j]        
            
            
        # Add running points (total, home and away) to dataframe
        team_fixtures['team_points_per_game'] = team_points_per_game
        
        team_points_running = np.array(team_h_points_running) + np.array(team_a_points_running)
        team_fixtures['team_points_running']  = team_points_running
        
        team_fixtures['team_h_points_running'] = team_h_points_running
        team_fixtures['team_a_points_running'] = team_a_points_running
        
        team_fixtures['team_points_mean_last_n'] = team_points_mean_last_n
        team_fixtures['team_h_points_mean_last_n'] = team_h_points_mean_last_n
        team_fixtures['team_a_points_mean_last_n'] = team_a_points_mean_last_n
    
    
    # This table now must be interpolated the cover every day in the football season
    date_table_running_points_tmp = np.zeros(len(date_table))
    date_table_running_points_h_tmp = np.zeros(len(date_table))
    date_table_running_points_a_tmp = np.zeros(len(date_table))
    date_table_last_n_points_tmp = np.zeros(len(date_table))
    date_table_last_n_points_h_tmp = np.zeros(len(date_table))
    date_table_last_n_points_a_tmp = np.zeros(len(date_table))
    
    for m in range(0, len(date_table)):
        a = np.sum(team_fixtures['kickoff_time'] <= date_table['date'][m])
        if a>37:
            a = 37
        date_table_running_points_tmp[m] = team_fixtures['team_points_running'][a]
        date_table_running_points_h_tmp[m] = team_fixtures['team_h_points_running'][a]
        date_table_running_points_a_tmp[m] = team_fixtures['team_a_points_running'][a]
        date_table_last_n_points_tmp[m] = team_fixtures['team_points_mean_last_n'][a]
        date_table_last_n_points_h_tmp[m] = team_fixtures['team_h_points_mean_last_n'][a]
        date_table_last_n_points_a_tmp[m] = team_fixtures['team_h_points_mean_last_n'][a]
        
        
    date_table_running_points[cTeam] = pd.Series(date_table_running_points_tmp)
    date_table_running_points_h[cTeam] = pd.Series(date_table_running_points_h_tmp)
    date_table_running_points_a[cTeam] = pd.Series(date_table_running_points_a_tmp)
    date_table_last_n_points[cTeam] = pd.Series(date_table_last_n_points_tmp)
    date_table_last_n_points_h[cTeam] = pd.Series(date_table_last_n_points_h_tmp)
    date_table_last_n_points_a[cTeam] = pd.Series(date_table_last_n_points_a_tmp)
    

In [None]:
date_table_running_points.head(300)
date_table_running_points.to_csv(r'../data/csv/teams_running_points_by_date.csv')

In [None]:
date_table_running_points_h.head(300)
date_table_running_points_h.to_csv(r'../data/csv/teams_running_points_h_by_date.csv')