In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
from datetime import datetime

from dotenv import load_dotenv

import xlsxwriter

# import requests
# import json

# Timestamp of this run, formatted as YYYYMMDD-HHMM (local time).
date = format(datetime.now(), "%Y%m%d-%H%M")
print(date)

20230124-0950


In [None]:
# Historic results are pulled from football-data.co.uk, one CSV per
# season/league:
#   https://www.football-data.co.uk/mmz4281/<season>/<league>.csv
# e.g. https://www.football-data.co.uk/mmz4281/2223/E0.csv
# where 2223 is the season and E0 is the league.
def _read_results(season, league='E0'):
    """Download one season/league results CSV and return it as a DataFrame.

    Parameters
    ----------
    season : str
        Season code exactly as it appears in the URL, e.g. '2223'.
    league : str
        League code exactly as it appears in the URL, e.g. 'E0' or 'E1'.
    """
    return pd.read_csv(f'https://www.football-data.co.uk/mmz4281/{season}/{league}.csv')


# read in data (one download per file)
df02223 = _read_results('2223', 'E0')
df12223 = _read_results('2223', 'E1')
df02122 = _read_results('2122', 'E0')
df12122 = _read_results('2122', 'E1')
df2021 = _read_results('2021')
df1920 = _read_results('1920')
df1819 = _read_results('1819')
df1718 = _read_results('1718')
df1617 = _read_results('1617')
df1516 = _read_results('1516')
df1415 = _read_results('1415')
df1314 = _read_results('1314')
df1213 = _read_results('1213')
df1112 = _read_results('1112')
df1011 = _read_results('1011')


# all loaded frames, newest season first, in the order they are concatenated later
df_list = [
    df02223, df12223,
    df02122, df12122,
    df2021, df1920, df1819, df1718, df1617,
    df1516, df1415, df1314, df1213, df1112, df1011,
]



### Alternative solution
Build the list of CSV URLs programmatically instead of hard-coding one `read_csv` call per file.

In [None]:
# Season codes as they appear in the download URLs: the last two digits of
# the start year followed by the last two digits of the following year,
# for start years 1993 through 2022 (1993 -> '9394', 2022 -> '2223').
seasons = [
    f'{start_year % 100:02d}{(start_year + 1) % 100:02d}'
    for start_year in range(1993, 2023)
]

# Full CSV URL for every season/league combination, in the form
# https://www.football-data.co.uk/mmz4281/<season>/<league>.csv
# with leagues E0, E1, E2, E3; ordered by season, then by league.
seasons_leagues = [
    f'https://www.football-data.co.uk/mmz4281/{season}/{league}.csv'
    for season in seasons
    for league in ('E0', 'E1', 'E2', 'E3')
]


print(seasons_leagues)

In [None]:
# TODO: load each of the csv files from seasons_leagues into a dataframe per file (not implemented yet)


In [None]:
# print unique values in the Div column
# `df` is only created by the concat cell further down, so referencing it here
# raises NameError on a fresh kernel (Restart & Run All). Build the combined
# column from df_list, which is already defined by the loading cell above.
print(pd.concat(df_list, ignore_index=True)['Div'].unique())

In [None]:
# concat dataframes into one dataframe
df = pd.concat(df_list, ignore_index=True)

# Replace a 4 digit year with a 2 digit year so every Date string has the same
# dd/mm/yy shape. A single regex anchored at the end of the string handles any
# four-digit year; replacing one literal year at a time silently skips every
# year that is not listed and leaves those rows unparseable as dd/mm/yy.
# Strings that already end in a two-digit year do not match and are left as-is.
# regex=True is explicit because the default of this argument differs between
# pandas versions.
df['Date'] = df['Date'].str.replace(r'/\d{2}(\d{2})$', r'/\1', regex=True)


df.head()

In [None]:
# Swap the league codes in Div for readable names. These are the same
# substring replacements as before, applied in the same order.
div_names = {
    'E0': 'Premier League',
    'E1': 'Championship',
    'E2': 'League One',
    'E3': 'League Two',
}
for div_code, div_name in div_names.items():
    df['Div'] = df['Div'].str.replace(div_code, div_name)

# Parse Date using the single format day/month/two-digit-year.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%y')

# Order chronologically (Date, then Time), renumber the rows from 0 and keep
# that row number as an explicit id column.
df = df.sort_values(by=['Date', 'Time'], ascending=True).reset_index(drop=True)
df['id'] = df.index


print(df.shape)
df.head()

In [None]:
# export to excel
# df.to_excel(f'../../output/{date}-historic.xlsx', index=False)

In [None]:

# League points awarded per result code: win, draw, loss.
points_map = dict(W=3, D=1, L=0)

# Results Function
def get_result(score, score_opp):
    """Classify a scoreline from the point of view of the side that scored `score`.

    Returns 'D' when both scores are equal, 'W' when `score` is greater than
    `score_opp`, and 'L' in every other case. Because 'L' is the fall-through,
    a comparison that is neither equal nor greater (e.g. one involving NaN)
    also yields 'L'.
    """
    if score == score_opp:
        return 'D'
    return 'W' if score > score_opp else 'L'

In [None]:
# NOTE(review): df_matches and its home_away column are not created in any
# visible cell. This cell presumably expects one row per team per match, with
# home_away == 'H' marking the home side — confirm where it is built.
is_home = df_matches['home_away'] == 'H'

# opponent is the other team in the fixture
df_matches['Opponent'] = np.where(is_home, df_matches['AwayTeam'], df_matches['HomeTeam'])

# create a column for goals scored
df_matches['goals_scored'] = np.where(is_home, df_matches['FTHG'], df_matches['FTAG'])
# create a column for goals conceded
df_matches['goals_conceded'] = np.where(is_home, df_matches['FTAG'], df_matches['FTHG'])

# create a column for result
# otypes is passed explicitly: without it np.vectorize must call get_result on
# the first element to infer the output type, and raises ValueError when
# df_matches is empty. Results are unchanged for non-empty input.
df_matches['result'] = np.vectorize(get_result, otypes=[object])(
    df_matches['goals_scored'], df_matches['goals_conceded'])

# create a column for points
df_matches['points'] = df_matches['result'].map(points_map)

# sort by date descending and id ascending
df_matches = df_matches.sort_values(by=['Date', 'id'], ascending=[False, True])

# add a season column (the season's start year), treating August as the first
# month of a season
# NOTE(review): a month-only cutoff mislabels any season that kicks off before
# August or finishes in August or later — check against the actual fixture dates.
df_matches['season'] = np.where(
    df_matches['Date'].dt.month >= 8,
    df_matches['Date'].dt.year,
    df_matches['Date'].dt.year - 1
    )

# reset index
df_matches = df_matches.reset_index(drop=True)

df_matches.head(30)


In [None]:

# League points awarded per result code: win, draw, loss.
# NOTE(review): points_map is already defined with the same contents in an
# earlier cell; one of the two definitions can be removed.
points_map = dict(W=3, D=1, L=0)

# Results Function
# NOTE(review): get_result is already defined with the same behaviour in an
# earlier cell; this redefinition shadows it and can be removed.
def get_result(score, score_opp):
    """Classify a scoreline from the point of view of the side that scored `score`.

    Returns 'D' when both scores are equal, 'W' when `score` is greater than
    `score_opp`, and 'L' in every other case. Because 'L' is the fall-through,
    a comparison that is neither equal nor greater (e.g. one involving NaN)
    also yields 'L'.
    """
    if score == score_opp:
        return 'D'
    return 'W' if score > score_opp else 'L'

In [None]:
# NOTE(review): this cell repeats the same df_matches steps as an earlier cell;
# re-running them recomputes the same columns, so one copy can be removed.
# NOTE(review): df_matches and its home_away column are not created in any
# visible cell. This cell presumably expects one row per team per match, with
# home_away == 'H' marking the home side — confirm where it is built.
is_home = df_matches['home_away'] == 'H'

# opponent is the other team in the fixture
df_matches['Opponent'] = np.where(is_home, df_matches['AwayTeam'], df_matches['HomeTeam'])

# create a column for goals scored
df_matches['goals_scored'] = np.where(is_home, df_matches['FTHG'], df_matches['FTAG'])
# create a column for goals conceded
df_matches['goals_conceded'] = np.where(is_home, df_matches['FTAG'], df_matches['FTHG'])

# create a column for result
# otypes is passed explicitly: without it np.vectorize must call get_result on
# the first element to infer the output type, and raises ValueError when
# df_matches is empty. Results are unchanged for non-empty input.
df_matches['result'] = np.vectorize(get_result, otypes=[object])(
    df_matches['goals_scored'], df_matches['goals_conceded'])

# create a column for points
df_matches['points'] = df_matches['result'].map(points_map)

# sort by date descending and id ascending
df_matches = df_matches.sort_values(by=['Date', 'id'], ascending=[False, True])

# add a season column (the season's start year), treating August as the first
# month of a season
# NOTE(review): a month-only cutoff mislabels any season that kicks off before
# August or finishes in August or later — check against the actual fixture dates.
df_matches['season'] = np.where(
    df_matches['Date'].dt.month >= 8,
    df_matches['Date'].dt.year,
    df_matches['Date'].dt.year - 1
    )

# reset index
df_matches = df_matches.reset_index(drop=True)

df_matches.head(30)
