In [2]:
import http.client
import json
import os
import pandas as pd
import time
import numpy as np

In [37]:
# API token for football-data.org, read from the environment so it never
# appears in the notebook; None when the variable is not set.
API_SOCCER = os.getenv("API_SOCCER")

# Collecting Data

In [38]:
# Use TLS so the auth token is not sent in clear text, and set a timeout so a
# stalled request cannot hang the notebook forever.
connection = http.client.HTTPSConnection('api.football-data.org', timeout=30)
# The API authenticates each request through the X-Auth-Token header.
headers = { 'X-Auth-Token': API_SOCCER }

In [39]:
# Competition codes used in the API path, with the league each one stands for
competitions = [
    'DED',  # Eredivisie
    'PL',   # Premier League
    'BL1',  # Bundesliga
    'FL1',  # Ligue 1
    'SA',   # Serie A
    'PD',   # Primera División
]
# Seasons to download
seasons = list(range(2018, 2021))

In [40]:
## Retrieving the data
# For every competition/season pair: download the finished matches, flatten
# the fields used later, and store the result as data/<comp>_<season>.csv
# and data/<comp>_<season>.pickle.

# to_csv / to_pickle do not create missing directories
os.makedirs("data", exist_ok=True)

for comp in competitions:
    for season in seasons:
        connection.request('GET', f"/v2/competitions/{comp}/matches?season={season}&status=FINISHED", None, headers )
        http_response = connection.getresponse()
        body = http_response.read().decode()

        # Surface the API's own error (bad token, rate limit, ...) instead of
        # failing later with a KeyError on 'matches'.
        if http_response.status != 200:
            raise RuntimeError(
                f"Request for {comp} {season} failed with HTTP {http_response.status}: {body}"
            )
        response = json.loads(body)

        # Build the DataFrame from all match records at once; growing it row
        # by row is quadratic and DataFrame.append no longer exists in pandas 2.
        matches = pd.DataFrame(response['matches'])

        if matches.empty:
            # Nothing to preprocess or save for this pair
            print(f"No finished matches returned for {comp} {season}, skipping")
        else:
            # Alphabetical column order gives the saved files a stable layout
            matches = matches.sort_index(axis=1)

            ## Preprocessing
            # Correct type for ID and setting as index
            matches = matches.astype({'id': 'int64'}).set_index('id')
            # Remove odds (ignored when the API did not send the field)
            matches = matches.drop(columns=['odds'], errors='ignore')
            # Include winner as separate column
            matches['winner'] = [d.get('winner') for d in matches['score']]
            # Include home team and away team as separate column
            matches['homeTeamName'] = [d.get('name') for d in matches['homeTeam']]
            matches['awayTeamName'] = [d.get('name') for d in matches['awayTeam']]

            ## Saving the data
            matches.to_csv(f"data/{comp}_{season}.csv")
            matches.to_pickle(f"data/{comp}_{season}.pickle")

        # Sleep 10 seconds, because we only have 10 calls per minute
        time.sleep(10)

## Combining the datasets into a single one
We first add a column indicating whether each match was played before ("pre") or after ("post") the start of the coronavirus restrictions, and then combine all datasets into one.

In [1]:
# Competition codes, with the league each one stands for
competitions = [
    'DED',  # Eredivisie
    'PL',   # Premier League
    'BL1',  # Bundesliga
    'FL1',  # Ligue 1
    'SA',   # Serie A
    'PD',   # Primera División
]
# Only the 2018 season is labelled in the next cell
seasons = [2018]

In [4]:
# Mark every match of the selected seasons as "pre" and write the pickle
# back to the same path.
for comp in competitions:
    for season in seasons:
        pickle_path = f"data/{comp}_{season}.pickle"
        df = pd.read_pickle(pickle_path).assign(corona="pre")
        df.to_pickle(pickle_path)

In [7]:
# Competition codes, with the league each one stands for
competitions = [
    'DED',  # Eredivisie
    'PL',   # Premier League
    'BL1',  # Bundesliga
    'FL1',  # Ligue 1
    'SA',   # Serie A
    'PD',   # Primera División
]
# Only the 2019 season is labelled in the next cell
seasons = [2019]

In [10]:
# Dates manually looked up on worldfootball.net
#
# Day of the last game with fans per competition. None means every match of
# the season is labelled "pre".
last_day_with_fans = {
    'DED': None,
    'PL': pd.Timestamp("2020-03-01"),   # 1st of March, last game with fans
    'BL1': pd.Timestamp("2020-03-08"),  # 8th of March, last game with fans
    'FL1': None,
    'SA': pd.Timestamp("2020-03-01"),   # 1st of March, last game with fans
    'PD': pd.Timestamp("2020-03-08"),   # 8th of March, last game with fans
}

for comp in competitions:
    for season in seasons:
        df = pd.read_pickle(f"data/{comp}_{season}.pickle")
        # Parse the kickoff time and drop the timezone so it can be compared
        # with the naive cutoff timestamps above.
        df["utcDate"] = pd.to_datetime(df["utcDate"]).dt.tz_localize(None)

        if comp not in last_day_with_fans:
            # No rule for this competition: leave its file untouched
            continue

        cutoff = last_day_with_fans[comp]
        if cutoff is None:
            df["corona"] = "pre"
        else:
            # Compare calendar days, not kickoff times: a match kicked off at
            # any hour of the cutoff day is still a game with fans. Comparing
            # the full timestamp against midnight would label it "post".
            match_day = df["utcDate"].dt.normalize()
            df["corona"] = np.where(match_day <= cutoff, "pre", "post")

        df.to_pickle(f"data/{comp}_{season}.pickle")
                

In [5]:
# Competition codes, with the league each one stands for
competitions = [
    'DED',  # Eredivisie
    'PL',   # Premier League
    'BL1',  # Bundesliga
    'FL1',  # Ligue 1
    'SA',   # Serie A
    'PD',   # Primera División
]
# Only the 2020 season is labelled in the next cell
seasons = [2020]

In [6]:
# Mark every match of the selected seasons as "post" and write the pickle
# back to the same path.
for comp in competitions:
    for season in seasons:
        pickle_path = f"data/{comp}_{season}.pickle"
        df = pd.read_pickle(pickle_path).assign(corona="post")
        df.to_pickle(pickle_path)

In [4]:
# Competition codes, with the league each one stands for
competitions = [
    'DED',  # Eredivisie
    'PL',   # Premier League
    'BL1',  # Bundesliga
    'FL1',  # Ligue 1
    'SA',   # Serie A
    'PD',   # Primera División
]
# All seasons that go into the combined dataset
seasons = list(range(2018, 2021))

In [5]:
# Load every per-competition/season pickle, tag it with its league and
# season, and stack everything into one DataFrame.
frames = []

for comp in competitions:
    for season in seasons:
        df = pd.read_pickle(f"data/{comp}_{season}.pickle")
        # Parse the kickoff time as a timezone-naive datetime
        df["utcDate"] = pd.to_datetime(df["utcDate"]).dt.tz_localize(None)
        df["league"] = comp
        df["year"] = season
        frames.append(df)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copies the accumulated frame on every iteration. pd.concat raises on an
# empty list, so fall back to an empty frame in that case.
final_df = pd.concat(frames) if frames else pd.DataFrame()

final_df.to_pickle("data/all_data.pickle")
final_df.to_csv("data/all_data.csv")

In [6]:
# Display the combined frame as the cell output (the rendered table below
# shows only the first and last rows)
final_df

Unnamed: 0_level_0,awayTeam,group,homeTeam,lastUpdated,matchday,referees,score,season,stage,status,utcDate,winner,homeTeamName,awayTeamName,corona,league,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
235183,"{'id': 673, 'name': 'SC Heerenveen'}",Regular Season,"{'id': 684, 'name': 'PEC Zwolle'}",2019-04-13T17:30:48Z,1,"[{'id': 43861, 'name': 'Dennis Higler', 'natio...","{'winner': 'AWAY_TEAM', 'duration': 'REGULAR',...","{'id': 156, 'startDate': '2018-08-10', 'endDat...",REGULAR_SEASON,FINISHED,2018-08-10 18:00:00,AWAY_TEAM,PEC Zwolle,SC Heerenveen,pre,DED,2018
235184,"{'id': 671, 'name': 'Heracles Almelo'}",Regular Season,"{'id': 678, 'name': 'AFC Ajax'}",2019-04-13T17:30:48Z,1,"[{'id': 43858, 'name': 'Serdar Gözübüyük', 'na...","{'winner': 'DRAW', 'duration': 'REGULAR', 'ful...","{'id': 156, 'startDate': '2018-08-10', 'endDat...",REGULAR_SEASON,FINISHED,2018-08-11 16:30:00,DRAW,AFC Ajax,Heracles Almelo,pre,DED,2018
235185,"{'id': 668, 'name': 'VVV Venlo'}",Regular Season,"{'id': 672, 'name': 'Willem II Tilburg'}",2019-04-13T17:30:48Z,1,"[{'id': 17141, 'name': 'Rob Dieperink', 'natio...","{'winner': 'AWAY_TEAM', 'duration': 'REGULAR',...","{'id': 156, 'startDate': '2018-08-10', 'endDat...",REGULAR_SEASON,FINISHED,2018-08-11 16:30:00,AWAY_TEAM,Willem II Tilburg,VVV Venlo,pre,DED,2018
235186,"{'id': 676, 'name': 'FC Utrecht'}",Regular Season,"{'id': 674, 'name': 'PSV'}",2019-04-13T17:30:48Z,1,"[{'id': 56905, 'name': 'Bas Nijhuis', 'nationa...","{'winner': 'HOME_TEAM', 'duration': 'REGULAR',...","{'id': 156, 'startDate': '2018-08-10', 'endDat...",REGULAR_SEASON,FINISHED,2018-08-11 18:45:00,HOME_TEAM,PSV,FC Utrecht,pre,DED,2018
235187,"{'id': 1920, 'name': 'Fortuna Sittard'}",Regular Season,"{'id': 670, 'name': 'SBV Excelsior'}",2019-04-13T17:30:48Z,1,"[{'id': 56902, 'name': 'Siemen Mulder', 'natio...","{'winner': 'DRAW', 'duration': 'REGULAR', 'ful...","{'id': 156, 'startDate': '2018-08-10', 'endDat...",REGULAR_SEASON,FINISHED,2018-08-11 18:45:00,DRAW,SBV Excelsior,Fortuna Sittard,pre,DED,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308408,"{'id': 94, 'name': 'Villarreal CF'}",Regular Season,"{'id': 82, 'name': 'Getafe CF'}",2020-11-14T08:33:17Z,9,"[{'id': 58184, 'name': 'José Luis Munuera', 'n...","{'winner': 'AWAY_TEAM', 'duration': 'REGULAR',...","{'id': 635, 'startDate': '2020-09-13', 'endDat...",REGULAR_SEASON,FINISHED,2020-11-08 13:00:00,AWAY_TEAM,Getafe CF,Villarreal CF,post,PD,2020
308409,"{'id': 83, 'name': 'Granada CF'}",Regular Season,"{'id': 92, 'name': 'Real Sociedad de Fútbol'}",2020-11-14T08:33:17Z,9,"[{'id': 43843, 'name': 'Carlos del Cerro', 'na...","{'winner': 'HOME_TEAM', 'duration': 'REGULAR',...","{'id': 635, 'startDate': '2020-09-13', 'endDat...",REGULAR_SEASON,FINISHED,2020-11-08 15:15:00,HOME_TEAM,Real Sociedad de Fútbol,Granada CF,post,PD,2020
308406,"{'id': 263, 'name': 'Deportivo Alavés'}",Regular Season,"{'id': 88, 'name': 'Levante UD'}",2020-11-14T08:33:17Z,9,"[{'id': 57930, 'name': 'Isidro Díaz de Mera', ...","{'winner': 'DRAW', 'duration': 'REGULAR', 'ful...","{'id': 635, 'startDate': '2020-09-13', 'endDat...",REGULAR_SEASON,FINISHED,2020-11-08 17:30:00,DRAW,Levante UD,Deportivo Alavés,post,PD,2020
308410,"{'id': 77, 'name': 'Athletic Club'}",Regular Season,"{'id': 250, 'name': 'Real Valladolid CF'}",2020-11-14T08:33:17Z,9,"[{'id': 32457, 'name': 'César Soto', 'national...","{'winner': 'HOME_TEAM', 'duration': 'REGULAR',...","{'id': 635, 'startDate': '2020-09-13', 'endDat...",REGULAR_SEASON,FINISHED,2020-11-08 17:30:00,HOME_TEAM,Real Valladolid CF,Athletic Club,post,PD,2020


In [7]:
# Season and league code to inspect in the combined data
year, league = 2020, 'PL'

In [9]:
# Rows of the combined frame that match the selected season and league
final_df.loc[final_df['year'].eq(year) & final_df['league'].eq(league)]

Unnamed: 0_level_0,awayTeam,group,homeTeam,lastUpdated,matchday,referees,score,season,stage,status,utcDate,winner,homeTeamName,awayTeamName,corona,league,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
303759,"{'id': 57, 'name': 'Arsenal FC'}",Regular Season,"{'id': 63, 'name': 'Fulham FC'}",2020-09-13T00:08:13Z,1,"[{'id': 11443, 'name': 'Chris Kavanagh', 'nati...","{'winner': 'AWAY_TEAM', 'duration': 'REGULAR',...","{'id': 619, 'startDate': '2020-09-12', 'endDat...",REGULAR_SEASON,FINISHED,2020-09-12 11:30:00,AWAY_TEAM,Fulham FC,Arsenal FC,post,PL,2020
303764,"{'id': 340, 'name': 'Southampton FC'}",Regular Season,"{'id': 354, 'name': 'Crystal Palace FC'}",2020-09-12T23:59:45Z,1,"[{'id': 11567, 'name': 'Jonathan Moss', 'natio...","{'winner': 'HOME_TEAM', 'duration': 'REGULAR',...","{'id': 619, 'startDate': '2020-09-12', 'endDat...",REGULAR_SEASON,FINISHED,2020-09-12 14:00:00,HOME_TEAM,Crystal Palace FC,Southampton FC,post,PL,2020
303760,"{'id': 341, 'name': 'Leeds United FC'}",Regular Season,"{'id': 64, 'name': 'Liverpool FC'}",2020-09-12T23:59:45Z,1,"[{'id': 11605, 'name': 'Michael Oliver', 'nati...","{'winner': 'HOME_TEAM', 'duration': 'REGULAR',...","{'id': 619, 'startDate': '2020-09-12', 'endDat...",REGULAR_SEASON,FINISHED,2020-09-12 16:30:00,HOME_TEAM,Liverpool FC,Leeds United FC,post,PL,2020
303763,"{'id': 67, 'name': 'Newcastle United FC'}",Regular Season,"{'id': 563, 'name': 'West Ham United FC'}",2020-09-13T18:34:40Z,1,"[{'id': 11494, 'name': 'Stuart Attwell', 'nati...","{'winner': 'AWAY_TEAM', 'duration': 'REGULAR',...","{'id': 619, 'startDate': '2020-09-12', 'endDat...",REGULAR_SEASON,FINISHED,2020-09-12 19:00:00,AWAY_TEAM,West Ham United FC,Newcastle United FC,post,PL,2020
303762,"{'id': 338, 'name': 'Leicester City FC'}",Regular Season,"{'id': 74, 'name': 'West Bromwich Albion FC'}",2020-09-13T23:59:30Z,1,"[{'id': 11580, 'name': 'Anthony Taylor', 'nati...","{'winner': 'AWAY_TEAM', 'duration': 'REGULAR',...","{'id': 619, 'startDate': '2020-09-12', 'endDat...",REGULAR_SEASON,FINISHED,2020-09-13 13:00:00,AWAY_TEAM,West Bromwich Albion FC,Leicester City FC,post,PL,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303833,"{'id': 63, 'name': 'Fulham FC'}",Regular Season,"{'id': 563, 'name': 'West Ham United FC'}",2020-11-08T03:10:25Z,8,"[{'id': 11446, 'name': 'Robert Jones', 'nation...","{'winner': 'HOME_TEAM', 'duration': 'REGULAR',...","{'id': 619, 'startDate': '2020-09-12', 'endDat...",REGULAR_SEASON,FINISHED,2020-11-07 20:00:00,HOME_TEAM,West Ham United FC,Fulham FC,post,PL,2020
303831,"{'id': 73, 'name': 'Tottenham Hotspur FC'}",Regular Season,"{'id': 74, 'name': 'West Bromwich Albion FC'}",2020-11-08T18:58:30Z,8,"[{'id': 11423, 'name': 'Andy Madley', 'nationa...","{'winner': 'AWAY_TEAM', 'duration': 'REGULAR',...","{'id': 619, 'startDate': '2020-09-12', 'endDat...",REGULAR_SEASON,FINISHED,2020-11-08 12:00:00,AWAY_TEAM,West Bromwich Albion FC,Tottenham Hotspur FC,post,PL,2020
303832,"{'id': 76, 'name': 'Wolverhampton Wanderers FC'}",Regular Season,"{'id': 338, 'name': 'Leicester City FC'}",2020-11-08T19:36:42Z,8,"[{'id': 11580, 'name': 'Anthony Taylor', 'nati...","{'winner': 'HOME_TEAM', 'duration': 'REGULAR',...","{'id': 619, 'startDate': '2020-09-12', 'endDat...",REGULAR_SEASON,FINISHED,2020-11-08 14:00:00,HOME_TEAM,Leicester City FC,Wolverhampton Wanderers FC,post,PL,2020
303827,"{'id': 64, 'name': 'Liverpool FC'}",Regular Season,"{'id': 65, 'name': 'Manchester City FC'}",2020-11-08T21:48:51Z,8,"[{'id': 11585, 'name': 'Craig Pawson', 'nation...","{'winner': 'DRAW', 'duration': 'REGULAR', 'ful...","{'id': 619, 'startDate': '2020-09-12', 'endDat...",REGULAR_SEASON,FINISHED,2020-11-08 16:30:00,DRAW,Manchester City FC,Liverpool FC,post,PL,2020
