In [1]:
# load relevant libraries
import numpy as np
import pandas as pd

In [2]:
# create pandas from json files
teams_raw = pd.read_json('data/teams.json')
matches_raw = pd.read_json('data/matches_England.json')
events_raw = pd.read_json('data/events_England.json')
players_raw = pd.read_json('data/players.json', encoding='unicode-escape')
ranks_raw = pd.read_json('data/player_rank.json')
epl_stats = pd.read_csv('data/epl_stats.csv')

## In order to make the dataset size more manageable we will focus on the English Premier League. This includes 20 teams from the United Kingdom. The UK includes England, Wales, Northern Ireland, and Scotland.

In [3]:
# copy teams_raw to teams to start editing
teams = teams_raw.copy()

# rename some columns for clarity
teams.rename(columns={"wyId": "teamId"}, inplace=True)

# add country column to teams dataframe
teams['country'] = teams.area.map(lambda v:v['name'])
teams['countryId'] = teams.area.map(lambda v:v['id'])
teams['countryId'] = teams.countryId.astype('int64')

# drop redundant column 'area'
teams.drop(['area'], axis=1, inplace=True)

# filter teams data frame to only include countries in the UK, countryID is 0
uk_teams = teams[(teams.countryId == 0)]

# filter out national teams by only including club
premier = uk_teams[uk_teams.type == 'club']

# add stats to the teams table
premier = premier.merge(epl_stats, on='name')
# change position column name to be more consistent with other columns
premier.rename(columns={'Position':'position'}, inplace=True)

# set index as teamId
premier.set_index('teamId', inplace=True)

# sort by teamId
premier.sort_index(inplace=True)

## Now lets make some adjustments to the players data frame and then filter out players who aren't in the premier league

In [121]:
# copy players_raw to teams to start editing
players = players_raw.copy()

# rename some columns for clarity
players = players.rename(columns={"wyId": "playerId"})

# add position column to players dataframe
players['position'] = players.role.map(lambda v:v['code2'])

# convert currentTeamId to numeric, and coerce 'null' strings to Nan
players['currentTeamId'] = pd.to_numeric(players.currentTeamId, errors='coerce')
# convert birthDate to datetime object
players['birthDate'] = pd.to_datetime(players.birthDate)
# replace 0 values in height and weight with Nan
players['weight'] = players.weight.replace(0, np.nan)
players['height'] = players.height.replace(0, np.nan)
players['foot'] = players.foot.replace(['', 'null'], np.nan)

# drop unnecessary columns
players = players.drop(['passportArea', 'middleName', 'role', 'birthArea', 'currentNationalTeamId'], axis=1)

# make list of uk teams
uk_teams = premier.index.tolist()
# filter only uk players
uk_players = players[players.currentTeamId.isin(uk_teams)]

# Kyle Taylor shows as the only player with missing data now so lets fill it in manually
# with data found here: https://www.fifaindex.com/player/239676/kyle-taylor/fifa20/
# get the index for Kyle Taylor
KT_idx = uk_players.index[(uk_players.firstName == 'Kyle') & (uk_players.lastName == 'Taylor')][0]
uk_players.loc[KT_idx,['weight', 'height', 'foot']] = [70, 178, 'right']

# keeping only UK players removed players with currentTeamId = Nan so lets convert to integer
uk_players['currentTeamId'] = uk_players['currentTeamId'].astype('int64')

# set index as playerId
uk_players = uk_players.set_index('playerId')

# sort by playerId
uk_players.sort_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Let's clean up the matches data frame. This was imported from a json which was already filtered to the premier league so no filtering is required here.

In [5]:
# function for determining the matche outcome
def outcome(row):
    # given a row, who won/lost/draw
    # if the winner matches teamId then they won the match
    if row['winner'] == row['teamId']:
        return 'W'
    # if the winner is 0 then there was no winner and the outome is 'draw'
    if row['winner'] == 0:
        return 'D'
    # if the team didn't win or draw then they lost
    return 'L'

# select desired columns for matches dataframe
matches = matches_raw.loc[:,['wyId', 'teamsData', 'dateutc', 'winner']]

# rename some columns for clarity
matches.rename(columns={"wyId": "matchId"}, inplace=True)

# initialize lists for building series
team1_list = []
team1_home = []
team1_scores = []
team2_list = []
team2_scores = []

# iterate through each row in dataframe
for index, row in matches.iterrows():
    # find dict keys
    keys = list(row['teamsData'].keys())
    # assign keys to team 1 and team 2
    team1 = row['teamsData'][keys[0]]
    team2 = row['teamsData'][keys[1]]
    
    # build lists for each team of relevant info
    team1_list.append(team1['teamId'])
    team1_scores.append(team1['score'])
    team2_list.append(team2['teamId'])
    team2_scores.append(team2['score'])
    
    # determine home team
    if team1['side'] == 'home':
        team1_home.append(1)
    else:
        team1_home.append(0)

# create columns from lists generated above
matches['team1'] = team1_list
matches['team1_scores'] = team1_scores
matches['team1_home'] = team1_home
matches['team2'] = team2_list
matches['team2_scores'] = team2_scores

# remove redundant column
matches.drop(['teamsData'], axis=1, inplace=True)

# melt matches by team1/team2
melt_teamId = pd.melt(matches, id_vars=['matchId', 'dateutc', 'team1_home'],value_vars=['team1', 'team2'], value_name='teamId')
# melt matches by team1 and team 2 scores
melt_goals = pd.melt(matches, id_vars=['matchId', 'winner'], value_vars=['team1_scores', 'team2_scores'], value_name='goals')

#merge the melted df's together by index
melted_matches = pd.merge(melt_teamId, melt_goals, left_index=True, right_index=True)

# add column 'outcome' to track if the team won, lost or had a draw
melted_matches['outcome'] = melted_matches.apply(lambda row: outcome(row), axis=1)
# add column 'home' with boolean for whether or not the team was the home team
melted_matches['home'] = ((melted_matches.team1_home == 1) & (melted_matches.variable_x == 'team1')
                         | ((melted_matches.team1_home == 0) & (melted_matches.variable_x == 'team2')))

# rename column 'matchId_x' to 'matchId'
melted_matches.rename(columns={'matchId_x':'matchId'}, inplace=True)
# drop unnecessary columns
melted_matches.drop(['team1_home', 'variable_x', 'matchId_y', 'winner', 'variable_y'], axis=1, inplace=True)
# set the index by matchId and teamId
melted_matches.set_index(['matchId', 'teamId'], inplace=True)
# sort by index
melted_matches.sort_index(inplace=True)

## The player rank is already pretty clean so let's just filter out players who aren't in the premier league.

In [115]:
# copy ranks_raw to ranks to start editing
ranks = ranks_raw.copy()

# make list of uk players
players_list = uk_players.index.tolist()
# filter only uk players
uk_ranks = ranks[ranks.playerId.isin(players_list)]

# set index as playerId and matchId
uk_ranks = uk_ranks.set_index(['playerId', 'matchId'])

# sort by matchId
uk_ranks.sort_index(inplace=True)

## Let's clean up the events data set focusing on the tags and the xy positions.

In [265]:
# the following link is very helpful in understanding the different event types and tags
# https://footballdata.wyscout.com/events-manual/

# copy ranks_raw to ranks to start editing
events = events_raw.copy()

# split positions column into start and end position colums, then drop original column
events[['start_pos', 'end_pos']] = pd.DataFrame(events.positions.values.tolist(), index=events.index)

# fill rows where end pos is None with 0, 0 dict to allow for splitting in next step without error
# This occurs when there are certain fouls or protests
events['end_pos'] = events['end_pos'].apply(lambda x: {'y': 0, 'x': 0} if x is None else x)

# replace empty space in the following columns with the specified values using regex
events.subEventName.replace(r'^\s*$', 'Offside', regex=True, inplace=True)
events.subEventId.replace(r'^\s*$', '60', regex=True, inplace=True)

# split start_pos and end_pos to start_x, start_y, end_x, end_y
events[['start_y', 'start_x']] = pd.DataFrame(events.start_pos.values.tolist(), index=events.index)
events[['end_y', 'end_x']] = pd.DataFrame(events.end_pos.values.tolist(), index=events.index)

# remove columns which are no longer needed
events.drop(['positions', 'start_pos', 'end_pos'], axis=1, inplace=True)

# re-arrange the columns so features of higher interest are toward left
events = events.loc[:, ['matchId', 'matchPeriod', 'id', 'eventSec', 'start_x', 'start_y', 'end_x', 'end_y', 
                        'eventId','eventName', 'subEventId', 'subEventName', 'teamId', 'playerId', 'tags']]

# create new tags column with list instead of dictionary
tag_list = []
for index, row in events.iterrows():
    tag_list.append([d['id'] for d in row['tags']])

# overwrite the tags column with list
events['tags'] = tag_list

# set index as matchId and id
#events.set_index(['matchId', 'matchPeriod'], inplace=True)

# sort by matchId
#events.sort_index(inplace=True)

# add column showing the time passed between each event
events['interval'] = events.groupby(['matchId', 'matchPeriod'])['eventSec'].diff().fillna(events['eventSec'])



In [266]:
# convert all event tags into features with either 0 or 1 values
events['goal'] = [1 if 101 in x else 0 for x in events['tags']]
events['ownGoal'] = [1 if 102 in x else 0 for x in events['tags']]
events['opportunity'] = [1 if 201 in x else 0 for x in events['tags']]

events['assist'] = [1 if 301 in x else 0 for x in events.tags]
events['keyPass'] = [1 if 302 in x else 0 for x in events['tags']]

events['leftFoot'] = [1 if 401 in x else 0 for x in events['tags']]
events['rightFoot'] = [1 if 402 in x else 0 for x in events['tags']]
events['head_body'] = [1 if 403 in x else 0 for x in events['tags']]

events['leftSpace'] = [1 if 501 in x else 0 for x in events['tags']]
events['rightSpace'] = [1 if 502 in x else 0 for x in events['tags']]
# a take is when a player is trying to get past another either by dribbling or passing the ball
# 1 is left, 2 is right, 0 is n/a
events['leftTake'] = [1 if 503 in x else 0 for x in events['tags']]
events['rightTake'] = [1 if 504 in x else 0 for x in events['tags']]

events['anticipated'] = [1 if 601 in x else 0 for x in events['tags']]
events['anticipation'] = [1 if 602 in x else 0 for x in events['tags']]

events['lostDuel'] = [1 if 701 in x else 0 for x in events['tags']]
events['wonDuel'] = [1 if 703 in x else 0 for x in events['tags']]
events['neutralDuel'] = [1 if 702 in x else 0 for x in events['tags']]

events['high'] = [1 if 801 in x else 0 for x in events['tags']]
events['low'] = [1 if 802 in x else 0 for x in events['tags']]

events['through'] = [1 if 901 in x else 0 for x in events['tags']]
events['fairplay'] = [1 if 1001 in x else 0 for x in events['tags']]

events['direct'] = [1 if 1101 in x else 0 for x in events['tags']]
events['indirect'] = [1 if 1102 in x else 0 for x in events['tags']]

events['goalLowCenter'] = [1 if 1201 in x else 0 for x in events['tags']]
events['goalLowRight'] = [1 if 1202 in x else 0 for x in events['tags']]
events['goalCenter'] = [1 if 1203 in x else 0 for x in events['tags']]
events['goalCenterLeft'] = [1 if 1204 in x else 0 for x in events['tags']]
events['goalLowLeft'] = [1 if 1205 in x else 0 for x in events['tags']]
events['goalCenterRight'] = [1 if 1206 in x else 0 for x in events['tags']]
events['goalHighCenter'] = [1 if 1207 in x else 0 for x in events['tags']]
events['goalHighLeft'] = [1 if 1208 in x else 0 for x in events['tags']]
events['goalHighRight'] = [1 if 1209 in x else 0 for x in events['tags']]

events['outLowRight'] = [1 if 1210 in x else 0 for x in events['tags']]
events['outCenterLeft'] = [1 if 1211 in x else 0 for x in events['tags']]
events['outLowLeft'] = [1 if 1212 in x else 0 for x in events['tags']]
events['outCenterRight'] = [1 if 1213 in x else 0 for x in events['tags']]
events['outHighCenter'] = [1 if 1214 in x else 0 for x in events['tags']]
events['outHighLeft'] = [1 if 1215 in x else 0 for x in events['tags']]
events['outHighRight'] = [1 if 1216 in x else 0 for x in events['tags']]

events['postLowRight'] = [1 if 1217 in x else 0 for x in events['tags']]
events['postCenterLeft'] = [1 if 1218 in x else 0 for x in events['tags']]
events['postLowLeft'] = [1 if 1219 in x else 0 for x in events['tags']]
events['postCenterRight'] = [1 if 1220 in x else 0 for x in events['tags']]
events['postHighCenter'] = [1 if 1221 in x else 0 for x in events['tags']]
events['postHighLeft'] = [1 if 1222 in x else 0 for x in events['tags']]
events['postHighRight'] = [1 if 1223 in x else 0 for x in events['tags']]

events['feint'] = [1 if 1301 in x else 0 for x in events['tags']]
events['missedBall'] = [1 if 1302 in x else 0 for x in events['tags']]

events['interception'] = [1 if 1401 in x else 0 for x in events['tags']]
events['clearance'] = [1 if 1501 in x else 0 for x in events['tags']]
events['slidingTackle'] = [1 if 1601 in x else 0 for x in events['tags']]

events['redCard'] = [1 if 1701 in x else 0 for x in events['tags']]
events['yellowCard'] = [1 if 1702 in x else 0 for x in events['tags']]
events['secondYellowCard'] = [1 if 1703 in x else 0 for x in events['tags']]

events['accurate'] = [1 if 1801 in x else 0 for x in events['tags']]
events['inaccurate'] = [1 if 1802 in x else 0 for x in events['tags']]

events['counterAttack'] = [1 if 1901 in x else 0 for x in events['tags']]
events['dangerousBallLost'] = [1 if 2001 in x else 0 for x in events['tags']]
events['blocked'] = [1 if 2101 in x else 0 for x in events['tags']]








In [None]:
# export cleaned dataframes to csv
premier.to_csv('data/premier_league_teams.csv', index=True, header=True)
# export cleaned dataframe to csv, use utf-8 encoding to match pandas default
uk_players.to_csv('data/premier_league_players.csv', index=True, header=True, encoding='utf-8')
uk_ranks.to_csv('data/premier_league_ranks.csv', index=True, header=True)
melted_matches.to_csv('data/premier_league_matches.csv', index=True, header=True)
events.to_csv('data/premier_league_events.csv', index=True, header=True)
