In [151]:
# imports
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'

In [152]:
# gets data from API
def fetchData(url):
    try:
        r = requests.get(url)
        return r.json()
    except:
        return None

In [153]:
# calls fetchData function for chosen URL
general_info_json = fetchData('https://fantasy.premierleague.com/api/bootstrap-static/')
fixtures_json = fetchData('https://fantasy.premierleague.com/api/fixtures/')
#players_detailed_json = fetchData('https://fantasy.premierleague.com/api/element-summary/{' + element_id + '}/')
#gameweek_live_json = fetchData('https://fantasy.premierleague.com/api/event/{' + event_id + '}/live/')

In [154]:
# gets a list of keys of the json
general_info_json.keys()

dict_keys(['events', 'game_settings', 'phases', 'teams', 'total_players', 'elements', 'element_stats', 'element_types'])

In [155]:
events_df = pd.DataFrame(general_info_json['events'])

In [156]:
# shows how many values are null for each column
events_df.isnull().sum()

id                            0
name                          0
deadline_time                 0
average_entry_score           0
finished                      0
data_checked                  0
highest_scoring_entry        29
deadline_time_epoch           0
deadline_time_game_offset     0
highest_score                29
is_previous                   0
is_current                    0
is_next                       0
chip_plays                    0
most_selected                29
most_transferred_in          29
top_element                  29
top_element_info             29
transfers_made                0
most_captained               29
most_vice_captained          29
dtype: int64

In [157]:
# creates 2 dataframes with rows split based on whether the value in "finished" is True or False
data_checked = events_df.groupby(events_df.finished)
data_checked_df = data_checked.get_group(True)
data_not_checked_df = data_checked.get_group(False)

# https://www.delftstack.com/howto/python-pandas/split-pandas-dataframe/

In [158]:
data_checked_df.isnull().sum()

id                           0
name                         0
deadline_time                0
average_entry_score          0
finished                     0
data_checked                 0
highest_scoring_entry        0
deadline_time_epoch          0
deadline_time_game_offset    0
highest_score                0
is_previous                  0
is_current                   0
is_next                      0
chip_plays                   0
most_selected                0
most_transferred_in          0
top_element                  0
top_element_info             0
transfers_made               0
most_captained               0
most_vice_captained          0
dtype: int64

No missing values in data_checked_df (all missing values moved to data_not_checked_df)

In [159]:
data_not_checked_df.isnull().sum()

id                            0
name                          0
deadline_time                 0
average_entry_score           0
finished                      0
data_checked                  0
highest_scoring_entry        29
deadline_time_epoch           0
deadline_time_game_offset     0
highest_score                29
is_previous                   0
is_current                    0
is_next                       0
chip_plays                    0
most_selected                29
most_transferred_in          29
top_element                  29
top_element_info             29
transfers_made                0
most_captained               29
most_vice_captained          29
dtype: int64

In [160]:
events_general_info_df = events_df[['id', 'name', 'is_previous', 'is_current', 'is_next', 'finished', 'data_checked']]
events_general_info_df.isnull().sum()

id              0
name            0
is_previous     0
is_current      0
is_next         0
finished        0
data_checked    0
dtype: int64

In [161]:
events_deadline_df = events_df[['id', 'deadline_time', 'deadline_time_epoch', 'deadline_time_game_offset']]
events_general_info_df.isnull().sum()

id              0
name            0
is_previous     0
is_current      0
is_next         0
finished        0
data_checked    0
dtype: int64

In [162]:
events_user_scored_df = events_df[['id', 'average_entry_score', 'highest_scoring_entry', 'highest_score']]
events_user_scored_df.isnull().sum()

id                        0
average_entry_score       0
highest_scoring_entry    29
highest_score            29
dtype: int64

In [163]:
try:
    events_user_scored_df.dropna(inplace=True, axis=0)
except:
    pass
events_user_scored_df.isnull().sum()

id                       0
average_entry_score      0
highest_scoring_entry    0
highest_score            0
dtype: int64

In [164]:
events_user_management_df = events_df[['id', 'chip_plays', 'most_selected', 'most_transferred_in', 'top_element', 'top_element_info', 'transfers_made', 'most_captained', 'most_vice_captained']]
events_user_management_df.isnull().sum()

id                      0
chip_plays              0
most_selected          29
most_transferred_in    29
top_element            29
top_element_info       29
transfers_made          0
most_captained         29
most_vice_captained    29
dtype: int64

In [165]:
try:
    events_user_management_df.dropna(inplace=True, axis=0)
except:
    pass
events_user_management_df.isnull().sum()

id                     0
chip_plays             0
most_selected          0
most_transferred_in    0
top_element            0
top_element_info       0
transfers_made         0
most_captained         0
most_vice_captained    0
dtype: int64

In [166]:
game_settings_full_dict = general_info_json.get('game_settings')
game_settings_full_dict.keys()

dict_keys(['league_join_private_max', 'league_join_public_max', 'league_max_size_public_classic', 'league_max_size_public_h2h', 'league_max_size_private_h2h', 'league_max_ko_rounds_private_h2h', 'league_prefix_public', 'league_points_h2h_win', 'league_points_h2h_lose', 'league_points_h2h_draw', 'league_ko_first_instead_of_random', 'cup_start_event_id', 'cup_stop_event_id', 'cup_qualifying_method', 'cup_type', 'squad_squadplay', 'squad_squadsize', 'squad_team_limit', 'squad_total_spend', 'ui_currency_multiplier', 'ui_use_special_shirts', 'ui_special_shirt_exclusions', 'stats_form_days', 'sys_vice_captain_enabled', 'transfers_cap', 'transfers_sell_on_fee', 'league_h2h_tiebreak_stats', 'timezone'])

In [167]:
#def moveKey(dictionary, key, newDictionary):
#    if key in dictionary:
#        newDictionary[key] = dictionary.get(key)
#        del dictionary[key]

In [168]:
#game_settings_league_dict = {}
#for key in ['league_join_private_max', 'league_join_public_max', 'league_max_size_public_classic', 'league_max_size_public_h2h', 'league_max_size_private_h2h', 'league_max_size_private_h2h', 'league_max_ko_rounds_private_h2h', 'league_prefix_public', 'league_points_h2h_win', 'league_points_h2h_lose', 'league_points_h2h_draw', 'league_ko_first_instead_of_random', ]:
#    moveKey(game_settings_full_dict, key, game_settings_league_dict)

#game_settings_full_dict.keys(), game_settings_league_dict.keys()

In [169]:
phases_df = pd.DataFrame(general_info_json['phases'])
phases_df.isnull().sum()

id             0
name           0
start_event    0
stop_event     0
dtype: int64

In [170]:
teams_df = pd.DataFrame(general_info_json['teams'])
teams_df.isnull().sum()

code                      0
draw                      0
form                     20
id                        0
loss                      0
name                      0
played                    0
points                    0
position                  0
short_name                0
strength                  0
team_division            20
unavailable               0
win                       0
strength_overall_home     0
strength_overall_away     0
strength_attack_home      0
strength_attack_away      0
strength_defence_home     0
strength_defence_away     0
pulse_id                  0
dtype: int64

We are not dropping the 'form' column as that is something which will be useful in our models, despite the missing values (we will calculate our own values)

In [171]:
# Need to work out the best way to calculate form of a team.
def calculateTeamForm():
    return

In [172]:

try:
    teams_df.drop('id', inplace=True, axis=1)
    teams_df.drop('team_division', inplace=True, axis=1)
except:
    pass
teams_df.isnull().sum()

code                      0
draw                      0
form                     20
loss                      0
name                      0
played                    0
points                    0
position                  0
short_name                0
strength                  0
unavailable               0
win                       0
strength_overall_home     0
strength_overall_away     0
strength_attack_home      0
strength_attack_away      0
strength_defence_home     0
strength_defence_away     0
pulse_id                  0
dtype: int64

In [173]:
teams_results_df = teams_df[['code', 'position', 'played', 'win', 'draw', 'loss', 'points']]

In [174]:
teams_name_dict = teams_df[['code', 'name']].set_index('code')['name'].to_dict

In [175]:
teams_short_name_dict = teams_df[['code', 'short_name']].set_index('code')['short_name'].to_dict

In [176]:
teams_rating_df = teams_df[['code', 'strength', 'strength_attack_home', 'strength_defence_home', 'strength_overall_home', 'strength_attack_away', 'strength_defence_away', 'strength_overall_away']]

In [177]:
teams_unavailable_dict = teams_df[['code', 'unavailable']].set_index('code')['unavailable'].to_dict

In [178]:
teams_pulse_id_dict = teams_df[['code', 'pulse_id']].set_index('code')['pulse_id'].to_dict

In [179]:
total_players = general_info_json.get('total_players')

In [180]:
elements_df = pd.DataFrame(general_info_json['elements'])

In [181]:
elements_df.keys()

Index(['chance_of_playing_next_round', 'chance_of_playing_this_round', 'code',
       'cost_change_event', 'cost_change_event_fall', 'cost_change_start',
       'cost_change_start_fall', 'dreamteam_count', 'element_type', 'ep_next',
       'ep_this', 'event_points', 'first_name', 'form', 'id', 'in_dreamteam',
       'news', 'news_added', 'now_cost', 'photo', 'points_per_game',
       'second_name', 'selected_by_percent', 'special', 'squad_number',
       'status', 'team', 'team_code', 'total_points', 'transfers_in',
       'transfers_in_event', 'transfers_out', 'transfers_out_event',
       'value_form', 'value_season', 'web_name', 'minutes', 'goals_scored',
       'assists', 'clean_sheets', 'goals_conceded', 'own_goals',
       'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards',
       'saves', 'bonus', 'bps', 'influence', 'creativity', 'threat',
       'ict_index', 'influence_rank', 'influence_rank_type', 'creativity_rank',
       'creativity_rank_type', 'threat_rank'

In [182]:
player_id_df = elements_df[['id', 'code', 'first_name', 'second_name', 'web_name', 'photo']]
player_id_df.isnull().sum()

id             0
code           0
first_name     0
second_name    0
web_name       0
photo          0
dtype: int64

In [183]:
player_team_position_df = elements_df[['id', 'team_code', 'element_type']]
player_team_position_df.isnull().sum()

id              0
team_code       0
element_type    0
dtype: int64

In [184]:
player_cost_df = elements_df[['id', 'now_cost', 'cost_change_start', 'cost_change_start_fall', 'cost_change_event', 'cost_change_event_fall']]
player_cost_df.isnull().sum()

id                        0
now_cost                  0
cost_change_start         0
cost_change_start_fall    0
cost_change_event         0
cost_change_event_fall    0
dtype: int64

In [185]:
player_status_df = elements_df[['id', 'status']]
player_status_df.isnull().sum()

id        0
status    0
dtype: int64

In [186]:
player_points_df = elements_df[['id', 'event_points', 'total_points', 'bonus', 'bps']]
player_points_df.isnull().sum()

id              0
event_points    0
total_points    0
bonus           0
bps             0
dtype: int64

In [187]:
player_user_transfers_df = elements_df[['id', 'transfers_in', 'transfers_in_event', 'transfers_out', 'transfers_out_event', 'selected_by_percent']]
player_user_transfers_df.isnull().sum()

id                     0
transfers_in           0
transfers_in_event     0
transfers_out          0
transfers_out_event    0
selected_by_percent    0
dtype: int64

In [188]:
player_stats_df = elements_df[['id', 'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded', 'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards', 'saves']]
player_stats_df.isnull().sum()

id                  0
minutes             0
goals_scored        0
assists             0
clean_sheets        0
goals_conceded      0
own_goals           0
penalties_saved     0
penalties_missed    0
yellow_cards        0
red_cards           0
saves               0
dtype: int64

In [189]:
player_value_df = elements_df[['id', 'value_form', 'value_season', 'points_per_game']]
player_value_df.isnull().sum()

id                 0
value_form         0
value_season       0
points_per_game    0
dtype: int64

In [190]:
player_ep_ict_df = elements_df[['id', 'ep_next', 'ep_this', 'influence', 'influence_rank', 'creativity', 'creativity_rank', 'creativity_rank_type', 'threat', 'threat_rank', 'threat_rank_type', 'ict_index', 'ict_index_rank', 'ict_index_rank_type']]
player_ep_ict_df.isnull().sum()

id                      0
ep_next                 0
ep_this                 0
influence               0
influence_rank          0
creativity              0
creativity_rank         0
creativity_rank_type    0
threat                  0
threat_rank             0
threat_rank_type        0
ict_index               0
ict_index_rank          0
ict_index_rank_type     0
dtype: int64

In [191]:
player_set_piece_df = elements_df[['id', 'team_code', 'corners_and_indirect_freekicks_order', 'direct_freekicks_order', 'penalties_order']]
player_set_piece_df.isnull().sum(), player_set_piece_df.notnull().sum()

(id                                        0
 team_code                                 0
 corners_and_indirect_freekicks_order    524
 direct_freekicks_order                  547
 penalties_order                         552
 dtype: int64,
 id                                      618
 team_code                               618
 corners_and_indirect_freekicks_order     94
 direct_freekicks_order                   71
 penalties_order                          66
 dtype: int64)

Most of the data in this dataframe is missing, however, the data that we have stored is useful.

In [192]:
player_set_piece_df.corners_and_indirect_freekicks_order.unique(), player_set_piece_df.direct_freekicks_order.unique(), player_set_piece_df.penalties_order.unique()

(array([nan,  4.,  1.,  3.,  2.,  5.,  6.,  7.]),
 array([nan,  5.,  4.,  3.,  2.,  1.]),
 array([nan,  1.,  2.,  3.,  4.,  5.,  6.]))

From the unique values shown, we can see that there is a good amount of useful data. The missing data is due to the unlikelyhood of more than 5-7 players being missing from a game. To confirm that there is a decent set piece order stored for every team, we will create bar charts to show the spread of data.

In [193]:
player_set_piece_df.corners_and_indirect_freekicks_order.describe()

count    94.000000
mean      3.031915
std       1.596044
min       1.000000
25%       2.000000
50%       3.000000
75%       4.000000
max       7.000000
Name: corners_and_indirect_freekicks_order, dtype: float64

In [194]:
player_set_piece_df.direct_freekicks_order.describe()

count    71.000000
mean      2.521127
std       1.296800
min       1.000000
25%       1.000000
50%       2.000000
75%       3.500000
max       5.000000
Name: direct_freekicks_order, dtype: float64

In [195]:
player_set_piece_df.penalties_order.describe()

count    66.000000
mean      2.439394
std       1.254270
min       1.000000
25%       1.000000
50%       2.000000
75%       3.000000
max       6.000000
Name: penalties_order, dtype: float64

As we can see from these dataframe stats, every team has at least 1 player who they prefer to take a set piece over other players. Also, the mean values show that for most of the teams there is a decent sized order of preference. For that reason, we will keep the data we have, just change the way it is stored so that it is grouped by the team (soon).

In [196]:
try:
    player_set_piece_df.fillna(0, inplace=True)
except:
    pass
player_set_piece_df.isnull().sum()

id                                      0
team_code                               0
corners_and_indirect_freekicks_order    0
direct_freekicks_order                  0
penalties_order                         0
dtype: int64

In [197]:
news_df = elements_df[['id', 'news', 'news_added']]
news_df.isnull().sum()

id              0
news            0
news_added    285
dtype: int64

News is important, however, should only be used if recent and up to date as old news will skew the predictions. We will remove any data which does not have a timestamp.

In [198]:
try:
    news_df.dropna(inplace=True, axis=0)
except:
    pass
news_df.isnull().sum()

id            0
news          0
news_added    0
dtype: int64

In [199]:
player_dreamteam_df = elements_df[['id', 'dreamteam_count', 'in_dreamteam']]
player_dreamteam_df.isnull().sum()

id                 0
dreamteam_count    0
in_dreamteam       0
dtype: int64

In [200]:
chance_of_playing_df = elements_df[['id', 'status', 'chance_of_playing_this_round', 'chance_of_playing_next_round']]
chance_of_playing_df.isnull().sum()

id                                0
status                            0
chance_of_playing_this_round    288
chance_of_playing_next_round    285
dtype: int64

I suspect that the missing data in this dataframe is due to the status of players, we can check this by finding out the types of status and splitting the dataframe. 

In [201]:
chance_of_playing_df.status.unique()

array(['a', 'u', 'i', 'd', 'n', 's'], dtype=object)

In [202]:
status_grouped_df = chance_of_playing_df.groupby(chance_of_playing_df.status)
a_status_df = status_grouped_df.get_group('a') # available
u_status_df = status_grouped_df.get_group('u') # unavailable
i_status_df = status_grouped_df.get_group('i') # injured
d_status_df = status_grouped_df.get_group('d') # returning from injury soon
n_status_df = status_grouped_df.get_group('n') # other
s_status_df = status_grouped_df.get_group('s') # suspended

Now that the dataframes have been split, we can check which have the most missing data to see if one group is having a large affent on the amount, or whether it is random missing data.

In [203]:
a_status_df.isnull().sum(), u_status_df.isnull().sum(), i_status_df.isnull().sum(), d_status_df.isnull().sum(), n_status_df.isnull().sum(), s_status_df.isnull().sum()

(id                                0
 status                            0
 chance_of_playing_this_round    285
 chance_of_playing_next_round    285
 dtype: int64,
 id                              0
 status                          0
 chance_of_playing_this_round    0
 chance_of_playing_next_round    0
 dtype: int64,
 id                              0
 status                          0
 chance_of_playing_this_round    0
 chance_of_playing_next_round    0
 dtype: int64,
 id                              0
 status                          0
 chance_of_playing_this_round    2
 chance_of_playing_next_round    0
 dtype: int64,
 id                              0
 status                          0
 chance_of_playing_this_round    0
 chance_of_playing_next_round    0
 dtype: int64,
 id                              0
 status                          0
 chance_of_playing_this_round    1
 chance_of_playing_next_round    0
 dtype: int64)

This is the majority of the missing data, as suspected, the status plays a part in this. I assume it is difficult to gather predictions on whether or not an available player is going to play without further calculations.

For now, we can use this dataframe to store the chance of availability of players who are not 100% available (status != a)

In [204]:
# removing all rows of 100% available players.
chance_of_playing_df = chance_of_playing_df[chance_of_playing_df.status != 'a']
chance_of_playing_df.isnull().sum()

id                              0
status                          0
chance_of_playing_this_round    3
chance_of_playing_next_round    0
dtype: int64

As there is still a small amount of missing data which cannot be predicted accurately, it is easier to just presume the player will not be available, to avaid losing points in the event.

In [205]:
try:
    chance_of_playing_df.fillna(0, inplace=True)
except:
    pass
chance_of_playing_df.isnull().sum()

id                              0
status                          0
chance_of_playing_this_round    0
chance_of_playing_next_round    0
dtype: int64

In [206]:
element_stats_df = pd.DataFrame(general_info_json['element_stats'])
element_stats_df.isnull().sum()

label    0
name     0
dtype: int64

In [207]:
element_types_df = pd.DataFrame(general_info_json['element_types'])
element_types_df.isnull().sum()

id                      0
plural_name             0
plural_name_short       0
singular_name           0
singular_name_short     0
squad_select            0
squad_min_play          0
squad_max_play          0
ui_shirt_specific       0
sub_positions_locked    0
element_count           0
dtype: int64

In [208]:
# gets a list of keys of the json
fixtures_json[0].keys()

dict_keys(['code', 'event', 'finished', 'finished_provisional', 'id', 'kickoff_time', 'minutes', 'provisional_start_time', 'started', 'team_a', 'team_a_score', 'team_h', 'team_h_score', 'stats', 'team_h_difficulty', 'team_a_difficulty', 'pulse_id'])

In [209]:
fixtures_df = pd.DataFrame(fixtures_json)
fixtures_df.head()

Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty,pulse_id
0,2210271,1,True,True,1,2021-08-13T19:00:00Z,90,False,True,1,0.0,3,2.0,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",3,2,66342
1,2210276,1,True,True,6,2021-08-14T11:30:00Z,90,False,True,10,1.0,13,5.0,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,4,66347
2,2210272,1,True,True,2,2021-08-14T14:00:00Z,90,False,True,4,2.0,5,1.0,"[{'identifier': 'goals_scored', 'a': [{'value'...",3,2,66343
3,2210273,1,True,True,3,2021-08-14T14:00:00Z,90,False,True,7,0.0,6,3.0,"[{'identifier': 'goals_scored', 'a': [], 'h': ...",2,5,66344
4,2210274,1,True,True,4,2021-08-14T14:00:00Z,90,False,True,16,1.0,8,3.0,"[{'identifier': 'goals_scored', 'a': [{'value'...",2,4,66345


In [210]:
fixtures_event_code_df = fixtures_df[['id', 'code', 'event']]
fixtures_event_code_df.isnull().sum()

id       0
code     0
event    0
dtype: int64

In [211]:
fixtures_timings_df = fixtures_df[['id', 'kickoff_time', 'minutes', 'started', 'finished', 'finished_provisional']]
fixtures_timings_df.isnull().sum()

id                      0
kickoff_time            0
minutes                 0
started                 0
finished                0
finished_provisional    0
dtype: int64

In [212]:
fixtures_teams_df = fixtures_df[['id', 'team_h', 'team_a']]
fixtures_teams_df.isnull().sum()

id        0
team_h    0
team_a    0
dtype: int64

In [213]:
fixtures_scores_df = fixtures_df[['id', 'team_h_score', 'team_a_score']]
fixtures_scores_df.isnull().sum()

id                0
team_h_score    293
team_a_score    293
dtype: int64

In [214]:
try:
    fixtures_scores_df.dropna(inplace=True, axis=0)
except:
    pass
fixtures_scores_df.isnull().sum()

id              0
team_h_score    0
team_a_score    0
dtype: int64

In [218]:
fixtures_difficulty_df = fixtures_df[['id', 'team_h_difficulty', 'team_a_difficulty']]
fixtures_difficulty_df.isnull().sum()

id                   0
team_h_difficulty    0
team_a_difficulty    0
dtype: int64