In [218]:
# imports
import requests
import pandas as pd
import numpy as np

In [219]:
# gets data from API
def fetchData(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Address of the JSON endpoint to query.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    object or None
        The parsed JSON payload, or ``None`` when the request fails, the
        server answers with an HTTP error status, or the body is not JSON.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # treat 4xx/5xx answers as failures instead of parsing an error body
        r.raise_for_status()
        return r.json()
    except (requests.exceptions.RequestException, ValueError):
        # RequestException: connection, timeout and HTTP status errors
        # ValueError: the body could not be decoded as JSON
        return None

In [220]:
# calls fetchData function for chosen URL
FPL_API_BASE = 'https://fantasy.premierleague.com/api/'

general_info_json = fetchData(FPL_API_BASE + 'bootstrap-static/')
fixtures_json = fetchData(FPL_API_BASE + 'fixtures/')

# fetchData returns None on failure; stop here with a clear message instead of
# letting a later cell fail on a None value
if general_info_json is None or fixtures_json is None:
    raise RuntimeError('Failed to download data from the FPL API; check the connection and re-run this cell.')

# TODO: per-player and per-gameweek endpoints (not fetched yet). The id is
# concatenated directly so that no literal '{' or '}' ends up in the URL.
#players_detailed_json = fetchData(FPL_API_BASE + 'element-summary/' + str(element_id) + '/')
#gameweek_live_json = fetchData(FPL_API_BASE + 'event/' + str(event_id) + '/live/')

In [221]:
# lists the top-level keys of the bootstrap-static json (shown as a dict_keys view)
general_info_json.keys()

dict_keys(['events', 'game_settings', 'phases', 'teams', 'total_players', 'elements', 'element_stats', 'element_types'])

In [222]:
# builds a dataframe from the 'events' section of the bootstrap-static json
events_df = pd.DataFrame(data=general_info_json['events'])

In [223]:
# number of missing (null) values in each column of events_df
events_df.isna().sum()

id                            0
name                          0
deadline_time                 0
average_entry_score           0
finished                      0
data_checked                  0
highest_scoring_entry        30
deadline_time_epoch           0
deadline_time_game_offset     0
highest_score                30
is_previous                   0
is_current                    0
is_next                       0
chip_plays                    0
most_selected                30
most_transferred_in          30
top_element                  30
top_element_info             30
transfers_made                0
most_captained               30
most_vice_captained          30
dtype: int64

In [224]:
# creates 2 dataframes with rows split based on whether the value in "finished" is True or False
data_checked = events_df.groupby(events_df.finished)  # groupby view of the same split, kept for inspection
is_finished = events_df['finished'].astype(bool)
# boolean masks (unlike get_group) return an empty dataframe instead of raising
# a KeyError when every event falls on the same side of the split;
# .copy() detaches the results so later edits never touch events_df
data_checked_df = events_df[is_finished].copy()
data_not_checked_df = events_df[~is_finished].copy()

# https://www.delftstack.com/howto/python-pandas/split-pandas-dataframe/

In [225]:
# missing values per column for the finished events
data_checked_df.isna().sum()

id                           0
name                         0
deadline_time                0
average_entry_score          0
finished                     0
data_checked                 0
highest_scoring_entry        0
deadline_time_epoch          0
deadline_time_game_offset    0
highest_score                0
is_previous                  0
is_current                   0
is_next                      0
chip_plays                   0
most_selected                0
most_transferred_in          0
top_element                  0
top_element_info             0
transfers_made               0
most_captained               0
most_vice_captained          0
dtype: int64

No missing values in data_checked_df (all missing values moved to data_not_checked_df)

In [226]:
# missing values per column for the events that have not finished
data_not_checked_df.isna().sum()

id                            0
name                          0
deadline_time                 0
average_entry_score           0
finished                      0
data_checked                  0
highest_scoring_entry        30
deadline_time_epoch           0
deadline_time_game_offset     0
highest_score                30
is_previous                   0
is_current                    0
is_next                       0
chip_plays                    0
most_selected                30
most_transferred_in          30
top_element                  30
top_element_info             30
transfers_made                0
most_captained               30
most_vice_captained          30
dtype: int64

In [227]:
# removes all columns with missing data from data_not_checked_df
# (dropna returns a new dataframe, so nothing is mutated in place, no error
# has to be silenced, and re-running the cell gives the same result)
data_not_checked_df = data_not_checked_df.dropna(axis=1)
data_not_checked_df.isnull().sum()

id                           0
name                         0
deadline_time                0
average_entry_score          0
finished                     0
data_checked                 0
deadline_time_epoch          0
deadline_time_game_offset    0
is_previous                  0
is_current                   0
is_next                      0
chip_plays                   0
transfers_made               0
dtype: int64

This confirms that the columns removed from data_not_checked_df were exactly the ones with missing values: they hold data which can only exist once an event has finished and its data has been checked (as suspected). Having established this, we can refactor events_df into two new dataframes: one containing the general event data, and the other containing detailed data and stats for only those events whose data has been checked.

In [228]:
# general event data (available for checked and non-checked events alike)
general_events_info_df = events_df.loc[:, [
    'id',
    'name',
    'deadline_time',
    'is_previous',
    'is_current',
    'is_next',
    'finished',
    'data_checked',
]]

In [229]:
# missing values per column of the general event data
general_events_info_df.isna().sum()

id               0
name             0
deadline_time    0
is_previous      0
is_current       0
is_next          0
finished         0
data_checked     0
dtype: int64

No missing data, as these fields exist for every event regardless of whether it has finished and had its data checked.

In [230]:
# detailed data and stats, taken from the finished events only
general_events_stats_df = data_checked_df.loc[:, [
    'id',
    'average_entry_score',
    'deadline_time_epoch',
    'deadline_time_game_offset',
    'chip_plays',
    'transfers_made',
]]

In [231]:
# missing values per column of the event stats
general_events_stats_df.isna().sum()

id                           0
average_entry_score          0
deadline_time_epoch          0
deadline_time_game_offset    0
chip_plays                   0
transfers_made               0
dtype: int64

We now have the two dataframes as planned with no missing data.

In [232]:
# 'phases' section as a dataframe, followed by its missing-value counts
phases_df = pd.DataFrame(data=general_info_json['phases'])
phases_df.isna().sum()

id             0
name           0
start_event    0
stop_event     0
dtype: int64

In [233]:
# 'teams' section as a dataframe, followed by its missing-value counts
teams_df = pd.DataFrame(data=general_info_json['teams'])
teams_df.isna().sum()

code                      0
draw                      0
form                     20
id                        0
loss                      0
name                      0
played                    0
points                    0
position                  0
short_name                0
strength                  0
team_division            20
unavailable               0
win                       0
strength_overall_home     0
strength_overall_away     0
strength_attack_home      0
strength_attack_away      0
strength_defence_home     0
strength_defence_away     0
pulse_id                  0
dtype: int64

In [234]:
# Need to work out the best way to calculate form of a team.
def calculateTeamForm():
    """Placeholder for the team-form calculation.

    TODO: not implemented yet; currently takes no arguments and returns None.
    """
    return None

In [235]:

# drops the 'team_division' column; errors='ignore' keeps the cell safe to
# re-run once the column is already gone, without a blanket try/except
teams_df = teams_df.drop(columns='team_division', errors='ignore')
teams_df.isnull().sum()

code                      0
draw                      0
form                     20
id                        0
loss                      0
name                      0
played                    0
points                    0
position                  0
short_name                0
strength                  0
unavailable               0
win                       0
strength_overall_home     0
strength_overall_away     0
strength_attack_home      0
strength_attack_away      0
strength_defence_home     0
strength_defence_away     0
pulse_id                  0
dtype: int64

We are not dropping the 'form' column as that is something which will be useful in our models, despite the missing values (we will calculate our own values)

In [236]:
# value stored under 'total_players' (None if the key is absent)
general_info_json.get('total_players', None)

8429286

In [237]:
# 'elements' section as a dataframe
elements_df = pd.DataFrame(data=general_info_json['elements'])

In [238]:
# column labels of elements_df
elements_df.columns

Index(['chance_of_playing_next_round', 'chance_of_playing_this_round', 'code',
       'cost_change_event', 'cost_change_event_fall', 'cost_change_start',
       'cost_change_start_fall', 'dreamteam_count', 'element_type', 'ep_next',
       'ep_this', 'event_points', 'first_name', 'form', 'id', 'in_dreamteam',
       'news', 'news_added', 'now_cost', 'photo', 'points_per_game',
       'second_name', 'selected_by_percent', 'special', 'squad_number',
       'status', 'team', 'team_code', 'total_points', 'transfers_in',
       'transfers_in_event', 'transfers_out', 'transfers_out_event',
       'value_form', 'value_season', 'web_name', 'minutes', 'goals_scored',
       'assists', 'clean_sheets', 'goals_conceded', 'own_goals',
       'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards',
       'saves', 'bonus', 'bps', 'influence', 'creativity', 'threat',
       'ict_index', 'influence_rank', 'influence_rank_type', 'creativity_rank',
       'creativity_rank_type', 'threat_rank'

In [239]:
# availability-related columns only, followed by their missing-value counts
chance_of_playing_df = elements_df.loc[:, [
    'id',
    'status',
    'chance_of_playing_this_round',
    'chance_of_playing_next_round',
]]
chance_of_playing_df.isna().sum()

id                                0
status                            0
chance_of_playing_this_round    297
chance_of_playing_next_round    291
dtype: int64

I suspect that the missing data in this dataframe is due to the status of players; we can check this by finding out the types of status and splitting the dataframe.

In [240]:
# distinct status codes present in the data
chance_of_playing_df['status'].unique()

array(['a', 'u', 'i', 'd', 'n', 's'], dtype=object)

In [241]:
# one dataframe per status code; boolean masks return an empty dataframe
# (rather than the KeyError raised by get_group) when no row has that status,
# which also keeps this cell re-runnable after rows have been filtered out
status_grouped_df = chance_of_playing_df.groupby(chance_of_playing_df.status)
player_status = chance_of_playing_df['status']
a_status_df = chance_of_playing_df[player_status == 'a'] # available
u_status_df = chance_of_playing_df[player_status == 'u'] # unavailable
i_status_df = chance_of_playing_df[player_status == 'i'] # injured
d_status_df = chance_of_playing_df[player_status == 'd'] # returning from injury soon
n_status_df = chance_of_playing_df[player_status == 'n'] # other
s_status_df = chance_of_playing_df[player_status == 's'] # suspended

Now that the dataframes have been split, we can check which have the most missing data to see whether one group has a large effect on the amount, or whether the data is missing at random.

In [242]:
# missing values for players with status 'a'
a_status_df.isna().sum()

id                                0
status                            0
chance_of_playing_this_round    291
chance_of_playing_next_round    291
dtype: int64

This is the majority of the missing data; as suspected, the status plays a part in this. I assume it is difficult to predict whether or not an available player is going to play without further calculations.

In [243]:
# missing values for players with status 'u'
u_status_df.isna().sum()

id                              0
status                          0
chance_of_playing_this_round    0
chance_of_playing_next_round    0
dtype: int64

In [244]:
# missing values for players with status 'i'
i_status_df.isna().sum()

id                              0
status                          0
chance_of_playing_this_round    2
chance_of_playing_next_round    0
dtype: int64

In [245]:
# missing values for players with status 'd'
d_status_df.isna().sum()

id                              0
status                          0
chance_of_playing_this_round    4
chance_of_playing_next_round    0
dtype: int64

In [246]:
# missing values for players with status 'n'
n_status_df.isna().sum()

id                              0
status                          0
chance_of_playing_this_round    0
chance_of_playing_next_round    0
dtype: int64

In [247]:
# missing values for players with status 's'
s_status_df.isna().sum()

id                              0
status                          0
chance_of_playing_this_round    0
chance_of_playing_next_round    0
dtype: int64

For now, we can use this dataframe to store the chance of availability of players who are not 100% available (status != a)

In [248]:
# keeps only the rows whose status is not 'a' (i.e. drops the available players)
not_available = chance_of_playing_df['status'].ne('a')
chance_of_playing_df = chance_of_playing_df[not_available]
chance_of_playing_df.isna().sum()

id                              0
status                          0
chance_of_playing_this_round    6
chance_of_playing_next_round    0
dtype: int64

As there are still 6 pieces of missing data which cannot be predicted accurately, it is easier to just presume the player will not be available, to avoid losing points in the league.

In [249]:
# missing chances are replaced with 0 (player presumed unavailable);
# fillna returns a new dataframe, so there is no in-place edit of a filtered
# slice and no error that needs to be silenced
chance_of_playing_df = chance_of_playing_df.fillna(0)
chance_of_playing_df.isnull().sum()

id                              0
status                          0
chance_of_playing_this_round    0
chance_of_playing_next_round    0
dtype: int64

In [250]:
# 'element_stats' section as a dataframe, followed by its missing-value counts
element_stats_df = pd.DataFrame(data=general_info_json['element_stats'])
element_stats_df.isna().sum()

label    0
name     0
dtype: int64

In [251]:
# 'element_types' section as a dataframe, followed by its missing-value counts
element_types_df = pd.DataFrame(data=general_info_json['element_types'])
element_types_df.isna().sum()

id                      0
plural_name             0
plural_name_short       0
singular_name           0
singular_name_short     0
squad_select            0
squad_min_play          0
squad_max_play          0
ui_shirt_specific       0
sub_positions_locked    0
element_count           0
dtype: int64

In [252]:
# lists the keys of the first record in the fixtures json
# (indexing with [0] assumes the fixtures list is not empty)
fixtures_json[0].keys()

dict_keys(['code', 'event', 'finished', 'finished_provisional', 'id', 'kickoff_time', 'minutes', 'provisional_start_time', 'started', 'team_a', 'team_a_score', 'team_h', 'team_h_score', 'stats', 'team_h_difficulty', 'team_a_difficulty', 'pulse_id'])