# Clean Element-Summary

As a first step, I want to extract all __element-summary > history__ information into a single table, so that each row contains the stats for a specific player in a specific fixture.


#### Import Packages

In [1]:
import pandas as pd
import numpy as np
import time
import collections
import json

#### Read all element-summary files

Initialise loop parameters and run loop. This loop will open all element-summary files, extract the history fields and append to a master table.

In [2]:
# Number of players.
# Hardcoded as this is fixed and saves time counting files.
nPlayers = 624

# One record (dict) per player-fixture, collected in file order and then in
# the order the fixtures appear in each player's "history" list.
# A plain list replaces the counter-keyed dictionary: the position of a record
# in the list is exactly the running fixture count.
records = []

# Start timer
start_time = time.time()

# Loop through all players (files are named 1.json ... <nPlayers>.json)
for i in range(1, nPlayers + 1):

    # Open json file
    with open('../../data/json/element-summary/' + str(i) + '.json') as f:
        es = json.load(f)

    # Append every fixture of this player. A player whose history is empty
    # simply contributes no rows (no indexing into the list, so no IndexError).
    records.extend(es["history"])

# Build the master table once from all records; rows are labelled 0..N-1.
data = pd.DataFrame(records)
print("Time Taken: %s seconds" % (time.time() - start_time))

# View DataFrame
data.head()

Time Taken: 34.52191233634949 seconds


Unnamed: 0,id,kickoff_time,kickoff_time_formatted,team_h_score,team_a_score,was_home,round,total_points,value,transfers_balance,...,errors_leading_to_goal,errors_leading_to_goal_attempt,tackled,offside,target_missed,fouls,dribbles,element,fixture,opponent_team
0,1,2018-08-12T15:00:00Z,12 Aug 16:00,0,2,True,1,3,50,0,...,0,0,0,0,0,0,0,1,1,13
1,526,2018-08-18T16:30:00Z,18 Aug 17:30,3,2,False,2,3,50,188,...,0,0,0,0,0,0,0,1,14,6
2,1053,2018-08-25T14:00:00Z,25 Aug 15:00,3,1,True,3,3,50,8540,...,0,0,0,0,0,0,0,1,21,19
3,1584,2018-09-02T12:30:00Z,02 Sep 13:30,2,3,False,4,1,50,9582,...,0,2,0,0,0,0,0,1,33,5
4,2121,2018-09-15T14:00:00Z,15 Sep 15:00,1,2,False,5,2,50,-3297,...,0,0,0,0,0,0,0,1,46,15


Drop columns with no use. First, take a look at all the columns.

In [3]:
# Report the size of the master table as (rows, columns).
print(data.shape)

(21790, 54)


In [4]:
# List every column label so the available fields can be inspected.
print(data.columns)

Index(['id', 'kickoff_time', 'kickoff_time_formatted', 'team_h_score',
       'team_a_score', 'was_home', 'round', 'total_points', 'value',
       'transfers_balance', 'selected', 'transfers_in', 'transfers_out',
       'loaned_in', 'loaned_out', 'minutes', 'goals_scored', 'assists',
       'clean_sheets', 'goals_conceded', 'own_goals', 'penalties_saved',
       'penalties_missed', 'yellow_cards', 'red_cards', 'saves', 'bonus',
       'bps', 'influence', 'creativity', 'threat', 'ict_index', 'ea_index',
       'open_play_crosses', 'big_chances_created',
       'clearances_blocks_interceptions', 'recoveries', 'key_passes',
       'tackles', 'winning_goals', 'attempted_passes', 'completed_passes',
       'penalties_conceded', 'big_chances_missed', 'errors_leading_to_goal',
       'errors_leading_to_goal_attempt', 'tackled', 'offside', 'target_missed',
       'fouls', 'dribbles', 'element', 'fixture', 'opponent_team'],
      dtype='object')


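Now I copy the table and shift the `element`, `fixture` and `opponent_team` identifiers so that their numbering begins at 0. Note that no columns are actually dropped in this step: the table keeps all 54 columns.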

In [5]:
# Work on an independent copy so the raw `data` table stays untouched
data_clean = data.copy()

# Shift each identifier column down by one so its numbering starts at 0
for id_column in ('element', 'fixture', 'opponent_team'):
    data_clean[id_column] = data_clean[id_column] - 1

# Preview the re-indexed table
data_clean.head()

Unnamed: 0,id,kickoff_time,kickoff_time_formatted,team_h_score,team_a_score,was_home,round,total_points,value,transfers_balance,...,errors_leading_to_goal,errors_leading_to_goal_attempt,tackled,offside,target_missed,fouls,dribbles,element,fixture,opponent_team
0,1,2018-08-12T15:00:00Z,12 Aug 16:00,0,2,True,1,3,50,0,...,0,0,0,0,0,0,0,0,0,12
1,526,2018-08-18T16:30:00Z,18 Aug 17:30,3,2,False,2,3,50,188,...,0,0,0,0,0,0,0,0,13,5
2,1053,2018-08-25T14:00:00Z,25 Aug 15:00,3,1,True,3,3,50,8540,...,0,0,0,0,0,0,0,0,20,18
3,1584,2018-09-02T12:30:00Z,02 Sep 13:30,2,3,False,4,1,50,9582,...,0,2,0,0,0,0,0,0,32,4
4,2121,2018-09-15T14:00:00Z,15 Sep 15:00,1,2,False,5,2,50,-3297,...,0,0,0,0,0,0,0,0,45,14


Now that we have built our table and shifted the identifiers to start at 0 (no columns have been removed so far), we can sort by element (selection) and then by kickoff time.

In [6]:
# Order the rows by player (element) and, within each player, by kickoff time
sort_keys = ['element', 'kickoff_time']
data_clean = data_clean.sort_values(by=sort_keys)

# Throw away the pre-sort row labels and renumber the rows from 0
data_clean = data_clean.reset_index(drop=True)
data_clean.head()

Unnamed: 0,id,kickoff_time,kickoff_time_formatted,team_h_score,team_a_score,was_home,round,total_points,value,transfers_balance,...,errors_leading_to_goal,errors_leading_to_goal_attempt,tackled,offside,target_missed,fouls,dribbles,element,fixture,opponent_team
0,1,2018-08-12T15:00:00Z,12 Aug 16:00,0,2,True,1,3,50,0,...,0,0,0,0,0,0,0,0,0,12
1,526,2018-08-18T16:30:00Z,18 Aug 17:30,3,2,False,2,3,50,188,...,0,0,0,0,0,0,0,0,13,5
2,1053,2018-08-25T14:00:00Z,25 Aug 15:00,3,1,True,3,3,50,8540,...,0,0,0,0,0,0,0,0,20,18
3,1584,2018-09-02T12:30:00Z,02 Sep 13:30,2,3,False,4,1,50,9582,...,0,2,0,0,0,0,0,0,32,4
4,2121,2018-09-15T14:00:00Z,15 Sep 15:00,1,2,False,5,2,50,-3297,...,0,0,0,0,0,0,0,0,45,14


Save as .csv

In [7]:
# Write the cleaned, sorted table to CSV; index=False keeps the row index out of the file.
data_clean.to_csv(r'../../data/csv/element-summary_full.csv', index=False, index_label=False)

In [8]:
# Export the first four rows, flipped so that each column becomes a row, as an HTML table
data_clean.iloc[:4].T.to_html('../../data/html/element_summary.html')

In [9]:
# Size of the cleaned table as (rows, columns).
data_clean.shape

(21790, 54)

In [10]:
# NOTE(review): scratch arithmetic; the result is not used by any other cell shown here.
# What the numbers stand for is not stated in the notebook — TODO explain or remove.
2*38 + 35 + 32

143

In [11]:
# NOTE(review): scratch arithmetic; the result is not used by any other cell shown here.
# Same four numbers as the previous cell, averaged and multiplied back by 4 — TODO explain or remove.
4*((38+38+35+32)/4)

143.0

In [12]:
# Separate copy of the cleaned table used only for this HTML export
data_new = data_clean.copy()

# (An earlier draft replaced every kickoff_time value with '...' before the
# export; that step is disabled.)

# Keep the first five rows plus the very last row, flip rows and columns, and
# write the result as an HTML table without bold row labels
data_new.iloc[[0, 1, 2, 3, 4, len(data_new) - 1]].T.to_html('../../data/html/data_new.html', bold_rows=False)