# All Data to One Table

In this notebook, all the data identified as useful is combined into one table. The data loaded here has been cleaned up beforehand to remove unneeded columns, and the IDs have been re-indexed to begin at 0.

#### Import Packages

In [1]:
import pandas as pd
import numpy as np
import time as time

#### Read in clean data files

In [2]:
# Load the five cleaned CSV files, in the same order as the names on the left.
elements, fixtures, teams, element_summary, element_types = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/csv/elements.csv',
        '../../data/csv/fixtures.csv',
        '../../data/csv/teams.csv',
        '../../data/csv/element-summary.csv',
        '../../data/csv/element_types.csv',
    )
)

#### View data structures

In [3]:
# Preview the first five rows of `elements`.
elements.head(n=5)

Unnamed: 0,team,element_type,first_name,second_name,web_name
0,0,0,Petr,Cech,Cech
1,0,0,Bernd,Leno,Leno
2,0,1,Laurent,Koscielny,Koscielny
3,0,1,Hector,Bellerin,Bellerin
4,0,1,Nacho,Monreal,Monreal


In [4]:
# Preview the first five rows of `fixtures`.
fixtures.head(n=5)

Unnamed: 0,id,event,kickoff_time,team_h,team_a,team_h_score,team_a_score
0,0,1,2018-08-12 15:00:00+00:00,0,12,0,2
1,1,1,2018-08-11 14:00:00+00:00,1,4,2,0
2,2,1,2018-08-11 14:00:00+00:00,8,6,0,2
3,3,1,2018-08-11 14:00:00+00:00,9,5,0,3
4,4,1,2018-08-12 12:30:00+00:00,11,18,4,0


In [5]:
# Preview the first five rows of `teams`.
teams.head(n=5)

Unnamed: 0,name,short_name,strength,strength_attack_home,strength_attack_away,strength_defence_home,strength_defence_away,strength_overall_home,strength_overall_away
0,Arsenal,ARS,4,1240,1270,1310,1340,1260,1320
1,Bournemouth,BOU,3,1040,1100,1120,1130,1030,1130
2,Brighton,BHA,2,1040,1140,1010,1070,1030,1050
3,Burnley,BUR,3,990,1030,1000,1040,1070,1100
4,Cardiff,CAR,2,1030,1060,1020,1090,1030,1080


In [6]:
# Preview the first five rows of `element_summary`.
element_summary.head(n=5)

Unnamed: 0,element,round,kickoff_time,fixture,opponent_team,was_home,minutes,total_points
0,0,1,2018-08-12T15:00:00Z,0,12,True,90,3
1,0,2,2018-08-18T16:30:00Z,13,5,False,90,3
2,0,3,2018-08-25T14:00:00Z,20,18,True,90,3
3,0,4,2018-09-02T12:30:00Z,32,4,False,90,1
4,0,5,2018-09-15T14:00:00Z,45,14,False,90,2


#### Add the selection's team as a new column

Using element_summary as a starting point, append a new column with an identifier of the selection's team.

To do this, for every row in the element_summary table I look up the fixture, get the two teams involved, and use the "was_home" boolean to pick the home or away team.

In [7]:
# Number of rows in element_summary (kept because later cells read `nrows`)
nrows = len(element_summary)

# Fixture ID and home/away flag of every row, taken as plain arrays so the
# lookups below stay positional with respect to element_summary.
fixture_ids = element_summary['fixture'].to_numpy()
played_at_home = element_summary['was_home'].to_numpy(dtype=bool)

# Home and away team of each row's fixture, fetched for all rows at once
# instead of one scalar .loc call per row. The fixture ID is used as a row
# label of `fixtures`; an ID with no matching row raises a KeyError.
# NOTE(review): this relies on the row labels of `fixtures` matching its
# `id` column, as they do in the preview above - confirm for the full file.
home_team = fixtures.loc[fixture_ids, 'team_h'].to_numpy()
away_team = fixtures.loc[fixture_ids, 'team_a'].to_numpy()

# If the selection played at home, choose the home team.
# If the selection played away, choose the away team.
team = np.where(played_at_home, home_team, away_team).astype('int')

# Add the team variable to element_summary
element_summary['team'] = team
element_summary.head()

Unnamed: 0,element,round,kickoff_time,fixture,opponent_team,was_home,minutes,total_points,team
0,0,1,2018-08-12T15:00:00Z,0,12,True,90,3,0
1,0,2,2018-08-18T16:30:00Z,13,5,False,90,3,0
2,0,3,2018-08-25T14:00:00Z,20,18,True,90,3,0
3,0,4,2018-09-02T12:30:00Z,32,4,False,90,1,0
4,0,5,2018-09-15T14:00:00Z,45,14,False,90,2,0


#### Add selection's element-type to table

Using a method similar to the one above, look through the elements data structure and append the selection's element-type (position) and web_name to the table.

In [8]:
# Element ID of every row, taken as a plain array so the lookups below stay
# positional with respect to element_summary.
element_ids = element_summary['element'].to_numpy()

# Element type of each row's selection, fetched for all rows at once instead
# of one scalar .loc call per row. The element ID is used as a row label of
# `elements`; an ID with no matching row raises a KeyError.
element_type_col = elements.loc[element_ids, 'element_type'].to_numpy(dtype='int')

# List of player names (web_name), looked up the same way
element_name = elements.loc[element_ids, 'web_name'].tolist()

# Append both lookups to element_summary as new columns
element_summary['element_type'] = element_type_col
element_summary['element_name'] = element_name
element_summary.head()

Unnamed: 0,element,round,kickoff_time,fixture,opponent_team,was_home,minutes,total_points,team,element_type,element_name
0,0,1,2018-08-12T15:00:00Z,0,12,True,90,3,0,0,Cech
1,0,2,2018-08-18T16:30:00Z,13,5,False,90,3,0,0,Cech
2,0,3,2018-08-25T14:00:00Z,20,18,True,90,3,0,0,Cech
3,0,4,2018-09-02T12:30:00Z,32,4,False,90,1,0,0,Cech
4,0,5,2018-09-15T14:00:00Z,45,14,False,90,2,0,0,Cech


#### Rearrange columns for readability

In [9]:
# Columns kept in the final table, in display order.
keep_cols = [
    'element_name',
    'element',
    'element_type',
    'fixture',
    'kickoff_time',
    'was_home',
    'team',
    'opponent_team',
    'minutes',
    'total_points',
]

# Select those columns, in that order, into a new frame and preview it.
final_table = element_summary.loc[:, keep_cols]
final_table.head(n=10)

Unnamed: 0,element_name,element,element_type,fixture,kickoff_time,was_home,team,opponent_team,minutes,total_points
0,Cech,0,0,0,2018-08-12T15:00:00Z,True,0,12,90,3
1,Cech,0,0,13,2018-08-18T16:30:00Z,False,0,5,90,3
2,Cech,0,0,20,2018-08-25T14:00:00Z,True,0,18,90,3
3,Cech,0,0,32,2018-09-02T12:30:00Z,False,0,4,90,1
4,Cech,0,0,45,2018-09-15T14:00:00Z,False,0,14,90,2
5,Cech,0,0,50,2018-09-23T15:00:00Z,True,0,7,90,11
6,Cech,0,0,60,2018-09-29T14:00:00Z,True,0,17,45,1
7,Cech,0,0,73,2018-10-07T11:00:00Z,False,0,8,0,0
8,Cech,0,0,80,2018-10-22T19:00:00Z,True,0,10,0,0
9,Cech,0,0,92,2018-10-28T13:30:00Z,False,0,6,0,0


Save the final table to CSV

In [10]:
# Write the final table to disk without the row index.
final_table.to_csv(
    r'../../data/csv/data01.csv',
    index_label=False,
    index=False,
)

In [11]:
# Show the row count stored in `nrows`.
print(nrows)

21790


In [12]:
# (rows, columns) of element_summary, including the columns added above.
element_summary.shape

(21790, 11)

In [13]:
# (rows, columns) of the fixtures table.
fixtures.shape

(380, 7)