# FPL-ML-004: Preparing Data

## Import Packages

In [1]:
import json
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

## Teams

### Read in dataset

In [2]:
# Load the raw teams JSON file into a DataFrame.
# The encoding is set explicitly so the read does not depend on the
# platform's default locale encoding (JSON interchange files are UTF-8).
with open('../../data/json/teams/teams.json', 'r', encoding='utf-8') as f:
    teams = pd.DataFrame(json.load(f))

teams.head()

Unnamed: 0,code,current_event_fixture,draw,form,id,link_url,loss,name,next_event_fixture,played,points,position,short_name,strength,strength_attack_away,strength_attack_home,strength_defence_away,strength_defence_home,strength_overall_away,strength_overall_home,team_division,unavailable,win
0,3,"[{'is_home': False, 'month': 5, 'event_day': 1...",0,,1,,0,Arsenal,[],0,0,0,ARS,4,1270,1240,1340,1310,1320,1260,1,False,0
1,91,"[{'is_home': False, 'month': 5, 'event_day': 1...",0,,2,,0,Bournemouth,[],0,0,0,BOU,3,1100,1040,1130,1120,1130,1030,1,False,0
2,36,"[{'is_home': True, 'month': 5, 'event_day': 1,...",0,,3,,0,Brighton,[],0,0,0,BHA,2,1140,1040,1070,1010,1050,1030,1,False,0
3,90,"[{'is_home': True, 'month': 5, 'event_day': 1,...",0,,4,,0,Burnley,[],0,0,0,BUR,3,1030,990,1040,1000,1100,1070,1,False,0
4,97,"[{'is_home': False, 'month': 5, 'event_day': 1...",0,,5,,0,Cardiff,[],0,0,0,CAR,2,1060,1030,1090,1020,1080,1030,1,False,0


### Save raw table to html for blog

In [3]:
# Number of leading rows to include in the blog preview tables.
ncols = 4

# Take the first `ncols` rows plus the last row, transpose the slice,
# and write the result out as an HTML table.
(
    teams
    .iloc[[*range(ncols), len(teams) - 1]]
    .T
    .to_html('../../data/html/teams_raw.html', bold_rows=False)
)

### Look at column names

In [4]:
# Let the notebook display the column index as the cell's result
# instead of routing it through print().
teams.columns

Index(['code', 'current_event_fixture', 'draw', 'form', 'id', 'link_url',
       'loss', 'name', 'next_event_fixture', 'played', 'points', 'position',
       'short_name', 'strength', 'strength_attack_away',
       'strength_attack_home', 'strength_defence_away',
       'strength_defence_home', 'strength_overall_away',
       'strength_overall_home', 'team_division', 'unavailable', 'win'],
      dtype='object')


### Summary of fields
__code:__ not useful. <br>
__current_event_fixture:__ nested structure of the team's most recent fixture. not useful. <br>
__draw:__ not useful. <br>
__form:__ not useful. <br>
__id:__ the team id code. This is needed and ideally should match the row index. <br>
__link_url:__ not useful. <br>
__loss:__ not useful. <br>
__name:__ team name, need to keep this. <br>
__next_event_fixture:__ nested structure of the team's next fixture. not useful. <br>
__played:__ not useful. <br>
__points:__ not useful. <br>
__position:__ not useful. <br>
__short_name:__ three-letter abbreviation for the team. we should keep this.<br>
__strength*:__ descriptions of team ability in different situations.<br>
__team_division:__ not useful. <br>
__unavailable:__ not useful. <br>
__win:__ not useful.

### Keep only useful fields

In [5]:
# Columns carried forward: identifiers and names first, then the
# strength ratings (home set, then away set).
keep_these_columns = [
    'id', 'name', 'short_name', 'strength',
    'strength_defence_home', 'strength_attack_home', 'strength_overall_home',
    'strength_defence_away', 'strength_attack_away', 'strength_overall_away',
]

# Select the wanted columns and take an explicit copy, so later edits to
# `teams_cleaned` never touch the raw `teams` frame.
teams_cleaned = teams.loc[:, keep_these_columns].copy()
teams_cleaned.head()

Unnamed: 0,id,name,short_name,strength,strength_defence_home,strength_attack_home,strength_overall_home,strength_defence_away,strength_attack_away,strength_overall_away
0,1,Arsenal,ARS,4,1310,1240,1260,1340,1270,1320
1,2,Bournemouth,BOU,3,1120,1040,1030,1130,1100,1130
2,3,Brighton,BHA,2,1010,1040,1030,1070,1140,1050
3,4,Burnley,BUR,3,1000,990,1070,1040,1030,1100
4,5,Cardiff,CAR,2,1020,1030,1030,1090,1060,1080


### Start team id index from 0

In [6]:
# Shift ids to be zero-based. The value is computed from the raw `teams`
# frame rather than from `teams_cleaned` itself, so re-running this cell
# cannot subtract 1 a second time.
teams_cleaned['id'] = teams['id'] - 1
teams_cleaned.head()

Unnamed: 0,id,name,short_name,strength,strength_defence_home,strength_attack_home,strength_overall_home,strength_defence_away,strength_attack_away,strength_overall_away
0,0,Arsenal,ARS,4,1310,1240,1260,1340,1270,1320
1,1,Bournemouth,BOU,3,1120,1040,1030,1130,1100,1130
2,2,Brighton,BHA,2,1010,1040,1030,1070,1140,1050
3,3,Burnley,BUR,3,1000,990,1070,1040,1030,1100
4,4,Cardiff,CAR,2,1020,1030,1030,1090,1060,1080


### Check if row index and id are equal

If the row index and id are equal, then in future we can go directly to the correct team by using the row index rather than searching for the matching id.

In [7]:
# Build a Series from the row index and compare it with the id column;
# the cell evaluates to True when the two match.
pd.Series(teams_cleaned.index).equals(teams_cleaned['id'])

True

### Save new table to html for blog

In [10]:
# Take the first `ncols` rows plus the last row of the cleaned table,
# transpose the slice, and write the result out as an HTML table.
(
    teams_cleaned
    .iloc[[*range(ncols), len(teams_cleaned) - 1]]
    .T
    .to_html('../../data/html/teams_clean.html', bold_rows=False)
)

### Save table to csv for future work

In [9]:
# Write the cleaned teams table to CSV, leaving the row index out of the file.
teams_cleaned.to_csv(
    '../../data/csv/teams.csv',
    index_label=False,
    index=False,
)

## Player Performance