### Cleaning fixtures data

In [6]:
# Import packages
import pandas as pd
import numpy as np
import json

# Display configuration: lift the column cap so wide frames render in full
pd.options.display.max_columns = None

In [7]:
# Read the raw fixtures JSON.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the read does
# not depend on the platform's locale default.
with open('../data/json/fixtures/fixtures.json', 'r', encoding='utf-8') as f:
    fixtures_json = json.load(f)

In [16]:
# Build a DataFrame from the parsed JSON content
fixtures = pd.DataFrame(data=fixtures_json)

In [22]:
# Remove bloating columns.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
keep_columns = ['event', 'event_day', 'id', 'kickoff_time', 'team_h', 'team_a', 'team_h_score', 'team_a_score']
fixtures = fixtures[keep_columns].copy()

# Convert kickoff time string to a timezone-aware (UTC) datetime
fixtures['kickoff_time'] = pd.to_datetime(fixtures['kickoff_time'], utc=True)

# Sort rows by fixture id and reset indexing
fixtures = fixtures.sort_values(by=['id']).reset_index(drop=True)

# Add home and away points gained: 3 for a win, 1 for a draw, 0 for a loss.
# A missing score (NaN) makes all three comparisons False, so such rows take the
# np.nan default and end up as <NA> instead of being counted as a draw.
n_fixtures = len(fixtures)
home_goals = fixtures['team_h_score']
away_goals = fixtures['team_a_score']
outcomes = [home_goals > away_goals, home_goals == away_goals, home_goals < away_goals]

# Nullable Int64 keeps the points as integers while still allowing missing values
team_h_points = pd.Series(
    np.select(outcomes, [3, 1, 0], default=np.nan), index=fixtures.index
).astype('Int64')
team_a_points = pd.Series(
    np.select(outcomes, [0, 1, 3], default=np.nan), index=fixtures.index
).astype('Int64')
fixtures['team_h_points'] = team_h_points
fixtures['team_a_points'] = team_a_points

fixtures.head(20)

Unnamed: 0,event,event_day,id,kickoff_time,team_h,team_a,team_h_score,team_a_score,team_h_points,team_a_points
0,1,3,1,2018-08-12 15:00:00+00:00,1,13,0,2,0,3
1,1,2,2,2018-08-11 14:00:00+00:00,2,5,2,0,3,0
2,1,2,3,2018-08-11 14:00:00+00:00,9,7,0,2,0,3
3,1,2,4,2018-08-11 14:00:00+00:00,10,6,0,3,0,3
4,1,3,5,2018-08-12 12:30:00+00:00,12,19,4,0,3,0
5,1,1,6,2018-08-10 19:00:00+00:00,14,11,2,1,3,0
6,1,2,7,2018-08-11 11:30:00+00:00,15,17,1,2,0,3
7,1,3,8,2018-08-12 12:30:00+00:00,16,4,0,0,1,1
8,1,2,9,2018-08-11 14:00:00+00:00,18,3,2,0,3,0
9,1,2,10,2018-08-11 16:30:00+00:00,20,8,2,2,1,1
