### Cleaning fixtures data

In [1]:
# Import packages
import pandas as pd
import numpy as np
import json

# Display configuration: never truncate the column list when a dataframe is shown
pd.options.display.max_columns = None

In [2]:
# Read fixtures json.
# The encoding is given explicitly because JSON is exchanged as UTF-8, while
# open() otherwise falls back to the platform's locale encoding and can
# mis-decode non-ASCII characters on some systems.
with open('../data/json/fixtures/fixtures.json', 'r', encoding='utf-8') as f:
    fixtures_json = json.load(f)

In [3]:
# Convert the parsed json into a dataframe
# NOTE(review): presumably fixtures_json is a list of per-fixture records, so
# each record becomes one row — confirm against the source file
fixtures = pd.DataFrame(fixtures_json)

In [4]:
# Remove bloating columns. The explicit .copy() makes the result an independent
# frame, so the column assignments below never write into a slice of the raw data.
keep_columns = ['id', 'event', 'kickoff_time', 'team_h', 'team_a', 'team_h_score', 'team_a_score']
fixtures = fixtures[keep_columns].copy()

# Convert kickoff time string to a timezone-aware (UTC) datetime
fixtures['kickoff_time'] = pd.to_datetime(fixtures['kickoff_time'], utc=True)

# Sort rows by fixture id and reset indexing
fixtures = fixtures.sort_values(by=['id']).reset_index(drop=True)

# Make sure team id begins counting from 0
fixtures['team_h'] = fixtures['team_h'] - 1
fixtures['team_a'] = fixtures['team_a'] - 1

# Add home and away points gained: 3 for a win, 1 for a draw, 0 for a loss.
# A fixture with a missing score has no result yet. Missing values compare as
# False against everything, so without the explicit `played` mask such a row
# would fall through to the draw case and hand both teams 1 point.
home_goals = pd.to_numeric(fixtures['team_h_score'])
away_goals = pd.to_numeric(fixtures['team_a_score'])
played = home_goals.notna() & away_goals.notna()
home_win = home_goals > away_goals
away_win = home_goals < away_goals

# Nullable Int64 keeps the points as integers while allowing <NA> for fixtures
# that have no score.
fixtures['team_h_points'] = (
    pd.Series(np.select([home_win, away_win], [3, 0], default=1), index=fixtures.index)
    .astype('Int64')
    .where(played)
)
fixtures['team_a_points'] = (
    pd.Series(np.select([away_win, home_win], [3, 0], default=1), index=fixtures.index)
    .astype('Int64')
    .where(played)
)

# The fixture id was only needed for ordering the rows
fixtures = fixtures.drop(columns=['id'])
fixtures.head(20)

Unnamed: 0,event,kickoff_time,team_h,team_a,team_h_score,team_a_score,team_h_points,team_a_points
0,1,2018-08-12 15:00:00+00:00,0,12,0,2,0,3
1,1,2018-08-11 14:00:00+00:00,1,4,2,0,3,0
2,1,2018-08-11 14:00:00+00:00,8,6,0,2,0,3
3,1,2018-08-11 14:00:00+00:00,9,5,0,3,0,3
4,1,2018-08-12 12:30:00+00:00,11,18,4,0,3,0
5,1,2018-08-10 19:00:00+00:00,13,10,2,1,3,0
6,1,2018-08-11 11:30:00+00:00,14,16,1,2,0,3
7,1,2018-08-12 12:30:00+00:00,15,3,0,0,1,1
8,1,2018-08-11 14:00:00+00:00,17,2,2,0,3,0
9,1,2018-08-11 16:30:00+00:00,19,7,2,2,1,1


In [5]:
# Write the cleaned fixtures to disk as csv.
# index_label=False leaves the index column without a header field.
fixtures.to_csv(
    path_or_buf=r'../data/csv/fixtures.csv',
    index_label=False,
)