# Cleaning Fixtures

This notebook loads the "fixtures.json" file and cleans it up for efficient use. The fixture ids and team ids are shifted so that they begin at 0.

#### Import packages

In [1]:
# Import packages
import pandas as pd
import numpy as np
import json

# Notebook settings: remove the cap on how many columns are displayed
pd.options.display.max_columns = None

#### Read json file and convert to DataFrame

In [2]:
# Read the raw fixtures JSON. The encoding is passed explicitly because
# open() otherwise falls back to the platform default, which is not UTF-8
# everywhere (JSON files are normally UTF-8 encoded).
with open('../../data/json/fixtures/fixtures.json', 'r', encoding='utf-8') as f:
    fixtures_json = json.load(f)

# Convert the parsed JSON into a DataFrame
fixtures = pd.DataFrame(fixtures_json)

#### Clean up DataFrame
- Remove columns that are not needed
- Convert kickoff time to a timezone-aware (UTC) datetime
- Sort by fixture id and make fixture and team ids start from 0

In [3]:
# Columns worth keeping; everything else is dropped
keep_columns = ['id', 'kickoff_time', 'team_h', 'team_a', 'team_h_score', 'team_a_score']

# Build the cleaned frame in one pipeline:
#   1. keep only the selected columns
#   2. parse the kickoff time strings into UTC datetimes
#   3. order the rows by fixture id and renumber the row index from 0
#   4. shift the team ids and the fixture id down by one so they count from 0
fixtures = (
    fixtures.loc[:, keep_columns]
    .assign(kickoff_time=lambda df: pd.to_datetime(df['kickoff_time'], utc=True))
    .sort_values(by='id')
    .reset_index(drop=True)
    .assign(
        team_h=lambda df: df['team_h'] - 1,
        team_a=lambda df: df['team_a'] - 1,
        id=lambda df: df['id'] - 1,
    )
)

# View new DataFrame
fixtures.head()

Unnamed: 0,id,kickoff_time,team_h,team_a,team_h_score,team_a_score
0,0,2018-08-12 15:00:00+00:00,0,12,0,2
1,1,2018-08-11 14:00:00+00:00,1,4,2,0
2,2,2018-08-11 14:00:00+00:00,8,6,0,2
3,3,2018-08-11 14:00:00+00:00,9,5,0,3
4,4,2018-08-12 12:30:00+00:00,11,18,4,0


#### Save as csv

In [4]:
# Save the cleaned fixtures as csv. The row index is not written
# (index=False), so no index_label is needed.
fixtures.to_csv('../../data/csv/fixtures.csv', index=False)