# Cleaning Fixtures

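This notebook takes the "fixtures.json" file and cleans it up for efficient use: unneeded columns are dropped, kickoff times are converted to datetimes, and fixture and team ids are shifted so that they begin at 0.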

#### Import packages

In [1]:
# Import packages
import pandas as pd
import numpy as np
import json

# Notebook settings: show every column when a DataFrame is displayed (no truncation)
pd.set_option('display.max_columns', None)

#### Read json file and convert to DataFrame

In [2]:
# Read the raw fixtures JSON. The encoding is passed explicitly because JSON is
# normally UTF-8 encoded, whereas open() would otherwise fall back to a
# platform-dependent default encoding.
with open('../../data/json/fixtures/fixtures.json', 'r', encoding='utf-8') as f:
    fixtures_json = json.load(f)

# Convert the parsed JSON to a DataFrame
fixtures = pd.DataFrame(fixtures_json)

# Preview the first rows of the raw frame
fixtures.head()

Unnamed: 0,code,deadline_time,deadline_time_formatted,event,event_day,finished,finished_provisional,id,kickoff_time,kickoff_time_formatted,minutes,provisional_start_time,started,stats,team_a,team_a_difficulty,team_a_score,team_h,team_h_difficulty,team_h_score
0,987597,2018-08-10T18:00:00Z,10 Aug 19:00,1,1,True,True,6,2018-08-10T19:00:00Z,10 Aug 20:00,90,False,True,"[{'goals_scored': {'a': [{'value': 1, 'element...",11,4,1,14,3,2
1,987598,2018-08-10T18:00:00Z,10 Aug 19:00,1,2,True,True,7,2018-08-11T11:30:00Z,11 Aug 12:30,90,False,True,"[{'goals_scored': {'a': [{'value': 1, 'element...",17,3,2,15,4,1
2,987592,2018-08-10T18:00:00Z,10 Aug 19:00,1,2,True,True,2,2018-08-11T14:00:00Z,11 Aug 15:00,90,False,True,"[{'goals_scored': {'a': [], 'h': [{'value': 1,...",5,3,0,2,2,2
3,987594,2018-08-10T18:00:00Z,10 Aug 19:00,1,2,True,True,3,2018-08-11T14:00:00Z,11 Aug 15:00,90,False,True,"[{'goals_scored': {'a': [{'value': 1, 'element...",7,2,2,9,2,0
4,987595,2018-08-10T18:00:00Z,10 Aug 19:00,1,2,True,True,4,2018-08-11T14:00:00Z,11 Aug 15:00,90,False,True,"[{'goals_scored': {'a': [{'value': 1, 'element...",6,2,3,10,4,0


In [3]:
# Sanity check: print the (rows, columns) shape of the loaded frame
print(fixtures.shape)

(380, 20)


#### Clean up DataFrame
- Remove columns that are not needed
- Convert kickoff time to datetime
- Sort by fixture id, reset the row index, and shift fixture and team ids to start from 0

In [4]:
# Keep only the columns that are needed. An explicit copy is taken so that the
# column assignments below modify an independent frame instead of a slice of
# the raw one (which would raise pandas' SettingWithCopyWarning).
keep_columns = ['id', 'event', 'kickoff_time', 'team_h', 'team_a', 'team_h_score', 'team_a_score', 'team_h_difficulty', 'team_a_difficulty']
fixtures = fixtures[keep_columns].copy()

# Convert the kickoff time strings to timezone-aware (UTC) datetimes
fixtures['kickoff_time'] = pd.to_datetime(fixtures['kickoff_time'], utc=True)

# Sort rows by fixture id and renumber the row index from 0
fixtures = fixtures.sort_values(by=['id']).reset_index(drop=True)

# Shift team ids and the fixture id down by one so they begin counting from 0.
# NOTE: this cell rebinds `fixtures` and subtracts 1 each time it runs, so
# re-running it without re-running the load cell shifts the ids again.
id_columns = ['team_h', 'team_a', 'id']
fixtures[id_columns] = fixtures[id_columns] - 1

# View new DataFrame
fixtures.head()

Unnamed: 0,id,event,kickoff_time,team_h,team_a,team_h_score,team_a_score,team_h_difficulty,team_a_difficulty
0,0,1,2018-08-12 15:00:00+00:00,0,12,0,2,4,4
1,1,1,2018-08-11 14:00:00+00:00,1,4,2,0,2,3
2,2,1,2018-08-11 14:00:00+00:00,8,6,0,2,2,2
3,3,1,2018-08-11 14:00:00+00:00,9,5,0,3,4,2
4,4,1,2018-08-12 12:30:00+00:00,11,18,4,0,3,5


#### Save as csv

In [5]:
# Write the cleaned fixtures to disk without the row index, then report the
# (rows, columns) shape of what was saved.
csv_path = r'../../data/csv/fixtures.csv'
fixtures.to_csv(csv_path, index=False, index_label=False)
print(fixtures.shape)

(380, 9)


In [6]:
# Export a transposed preview (the first four fixtures plus the last one) as an
# HTML table.
# NOTE(review): a later cell writes to the same path and overwrites this file.
head_and_last = [*range(0, 4), len(fixtures) - 1]
fixtures.iloc[head_and_last].T.to_html('../../data/html/fixtures.html', bold_rows=False)

In [8]:
# Copy of the cleaned frame used for the HTML preview. While the line below
# stays disabled, its contents are identical to `fixtures`.
fixtures_nokot = fixtures.copy()
# Disabled: would replace every kickoff time with '...' in the preview
#fixtures_nokot['kickoff_time'] = pd.Series(['...']*len(fixtures_nokot))

# Export a transposed preview (the first five fixtures plus the last one) as an
# HTML table.
preview_rows = [*range(0, 5), len(fixtures_nokot) - 1]
fixtures_nokot.iloc[preview_rows].T.to_html('../../data/html/fixtures.html', bold_rows=False)