table of contents
1. Prepare
2. Data Import, Encoding
3. EDA
4. write df

# Prepare

## Import Library

In [1]:
import os, random, time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Handle to matplotlib's built-in 'tab10' colormap.
# NOTE(review): no cell visible in this notebook reads `cmap` — confirm it is still needed.
cmap = plt.get_cmap('tab10')

## Change matplotlib settings

In [2]:
import matplotlib as mpl

mode = 'go_ando'

# rcParams that always share one foreground colour, and those that share the background colour.
foreground_keys = ('xtick.color', 'ytick.color', 'axes.labelcolor',
                   'grid.color', 'axes.edgecolor', 'text.color')
background_keys = ('axes.facecolor', 'figure.facecolor')

# Collect the overrides for the selected mode; an unrecognised mode leaves this empty,
# so matplotlib's current settings stay untouched.
theme = {}
if mode == 'go_ando':
    # go ando mode: dark-grey ink on a white background, grid switched on
    theme.update(dict.fromkeys(foreground_keys, 'darkgrey'))
    theme.update(dict.fromkeys(background_keys, 'white'))
    theme['axes.grid'] = True
elif mode == 'twitter':
    # twitter mode: white ink on a dark background; 'axes.grid' is not modified here
    twitter_color = '#141d26'
    theme.update(dict.fromkeys(foreground_keys, 'white'))
    theme.update(dict.fromkeys(background_keys, twitter_color))

if theme:
    # Both modes hide the top/right spines and restrict the grid to the y axis.
    theme.update({
        'axes.spines.top': False,
        'axes.spines.right': False,
        'axes.grid.axis': 'y',
    })
    mpl.rcParams.update(theme)

## Define parameters

In [3]:
# Directory layout, relative to the working directory: ../../data/{raw,processed}
base_dir = os.path.join(os.pardir, os.pardir)
data_dir = os.path.join(base_dir, 'data')
raw_dir, processed_dir = (os.path.join(data_dir, leaf) for leaf in ('raw', 'processed'))

# Show every entry of the raw directory next to its position in infile_list.
infile_list = os.listdir(raw_dir)
for i, infile in enumerate(infile_list):
    print(f'{i} {infile}')

0 referees.json
1 teams.json
2 matches
3 competitions.json
4 players.json
5 coaches.json


# Data Import, Encoding

## Data Import

In [4]:
# Load the coaches file by name rather than by position in infile_list:
# os.listdir() returns entries in arbitrary order, so a fixed index is not
# guaranteed to point at coaches.json on another machine or filesystem.
df = pd.read_json(os.path.join(raw_dir, 'coaches.json'))

## Check data

In [5]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,wyId,shortName,firstName,middleName,lastName,birthDate,birthArea,passportArea,currentTeamId
0,275580,Cesar Domingo,César Domingo,,Mendiondo López,,"{'id': 76, 'alpha2code': 'BR', 'alpha3code': '...","{'id': 76, 'alpha2code': 'BR', 'alpha3code': '...",0
1,14710,J. Heynckes,Josef,,Heynckes,1945-05-09,"{'id': 276, 'alpha2code': 'DE', 'alpha3code': ...","{'id': 276, 'alpha2code': 'DE', 'alpha3code': ...",0
2,135480,G. De Biasi,Giovanni,,De Biasi,1956-06-16,"{'id': 380, 'alpha2code': 'IT', 'alpha3code': ...","{'id': 380, 'alpha2code': 'IT', 'alpha3code': ...",0
3,210074,Marcelino,Marcelino,,García Toral,1965-08-14,"{'id': 724, 'alpha2code': 'ES', 'alpha3code': ...","{'id': 724, 'alpha2code': 'ES', 'alpha3code': ...",674
4,293398,T. Korkut,Tayfun,,Korkut,1974-04-02,"{'id': 276, 'alpha2code': 'DE', 'alpha3code': ...","{'id': 792, 'alpha2code': 'TR', 'alpha3code': ...",0


In [6]:
# Show each column's dtype as a one-column frame.
df.dtypes.to_frame()

Unnamed: 0,0
wyId,int64
shortName,object
firstName,object
middleName,object
lastName,object
birthDate,object
birthArea,object
passportArea,object
currentTeamId,int64


## Encoding

In [7]:
def _area_name(area):
    """Return the 'name' entry of an area record; pass any non-dict value through unchanged."""
    return area['name'] if isinstance(area, dict) else area


# Replace the nested area records with just their 'name'.
# Non-dict values are left as they are, so missing entries do not raise and
# re-running this cell (when the columns already hold plain names) is harmless.
df['birthArea'] = df['birthArea'].map(_area_name)
df['passportArea'] = df['passportArea'].map(_area_name)

In [8]:
# Preview again now that birthArea / passportArea hold the extracted 'name' values.
df.head()

Unnamed: 0,wyId,shortName,firstName,middleName,lastName,birthDate,birthArea,passportArea,currentTeamId
0,275580,Cesar Domingo,César Domingo,,Mendiondo López,,Brazil,Brazil,0
1,14710,J. Heynckes,Josef,,Heynckes,1945-05-09,Germany,Germany,0
2,135480,G. De Biasi,Giovanni,,De Biasi,1956-06-16,Italy,Italy,0
3,210074,Marcelino,Marcelino,,García Toral,1965-08-14,Spain,Spain,674
4,293398,T. Korkut,Tayfun,,Korkut,1974-04-02,Germany,Turkey,0


# EDA

In [16]:
# Distinct short names versus total rows; a gap between the two means
# at least one short name appears on more than one row.
n_coaches = df['shortName'].unique().size
n_rows = df.shape[0]
print(f'Number of coaches = {n_coaches}')
print(f'Number of rows = {n_rows}')

Number of coaches = 205
Number of rows = 208


In [12]:
# Row count per (firstName, lastName) pair; the resulting Series is indexed by that pair.
summary = df.groupby(['firstName','lastName']).size()
# NOTE(review): a commented-out filter used to sit here that matched wyId against this
# name-pair index; it could not have selected rows by id, so it has been dropped.

In [15]:
# Name pairs that occur on more than one row.
# `> 1` instead of `== 2` so a pair repeated three or more times is reported as well.
summary[summary > 1]

firstName  lastName
Antonio    Conte       2
Fabio      Pecchia     2
dtype: int64

# write df

In [11]:
# Name the output file explicitly instead of deriving it from infile_list[5]:
# os.listdir() order is arbitrary, so the positional lookup could yield another file's name.
# Create the target directory first so the write does not fail on a fresh checkout.
os.makedirs(processed_dir, exist_ok=True)
df.to_csv(os.path.join(processed_dir, 'coaches.csv'), index=False)