In [1]:
import json
import os
import pprint

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Short dataset identifier; the output filenames are derived from it.
dataset_name = "nba"

In [3]:
# Where the raw input is read from and where every generated file is written.
input_dir = './raw'
output_dir = './processed'

# Neither DataFrame.to_csv nor open(..., 'w') creates missing parent
# directories, so make sure the output directory exists before any cell
# writes into it. exist_ok=True makes re-running this cell a no-op.
os.makedirs(output_dir, exist_ok=True)

# Name of the raw CSV inside input_dir.
inp_fname = 'nba.csv'

# Output artefacts, all named after the dataset:
#   <name>.csv            - full dataset
#   <name>_train.csv      - training split
#   <name>_test.csv       - test split
#   <name>_test_key.csv   - test key
#   <name>_infer_req.json - sample inference request
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

# Read Data

In [4]:
# Load the raw CSV from the input directory and preview its first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


In [5]:
# Names of the row-identifier column and of the target column.
id_col, target_col = "Name", "TARGET_5Yrs"

# Shuffle Data

In [6]:
# Randomly permute every row (frac=1.0 keeps them all); the fixed seed makes
# the resulting order reproducible. The original index labels are retained.
data = data.sample(frac=1.0, random_state=42)
data.head()

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
394,Steve Burtt,47,8.9,4.2,1.5,4.0,38.3,0.0,0.0,0.0,...,1.6,68.8,0.2,0.4,0.6,0.4,0.5,0.1,0.7,0
881,Antoine Wright,39,9.5,1.8,0.7,2.1,35.8,0.0,0.4,6.7,...,0.6,50.0,0.2,0.6,0.8,0.3,0.1,0.1,0.5,1
358,Melvin Turpin,79,24.7,10.6,4.6,9.0,51.1,0.0,0.0,,...,1.8,78.4,2.0,3.8,5.7,0.5,0.5,1.1,1.5,1
367,Charles Jones,29,16.4,3.7,1.3,4.2,31.7,0.7,2.1,31.1,...,0.8,50.0,0.3,1.1,1.4,1.4,0.6,0.2,1.0,1
259,Michael Jackson,58,13.1,2.7,1.1,2.9,37.4,0.1,0.4,24.0,...,0.6,71.9,0.3,0.7,1.0,3.1,0.3,0.1,1.0,0


# Insert Id Column

In [7]:
# Add a sequential integer id as the first column, but only when the dataset
# does not already contain the configured id column.
# NOTE(review): an existing id column is used as-is; its values are not
# checked for uniqueness here.
has_id_column = id_col in data.columns
if not has_id_column:
    data.insert(0, id_col, np.arange(len(data)))
    print(data.head())

# Save Main Data File

In [8]:
# Write the complete (shuffled) dataset to disk without the row index.
data.to_csv(path_or_buf=outp_fname, index=False)

# Train Test Split

In [9]:
# Hold out 10% of the rows as the test set; the fixed random_state keeps the
# split reproducible. train_test_split is imported in the notebook's import
# cell at the top so that all dependencies are visible in one place.
test_size = 0.1

data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)
print(data_train.shape, data_test.shape)

# Training file: every column, target included.
data_train.to_csv(outp_train_fname, index=False)
# Test file: same rows as data_test but with the target column removed.
data_test.drop(columns=[target_col]).to_csv(outp_test_fname, index=False)
# Test key: only the id and target columns of the test rows.
data_test[[id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(1206, 21) (134, 21)


# JSON inference request instance

In [11]:
# Build a sample inference request from the first test row, target removed.
first_row = data_test.drop(columns=[target_col]).iloc[0].to_dict()

# Missing values arrive as NaN, which json.dump would write as the bare token
# NaN -- not valid JSON and rejected by strict parsers. Map them to None so
# they are serialized as null instead.
instance = {
    key: (None if pd.isna(value) else value)
    for key, value in first_row.items()
}
infer_req_instance_dict = {"instances": [instance]}
pprint.pprint(infer_req_instance_dict)

# ensure_ascii=False keeps non-ASCII characters readable in the file.
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'3P Made': 0.1,
                '3P%': 30.0,
                '3PA': 0.2,
                'AST': 1.0,
                'BLK': 0.0,
                'DREB': 0.6,
                'FG%': 46.2,
                'FGA': 4.6,
                'FGM': 2.1,
                'FT%': 73.7,
                'FTA': 1.8,
                'FTM': 1.3,
                'GP': 54,
                'MIN': 12.4,
                'Name': 'Butch Carter',
                'OREB': 0.6,
                'PTS': 5.6,
                'REB': 1.2,
                'STL': 0.4,
                'TOV': 0.9}]}
