In [12]:
import numpy as np 
import pandas as pd 
import os 
import json
import pprint

In [13]:
# Base name from which every output file name in this notebook is derived.
dataset_name = "white_wine"

In [14]:
# Raw input is read from `input_dir`; every generated file is written under `output_dir`.
input_dir, output_dir = './raw', './processed'

# File name of the raw CSV inside `input_dir`.
inp_fname = 'winequality-white.csv'


def _processed_path(fname):
    """Return `fname` joined onto `output_dir`."""
    return os.path.join(output_dir, fname)


# Output targets, all derived from `dataset_name`.
outp_fname = _processed_path(f'{dataset_name}.csv')  # full dataset
outp_train_fname = _processed_path(f'{dataset_name}_train.csv')  # training split
outp_test_fname = _processed_path(f'{dataset_name}_test.csv')  # test split without the target column
outp_test_key_fname = _processed_path(f'{dataset_name}_test_key.csv')  # id + target of the test split
outp_infer_instances = _processed_path(f'{dataset_name}_infer_req.json')  # sample inference request

# Read Data

In [15]:
# Labels assigned positionally to the columns of the raw CSV when it is loaded.
col_names = [
    "fixed acidity", "volatile acidity", "citric acid",
    "residual sugar", "chlorides",
    "free sulfur dioxide", "total sulfur dioxide",
    "density", "pH", "sulphates", "alcohol",
    "quality",
]

In [16]:
# The file is ';'-separated. Its first line is skipped and the columns are
# labelled from `col_names` instead.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(
    raw_csv_path,
    sep=';',
    header=None,
    names=col_names,
    skiprows=1,
)
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [17]:
# Names of the identifier column and of the target column.
id_col, target_col = "id", "quality"

# Shuffle Data

In [18]:
# Draw every row (frac=1) in random order; the fixed random_state makes the
# permutation repeatable. Original index labels are kept.
data = data.sample(
    frac=1,
    random_state=42,
)
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
4656,6.0,0.29,0.41,10.8,0.048,55.0,149.0,0.9937,3.09,0.59,10.966667,7
3659,5.4,0.53,0.16,2.7,0.036,34.0,128.0,0.98856,3.2,0.53,13.2,8
907,7.1,0.25,0.39,2.1,0.036,30.0,124.0,0.9908,3.28,0.43,12.2,8
4352,7.3,0.28,0.35,1.6,0.054,31.0,148.0,0.99178,3.18,0.47,10.7,5
3271,6.5,0.32,0.34,5.7,0.044,27.0,91.0,0.99184,3.28,0.6,12.0,7


# Insert Id Column

In [19]:
# Add a sequential identifier as the first column, unless one is already present.
id_missing = id_col not in data.columns
if id_missing:
    N = len(data)
    row_ids = np.arange(N)
    data.insert(0, id_col, row_ids)
    print(data.head())

# Identifiers are stored as strings from here on.
data[id_col] = data[id_col].astype(str)

      id  fixed acidity  volatile acidity  citric acid  residual sugar  \
4656   0            6.0              0.29         0.41            10.8   
3659   1            5.4              0.53         0.16             2.7   
907    2            7.1              0.25         0.39             2.1   
4352   3            7.3              0.28         0.35             1.6   
3271   4            6.5              0.32         0.34             5.7   

      chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  \
4656      0.048                 55.0                 149.0  0.99370  3.09   
3659      0.036                 34.0                 128.0  0.98856  3.20   
907       0.036                 30.0                 124.0  0.99080  3.28   
4352      0.054                 31.0                 148.0  0.99178  3.18   
3271      0.044                 27.0                  91.0  0.99184  3.28   

      sulphates    alcohol  quality  
4656       0.59  10.966667        7  
3659       0.53 

# Save Main Data File

In [20]:
# Persist the shuffled, id-tagged dataset without the row index.
# The output directory is created first because `to_csv` does not create
# missing parent directories; without this the notebook fails on a fresh
# checkout where `output_dir` does not exist yet.
os.makedirs(output_dir, exist_ok=True)
data.to_csv(outp_fname, index=False)

# Train Test Split

In [21]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for the test split.
test_size = 0.1

data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)
print(data_train.shape, data_test.shape)

# Training split: written with every column, target included.
data_train.to_csv(outp_train_fname, index=False)

# Test split: the main file omits the target column; the target is written
# to a separate key file together with the ids.
test_without_target = data_test.drop(columns=[target_col])
test_without_target.to_csv(outp_test_fname, index=False)

test_key = data_test[[id_col, target_col]]
test_key.to_csv(outp_test_key_fname, index=False)

(4408, 13) (490, 13)


# JSON inference request instance

In [22]:
# Take the first test row, without the target column, as a plain dict.
# NaN values are swapped for None beforehand.
test_features = (
    data_test
    .replace({np.nan: None})
    .drop(columns=[target_col])
    .reset_index(drop=True)
)
instance = test_features.loc[0].to_dict()

# Wrap the single record in the {"instances": [...]} request structure.
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

with open(outp_infer_instances, 'w', encoding='utf8') as out_file:
    json.dump(infer_req_instance_dict, out_file, indent=2, ensure_ascii=False)

{'instances': [{'alcohol': 11.0,
                'chlorides': 0.045,
                'citric acid': 0.49,
                'density': 0.9954,
                'fixed acidity': 7.9,
                'free sulfur dioxide': 48.0,
                'id': '4656',
                'pH': 3.04,
                'residual sugar': 7.7,
                'sulphates': 0.55,
                'total sulfur dioxide': 195.0,
                'volatile acidity': 0.28}]}
