In [1]:
import numpy as np
import pandas as pd
import os
import json
import pprint

In [2]:
# Dataset identity: the short name used to build output file names, and the
# file name of the raw input CSV.
dataset_name, inp_fname = 'mushroom', 'dataset_24_mushroom.csv'

In [3]:
# Input / output locations, relative to the notebook's working directory.
input_dir = './raw'
output_dir = './processed'

# Create the output directory up front: the to_csv / open(..., 'w') calls in
# the cells below do not create missing parent directories and would fail on a
# fresh checkout. exist_ok=True keeps re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

# Output artefacts, all named after the dataset.
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')                      # full dataset
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')          # train split
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')            # test split, target removed
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')    # test ids + target
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json') # sample inference request

# Read Data

In [4]:
# Load the raw CSV. The single quote is the quote character here, so the
# double quotes around the header names remain part of the column names
# at this point (see the output below).
inp_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(inp_path, quotechar="'")
data.head()

Unnamed: 0,"""cap-shape""","""cap-surface""","""cap-color""","""bruises%3F""","""odor""","""gill-attachment""","""gill-spacing""","""gill-size""","""gill-color""","""stalk-shape""",...,"""stalk-color-above-ring""","""stalk-color-below-ring""","""veil-type""","""veil-color""","""ring-number""","""ring-type""","""spore-print-color""","""population""","""habitat""","""class"""
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,e
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,p
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,e


In [5]:
# Column names used by the following cells: the row identifier and the
# prediction target.
id_col, target_col = "id", "class"

# Insert Id Column

In [6]:
# Add a sequential row id (0..N-1) as the first column, unless a column with
# that name is already present.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Ids are stored as strings from here on.
data[id_col] = data[id_col].astype(str)

   id "cap-shape" "cap-surface" "cap-color" "bruises%3F" "odor"  \
0   0           x             s           n            t      p   
1   1           x             s           y            t      a   
2   2           b             s           w            t      l   
3   3           x             y           w            t      p   
4   4           x             s           g            f      n   

  "gill-attachment" "gill-spacing" "gill-size" "gill-color"  ...  \
0                 f              c           n            k  ...   
1                 f              c           b            k  ...   
2                 f              c           b            n  ...   
3                 f              c           n            n  ...   
4                 f              w           b            k  ...   

  "stalk-color-above-ring" "stalk-color-below-ring" "veil-type" "veil-color"  \
0                        w                        w           p            w   
1                        w  

# Shuffle Data

In [7]:
# Shuffle all rows: frac=1 samples the whole frame, and the fixed seed keeps
# the resulting order reproducible across runs.
data = data.sample(
    frac=1,
    random_state=42,
)
data.head()

Unnamed: 0,id,"""cap-shape""","""cap-surface""","""cap-color""","""bruises%3F""","""odor""","""gill-attachment""","""gill-spacing""","""gill-size""","""gill-color""",...,"""stalk-color-above-ring""","""stalk-color-below-ring""","""veil-type""","""veil-color""","""ring-number""","""ring-type""","""spore-print-color""","""population""","""habitat""","""class"""
1971,1971,f,f,n,f,n,f,w,b,h,...,w,w,p,w,o,e,n,s,g,e
6654,6654,f,s,e,f,y,f,c,n,b,...,p,p,p,w,o,e,w,v,l,p
5606,5606,x,y,n,f,f,f,c,n,b,...,w,p,p,w,o,e,w,v,l,p
3332,3332,f,y,g,t,n,f,c,b,n,...,g,p,p,w,o,p,n,y,d,e
6988,6988,f,s,e,f,s,f,c,n,b,...,p,p,p,w,o,e,w,v,l,p


# Replace null indicators

In [8]:
# NOTE(review): this step is currently disabled, so any '?' entries in the raw
# data (if present) are kept as literal values instead of being converted to
# NaN — confirm this is intentional before re-enabling.
# data = data.replace('?', np.nan)

# Update Class labels

# Rename raw class labels to readable names.
# NOTE(review): the keys below ("+" / "-") do not match the labels shown for
# this dataset ("e" / "p"), so the step is written to be a safe no-op here:
#   * the label column is looked up with or without the surrounding double
#     quotes, because the headers are only un-quoted in a later cell
#     (indexing data['class'] directly would raise KeyError before that);
#   * Series.replace leaves labels that are not in `mapping` untouched,
#     whereas Series.map would turn every unmapped label into NaN.
mapping = {
    "+": "positive", 
    "-": "negative"
}

label_cols = [col for col in data.columns if col.strip('"') == 'class']
for label_col in label_cols:
    data[label_col] = data[label_col].replace(mapping)
data.head()

In [9]:
# Drop the double quotes that wrap each column name (left over from reading
# the CSV), then install the cleaned names on the frame.
new_headers = [header.strip('"') for header in data.columns]
data.columns = new_headers

data.head()

Unnamed: 0,id,cap-shape,cap-surface,cap-color,bruises%3F,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
1971,1971,f,f,n,f,n,f,w,b,h,...,w,w,p,w,o,e,n,s,g,e
6654,6654,f,s,e,f,y,f,c,n,b,...,p,p,p,w,o,e,w,v,l,p
5606,5606,x,y,n,f,f,f,c,n,b,...,w,p,p,w,o,e,w,v,l,p
3332,3332,f,y,g,t,n,f,c,b,n,...,g,p,p,w,o,p,n,y,d,e
6988,6988,f,s,e,f,s,f,c,n,b,...,p,p,p,w,o,e,w,v,l,p


# Save Main Data File

In [10]:
# Write the full processed dataset; the DataFrame index is not written.
data.to_csv(path_or_buf=outp_fname, index=False)

# Train Test Split

In [11]:
from sklearn.model_selection import train_test_split
test_size = 0.1

# Hold out `test_size` of the rows for testing; the fixed seed makes the split
# repeatable.
data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)
print(data_train.shape, data_test.shape)

# Train split: every column, including the target.
data_train.to_csv(path_or_buf=outp_train_fname, index=False)
# Test split: written without the target column ...
data_test.drop(columns=[target_col]).to_csv(path_or_buf=outp_test_fname, index=False)
# ... and the matching answers (id + target) go to a separate key file.
data_test.loc[:, [id_col, target_col]].to_csv(path_or_buf=outp_test_key_fname, index=False)

(7311, 24) (813, 24)


# JSON inference request instance

In [12]:
# Build a sample inference request from the first test row: NaN becomes None
# (serialised as JSON null) and the target column is left out.
instance = (
    data_test
    .replace({np.nan: None})
    .drop(columns=[target_col])
    .reset_index(drop=True)
    .loc[0]
    .to_dict()
)
infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

# Persist the request as UTF-8 JSON; ensure_ascii=False keeps non-ASCII
# characters unescaped in the file.
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'bruises%3F': 'f',
                'cap-color': 'y',
                'cap-shape': 'f',
                'cap-surface': 'y',
                'gill-attachment': 'f',
                'gill-color': 'p',
                'gill-size': 'b',
                'gill-spacing': 'c',
                'habitat': 'p',
                'id': '5885',
                'odor': 'f',
                'population': 'v',
                'ring-number': 'o',
                'ring-type': 'l',
                'spore-print-color': 'h',
                'stalk-color-above-ring': 'b',
                'stalk-color-below-ring': 'n',
                'stalk-root': 'b',
                'stalk-shape': 'e',
                'stalk-surface-above-ring': 'k',
                'stalk-surface-below-ring': 'k',
                'veil-color': 'w',
                'veil-type': 'p'}]}
