In [1]:
import numpy as np
import pandas as pd
import os, sys
import json
import pprint

In [2]:
dataset_name = 'soybean_disease'
inp_fname = 'soybean_disease.csv'

In [3]:
input_dir = './raw'
output_dir = './processed'

outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

# Read Data

In [4]:
cols = [
    'date', 'plant-stand', 'precip', 'temp', 'hail',
    'crop-hist', 'area-damaged', 'severity', 'seed-tmt', 'germination',
    'plant-growth', 'leaves', 'leafspots-halo', 'leafspots-marg', 'leafspot-size',
    'leaf-shred', 'leaf-malf', 'leaf-mild',' stem', 'lodging',
    'stem-cankers', 'canker-lesion', 'fruiting-bodies', 'external decay', 'mycelium',
    'int-discolor', 'sclerotia', 'fruit-pods', 'fruit-spots', 'seed',
    'mold-growth', 'seed-discolor', 'seed-size', 'shriveling', 'roots',
    'class'
]

In [5]:
data = pd.read_csv(os.path.join(input_dir, inp_fname), header=None, names=cols)
data.head()

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
0,october,normal,gt-norm,norm,yes,same-lst-yr,low-areas,pot-severe,none,90-100,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
1,august,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,severe,fungicide,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
2,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,fungicide,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
3,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,none,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
4,october,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,pot-severe,none,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker


In [6]:
id_col = "id"
target_col = "class"

# Prepare Data

In [7]:
# replace ? which mean null
data = data.replace(' ?', np.nan)

In [8]:
data.tail()

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
678,april,,,,,,upper-areas,,,,...,,,,,,,,,,2-4-d-injury
679,april,lt-normal,,lt-norm,,diff-lst-year,scattered,,,,...,,dna,,,,,,,rotted,herbicide-injury
680,june,lt-normal,,lt-norm,,diff-lst-year,scattered,,,,...,,dna,,,,,,,rotted,herbicide-injury
681,april,lt-normal,,lt-norm,,same-lst-yr,whole-field,,,,...,,dna,,,,,,,rotted,herbicide-injury
682,june,lt-normal,,lt-norm,,same-lst-yr,whole-field,,,,...,,dna,,,,,,,rotted,herbicide-injury


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   date             683 non-null    object
 1   plant-stand      647 non-null    object
 2   precip           645 non-null    object
 3   temp             653 non-null    object
 4   hail             562 non-null    object
 5   crop-hist        667 non-null    object
 6   area-damaged     682 non-null    object
 7   severity         562 non-null    object
 8   seed-tmt         562 non-null    object
 9   germination      571 non-null    object
 10  plant-growth     667 non-null    object
 11  leaves           683 non-null    object
 12  leafspots-halo   599 non-null    object
 13  leafspots-marg   599 non-null    object
 14  leafspot-size    599 non-null    object
 15  leaf-shred       583 non-null    object
 16  leaf-malf        599 non-null    object
 17  leaf-mild        575 non-null    ob

# Insert Id Column

In [10]:
# insert Id column 
if id_col not in data.columns:
    N = data.shape[0]
    data.insert(0, id_col, np.arange(N))
    print(data.head())
data[id_col] = data[id_col].astype(str)

   id     date plant-stand    precip   temp  hail          crop-hist  \
0   0  october      normal   gt-norm   norm   yes        same-lst-yr   
1   1   august      normal   gt-norm   norm   yes   same-lst-two-yrs   
2   2     july      normal   gt-norm   norm   yes        same-lst-yr   
3   3     july      normal   gt-norm   norm   yes        same-lst-yr   
4   4  october      normal   gt-norm   norm   yes   same-lst-two-yrs   

  area-damaged     severity    seed-tmt  ... sclerotia fruit-pods fruit-spots  \
0    low-areas   pot-severe        none  ...    absent       norm         dna   
1    scattered       severe   fungicide  ...    absent       norm         dna   
2    scattered       severe   fungicide  ...    absent       norm         dna   
3    scattered       severe        none  ...    absent       norm         dna   
4    scattered   pot-severe        none  ...    absent       norm         dna   

    seed mold-growth seed-discolor seed-size shriveling  roots  \
0   norm      

# Shuffle Data

In [11]:
# shuffle data
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,id,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
292,292,september,normal,gt-norm,gt-norm,,same-lst-sev-yrs,whole-field,,,...,absent,diseased,brown-w/blk-specks,abnorm,present,present,lt-norm,present,,diaporthe-pod-&-stem-blight
145,145,june,normal,gt-norm,norm,yes,same-lst-two-yrs,upper-areas,pot-severe,none,...,absent,norm,absent,norm,absent,absent,norm,absent,norm,brown-spot
211,211,august,normal,norm,norm,yes,same-lst-sev-yrs,whole-field,pot-severe,none,...,absent,norm,absent,norm,absent,absent,norm,absent,norm,alternarialeaf-spot
118,118,may,normal,gt-norm,norm,yes,same-lst-two-yrs,whole-field,pot-severe,fungicide,...,absent,norm,absent,norm,absent,absent,norm,absent,norm,brown-spot
302,302,?,,,,,,,,,...,,,,,,,,,,2-4-d-injury


# Save Main Data File

In [12]:
data.to_csv(outp_fname, index=False)

# Train Test Split

In [13]:
from sklearn.model_selection import train_test_split
test_size = 0.1

data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)
print(data_train.shape, data_test.shape)

data_train.to_csv(outp_train_fname, index=False)
data_test.drop(columns=[target_col]).to_csv(outp_test_fname, index=False)
data_test[[id_col, target_col]].to_csv(outp_test_key_fname, index=False)

(614, 37) (69, 37)


# JSON inference request instance

In [14]:
instance = data_test.replace({np.nan:None}).drop(columns=[target_col]).reset_index(drop=True).loc[0].to_dict()
infer_req_instance_dict = {  "instances": [ {**instance}, ] }
pprint.pprint(infer_req_instance_dict)
   
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{' stem': ' norm',
                'area-damaged': ' whole-field',
                'canker-lesion': ' dna',
                'crop-hist': ' same-lst-yr',
                'date': 'june',
                'external decay': ' absent',
                'fruit-pods': ' norm',
                'fruit-spots': ' absent',
                'fruiting-bodies': ' absent',
                'germination': ' 80-89',
                'hail': ' yes',
                'id': '446',
                'int-discolor': ' none',
                'leaf-malf': ' absent',
                'leaf-mild': ' absent',
                'leaf-shred': ' absent',
                'leafspot-size': ' gt-1/8',
                'leafspots-halo': ' no-yellow-halos',
                'leafspots-marg': ' w-s-marg',
                'leaves': ' abnorm',
                'lodging': ' yes',
                'mold-growth': ' absent',
                'mycelium': ' absent',
                'plant-growth': ' norm',
                'plant-st