In [1]:
import numpy as np
import pandas as pd
import os, sys

In [2]:
# Dataset identifier; used below to build the output directory and output file name.
dataset_name = 'soybean_disease'
# File name of the raw CSV, joined with `input_dir` when the data is read.
inp_fname = 'soybean_disease.csv'

In [3]:
# Directory that holds the raw input CSV.
input_dir = './data'
# Directory the processed dataset is written to (one sub-folder per dataset name).
output_dir = f'./../../processed/{dataset_name}/'
# Full path of the processed CSV written at the end of the notebook.
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')

# Read Data

In [4]:
# Column names for the raw CSV, which is read without a header row.
# The list is positional: 35 feature columns followed by the target column.
# Fix: 'stem' previously carried a stray leading space (' stem'), which made the
# column unreachable as data['stem'] and leaked the typo into the saved header.
cols = [
    'date', 'plant-stand', 'precip', 'temp', 'hail',
    'crop-hist', 'area-damaged', 'severity', 'seed-tmt', 'germination',
    'plant-growth', 'leaves', 'leafspots-halo', 'leafspots-marg', 'leafspot-size',
    'leaf-shred', 'leaf-malf', 'leaf-mild', 'stem', 'lodging',
    'stem-cankers', 'canker-lesion', 'fruiting-bodies', 'external decay', 'mycelium',
    'int-discolor', 'sclerotia', 'fruit-pods', 'fruit-spots', 'seed',
    'mold-growth', 'seed-discolor', 'seed-size', 'shriveling', 'roots',
    'disease',
]

In [5]:
# Load the raw CSV. The file has no header row, so names come from `cols`.
# skipinitialspace=True drops the blank that follows each delimiter in the raw
# file (the missing-value marker shows up as ' ?'), so category values are
# stored as e.g. 'norm' instead of ' norm'.
data = pd.read_csv(
    os.path.join(input_dir, inp_fname),
    header=None,
    names=cols,
    skipinitialspace=True,
)
data.head()

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,disease
0,october,normal,gt-norm,norm,yes,same-lst-yr,low-areas,pot-severe,none,90-100,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
1,august,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,severe,fungicide,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
2,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,fungicide,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
3,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,none,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
4,october,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,pot-severe,none,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker


In [6]:
# Names of the identifier column (added later if absent) and the label column.
id_col, target_col = "id", "disease"

# Prepare Data

In [7]:
# '?' marks a missing value in the raw file (with or without a leading blank);
# convert both spellings to NaN in one pass.
missing_markers = [' ?', '?']
data = data.replace(missing_markers, np.nan)

In [8]:
# Show the last five rows to check how the replaced missing values look.
data.tail(n=5)

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,disease
678,april,,,,,,upper-areas,,,,...,,,,,,,,,,2-4-d-injury
679,april,lt-normal,,lt-norm,,diff-lst-year,scattered,,,,...,,dna,,,,,,,rotted,herbicide-injury
680,june,lt-normal,,lt-norm,,diff-lst-year,scattered,,,,...,,dna,,,,,,,rotted,herbicide-injury
681,april,lt-normal,,lt-norm,,same-lst-yr,whole-field,,,,...,,dna,,,,,,,rotted,herbicide-injury
682,june,lt-normal,,lt-norm,,same-lst-yr,whole-field,,,,...,,dna,,,,,,,rotted,herbicide-injury


In [13]:
# Number of rows per value of the target column.
# NOTE(review): the stored output of this cell (KeyError: 'class') appears to be
# stale and its execution count is out of sequence with the neighbouring cells;
# re-run the notebook top to bottom to refresh it.
data[target_col].value_counts()

KeyError: 'class'

In [9]:
# Print per-column non-null counts and dtypes to see how many values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   date             682 non-null    object
 1   plant-stand      647 non-null    object
 2   precip           645 non-null    object
 3   temp             653 non-null    object
 4   hail             562 non-null    object
 5   crop-hist        667 non-null    object
 6   area-damaged     682 non-null    object
 7   severity         562 non-null    object
 8   seed-tmt         562 non-null    object
 9   germination      571 non-null    object
 10  plant-growth     667 non-null    object
 11  leaves           683 non-null    object
 12  leafspots-halo   599 non-null    object
 13  leafspots-marg   599 non-null    object
 14  leafspot-size    599 non-null    object
 15  leaf-shred       583 non-null    object
 16  leaf-malf        599 non-null    object
 17  leaf-mild        575 non-null    ob

# Insert Id Column

In [10]:
# Add a sequential identifier as the first column, unless one already exists.
id_missing = id_col not in data.columns
if id_missing:
    N = len(data)
    row_ids = np.arange(N)
    data.insert(loc=0, column=id_col, value=row_ids)
    print(data.head())
# Store identifiers as strings rather than integers.
data[id_col] = data[id_col].astype(str)

   id     date plant-stand    precip   temp  hail          crop-hist  \
0   0  october      normal   gt-norm   norm   yes        same-lst-yr   
1   1   august      normal   gt-norm   norm   yes   same-lst-two-yrs   
2   2     july      normal   gt-norm   norm   yes        same-lst-yr   
3   3     july      normal   gt-norm   norm   yes        same-lst-yr   
4   4  october      normal   gt-norm   norm   yes   same-lst-two-yrs   

  area-damaged     severity    seed-tmt  ... sclerotia fruit-pods fruit-spots  \
0    low-areas   pot-severe        none  ...    absent       norm         dna   
1    scattered       severe   fungicide  ...    absent       norm         dna   
2    scattered       severe   fungicide  ...    absent       norm         dna   
3    scattered       severe        none  ...    absent       norm         dna   
4    scattered   pot-severe        none  ...    absent       norm         dna   

    seed mold-growth seed-discolor seed-size shriveling  roots  \
0   norm      

# Save Main Data File

In [11]:
# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing (no error if it is already there).
os.makedirs(output_dir, exist_ok=True)
# Write the processed frame without the pandas row index.
data.to_csv(outp_fname, index=False)