# Imports

In [1]:
# All competition files sit in the same raw GitHub folder; derive each URL from it.
_raw_data_dir = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/machinehack/ode_to_code_2022/data/raw'
train_url, test_url, sub_url = (
    f'{_raw_data_dir}/{stem}.csv'
    for stem in ('train', 'test', 'sample_submission')
)

In [46]:
import time
import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
# Display floats with 4 decimal places. The fully qualified option name is used
# because the bare "precision" pattern can match more than one option key in
# recent pandas releases (e.g. styler.format.precision), which raises OptionError.
pd.set_option("display.precision", 4)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style="darkgrid")

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

SEED = 2311

In [3]:
# Download both competition splits from their raw GitHub URLs.
train, test = (pd.read_csv(url) for url in (train_url, test_url))

# EDA

In [4]:
# Column dtypes and non-null counts of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42748 entries, 0 to 42747
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   edible-poisonous      42748 non-null  object 
 1   cap-diameter          42748 non-null  float64
 2   cap-shape             42748 non-null  object 
 3   cap-color             42748 non-null  object 
 4   does-bruise-or-bleed  42748 non-null  object 
 5   gill-attachment       35808 non-null  object 
 6   gill-color            42748 non-null  object 
 7   stem-height           42748 non-null  float64
 8   stem-width            42748 non-null  float64
 9   stem-color            42748 non-null  object 
 10  has-ring              42748 non-null  object 
 11  ring-type             41029 non-null  object 
 12  habitat               42748 non-null  object 
 13  season                42748 non-null  object 
dtypes: float64(3), object(11)
memory usage: 4.6+ MB


In [5]:
# Column dtypes and non-null counts of the test split.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18321 entries, 0 to 18320
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   edible-poisonous      18321 non-null  object 
 1   cap-diameter          18321 non-null  float64
 2   cap-shape             18321 non-null  object 
 3   cap-color             18321 non-null  object 
 4   does-bruise-or-bleed  18321 non-null  object 
 5   gill-attachment       15377 non-null  object 
 6   gill-color            18321 non-null  object 
 7   stem-height           18321 non-null  float64
 8   stem-width            18321 non-null  float64
 9   stem-color            18321 non-null  object 
 10  has-ring              18321 non-null  object 
 11  ring-type             17569 non-null  object 
 12  habitat               18321 non-null  object 
 13  season                0 non-null      float64
dtypes: float64(4), object(10)
memory usage: 2.0+ MB


In [6]:
target = ['season']
features = [f for f in train.columns if f not in target]
num_features = ['cap-diameter', 'stem-height', 'stem-width']
cat_features = [f for f in features if f not in num_features]
# The test split carries 'season' only as an all-null placeholder; remove it.
# errors='ignore' plus reassignment keeps this cell safe to re-run, whereas an
# in-place drop raises KeyError once the column is already gone.
test = test.drop(columns=target, errors='ignore')

In [7]:
# Summary statistics of the numeric measurements.
train.loc[:, num_features].describe()

Unnamed: 0,cap-diameter,stem-height,stem-width
count,42748.0,42748.0,42748.0
mean,6.712,6.5804,12.134
std,5.2038,3.3684,10.0589
min,0.38,0.0,0.0
25%,3.48,4.64,5.19
50%,5.84,5.95,10.14
75%,8.54,7.74,16.53
max,62.34,33.92,103.91


In [8]:
# Cardinality and most frequent level of each categorical feature.
train.loc[:, cat_features].describe()

Unnamed: 0,edible-poisonous,cap-shape,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-color,has-ring,ring-type,habitat
count,42748,42748,42748,42748,35808,42748,42748,42748,41029,42748
unique,2,7,12,2,7,12,13,2,8,8
top,p,x,n,f,a,w,w,f,f,d
freq,23707,18884,16940,35335,8874,12930,16014,32138,33864,30934


# Missing values

In [9]:
# Missing-value count per column in the training split.
train.isna().sum()

edible-poisonous           0
cap-diameter               0
cap-shape                  0
cap-color                  0
does-bruise-or-bleed       0
gill-attachment         6940
gill-color                 0
stem-height                0
stem-width                 0
stem-color                 0
has-ring                   0
ring-type               1719
habitat                    0
season                     0
dtype: int64

In [10]:
# Missing-value count per column in the test split.
test.isna().sum()

edible-poisonous           0
cap-diameter               0
cap-shape                  0
cap-color                  0
does-bruise-or-bleed       0
gill-attachment         2944
gill-color                 0
stem-height                0
stem-width                 0
stem-color                 0
has-ring                   0
ring-type                752
habitat                    0
dtype: int64

### ring-type

Ring exists but type is missing:

In [11]:
# Count training rows that report a ring but have no ring-type recorded.
int(((train['has-ring'] == 't') & train['ring-type'].isna()).sum())

1719

In [12]:
# Count test rows that report a ring but have no ring-type recorded.
int(((test['has-ring'] == 't') & test['ring-type'].isna()).sum())

752

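Every missing *ring-type* occurs in a row where *has-ring* is 't' (1719 rows in train, 752 in test), so there are two options:  
1. Fill all missing *ring-type* with 'f', which corresponds to 'none' (this contradicts *has-ring* == 't' for those rows, but keeps every row).  
2. Remove rows with missing *ring-type* when *has-ring* is 't', fill with 'f' when *has-ring* is 'f'. (Not possible on test set)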

In [13]:
# Option 1 on the test split: treat a missing ring-type as 'f' ('none').
# The result is assigned back to the column because fillna(..., inplace=True) on a
# selected column is chained assignment and does not update the frame under
# pandas Copy-on-Write.
test['ring-type'] = test['ring-type'].fillna('f')

In [14]:
# Option 1 on the training split: keep every row and treat a missing ring-type as 'f'.
train_full = train.copy()
# Assign back instead of fillna(..., inplace=True) on the selected column, which is
# chained assignment and does not update the frame under pandas Copy-on-Write.
train_full['ring-type'] = train_full['ring-type'].fillna('f')

In [15]:
# Option 2 on the training split: discard rows that report a ring but have no
# ring-type, then treat any remaining missing ring-type as 'f'.
ring_type_unknown = (train['has-ring'] == 't') & train['ring-type'].isna()
train_reduced = train.drop(train[ring_type_unknown].index)
# Assign back instead of fillna(..., inplace=True) on the selected column, which is
# chained assignment and does not update the frame under pandas Copy-on-Write.
train_reduced['ring-type'] = train_reduced['ring-type'].fillna('f')

### gill-attachment  

A *gill-attachment* of 'none' (== 'f') implies that the mushroom/fruit does not have gills. ([Source](https://www.zoology.ubc.ca/~biodiv/mushroom/gill_attachment.html))  

We can confirm that information from our data:

In [16]:
# Training rows with gill-attachment 'f' whose gill-color is not 'f'.
int(((train['gill-attachment'] == 'f') & (train['gill-color'] != 'f')).sum())

0

Checking *gill-color* for missing *gill-attachment* values:

In [17]:
# Training rows with gill-color 'f' and a missing gill-attachment.
int(((train['gill-color'] == 'f') & train['gill-attachment'].isna()).sum())

0

In [18]:
# gill-color distribution among training rows with a missing gill-attachment.
train.loc[train['gill-attachment'].isna(), 'gill-color'].value_counts()

w    1908
n    1310
k    1056
g     815
y     568
r     293
b     247
e     242
u     202
p     174
o     125
Name: gill-color, dtype: int64

In [19]:
# Test rows with gill-color 'f' and a missing gill-attachment.
int(((test['gill-color'] == 'f') & test['gill-attachment'].isna()).sum())

0

In [20]:
# gill-color distribution among test rows with a missing gill-attachment.
test.loc[test['gill-attachment'].isna(), 'gill-color'].value_counts()

w    839
n    535
k    438
g    321
y    240
r    129
e    111
b    106
p     90
u     89
o     46
Name: gill-color, dtype: int64

In [21]:
# 'm' -> missing. Assign the result back: fillna(..., inplace=True) on a selected
# column is chained assignment and does not update the frame under pandas
# Copy-on-Write.
test['gill-attachment'] = test['gill-attachment'].fillna('m')

In [22]:
# 'm' marks a missing gill-attachment. The result is assigned back to the column:
# fillna(..., inplace=True) on a selected column is chained assignment and does
# not update the frame under pandas Copy-on-Write.
train_full['gill-attachment'] = train_full['gill-attachment'].fillna('m')

# Reduced variant: discard rows whose gill-attachment is missing while gill-color
# is not 'f', then label any remaining gap as 'm'.
gill_attachment_unknown = (
    train_reduced['gill-attachment'].isna()
    & (train_reduced['gill-color'] != 'f')
)
train_reduced = train_reduced.drop(train_reduced[gill_attachment_unknown].index)
train_reduced['gill-attachment'] = train_reduced['gill-attachment'].fillna('m')

# Encoding

In [23]:
# Learn the season -> integer mapping once, on the full training split, and apply
# the same fitted encoder to both training variants so their labels agree.
target_labels = LabelEncoder().fit(train_full['season'])

train_full['season'] = target_labels.transform(train_full['season'])
train_reduced['season'] = target_labels.transform(train_reduced['season'])

In [24]:
# Preview the raw training rows (`train` itself is not encoded; only
# train_full, train_reduced and test are modified in this notebook).
train.head()

Unnamed: 0,edible-poisonous,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,p,5.75,x,n,t,s,w,6.13,14.93,w,f,f,d,a
1,p,2.27,o,g,f,f,f,0.0,0.0,f,f,f,d,s
2,p,5.59,c,y,t,x,p,5.47,11.27,w,f,f,d,a
3,p,0.74,x,p,f,a,n,4.33,0.98,k,f,f,d,a
4,p,9.6,x,n,f,a,w,10.84,28.52,w,f,f,d,u


In [35]:
def _ordinal_codes(labels):
  """Map each label to its position in `labels` (values are numpy integers)."""
  return dict(zip(labels, np.arange(len(labels))))


# Two-level flags.
edible_mapping = {'e': 0, 'p': 1}
bruise_mapping = {'f': 0, 't': 1}
hasring_mapping = {'f': 0, 't': 1}

# Multi-level categories: the code of a level is its position in the list.
caps = ['x', 'o', 'c', 'f', 'b', 's', 'p']
cap_mapping = _ordinal_codes(caps)

# One shared colour table; it is applied to the cap, gill and stem colour columns.
colors = ['f', 'n', 'g', 'y', 'p', 'w', 'o', 'l', 'r', 'e', 'b', 'k', 'u']
color_mapping = _ordinal_codes(colors)

gills = ['f', 's', 'x', 'a', 'e', 'd', 'p']
gill_mapping = _ordinal_codes(gills)
gill_mapping['m'] = -1  # 'm' is the placeholder written for a missing gill-attachment

ringtypes = ['f', 'p', 'l', 'z', 'g', 'm', 'e', 'r', 'c', 'y', 's']
ringtype_mapping = _ordinal_codes(ringtypes)

habitats = ['d', 'l', 'g', 'w', 'm', 'p', 'h', 'u']
habitat_mapping = _ordinal_codes(habitats)

In [37]:
def encode(column, mapping):
  """Replace the values of `column` with their codes from `mapping`.

  Operates in place on the notebook-level frames train_full, train_reduced
  and test. Values that are not keys of `mapping` are kept as they are.
  """
  for frame in (train_full, train_reduced, test):
    current = frame[column]
    frame[column] = current.map(mapping).fillna(current)

In [38]:
# Column -> mapping table; the colour mapping is shared by all three colour columns.
column_mappings = {
  'edible-poisonous': edible_mapping,
  'cap-shape': cap_mapping,
  'cap-color': color_mapping,
  'does-bruise-or-bleed': bruise_mapping,
  'gill-attachment': gill_mapping,
  'gill-color': color_mapping,
  'stem-color': color_mapping,
  'has-ring': hasring_mapping,
  'ring-type': ringtype_mapping,
  'habitat': habitat_mapping,
}
for column_name, column_mapping in column_mappings.items():
  encode(column_name, column_mapping)

# Folds

In [47]:
N_SPLITS = 5  # number of cross-validation folds

# Shuffled, stratified splitter; seeded with SEED so fold assignment is repeatable.
skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=SEED,
)

In [48]:
# Tag every row with the fold in which it serves as validation data.
# The validation row numbers are used as labels with .loc, which relies on
# train_full still having its default 0..n-1 index.
train_full['fold'] = -1
full_splits = skf.split(X=train_full, y=train_full['season'])
for fold, (_, val_idx) in enumerate(full_splits):
  train_full.loc[val_idx, 'fold'] = fold

In [50]:
# Dropping rows left gaps in the index. Renumber it 0..n-1, because the validation
# row numbers produced by skf.split are used as labels with .loc below; the old
# labels are kept as an 'index' column. The guard makes the cell safe to re-run:
# an unconditional reset_index would insert another index column each time.
if 'index' not in train_reduced.columns:
  train_reduced.reset_index(inplace=True)
train_reduced['fold'] = -1
for fold, (_, val_idx) in enumerate(skf.split(X=train_reduced, y=train_reduced['season'])):
  train_reduced.loc[val_idx, 'fold'] = fold

# Final

In [51]:
# (rows, columns) of the raw split and of both prepared training variants.
tuple(frame.shape for frame in (train, train_full, train_reduced))

((42748, 14), (42748, 15), (34333, 16))

In [52]:
# (rows, columns) of the prepared test split.
test.shape

(18321, 13)

In [53]:
# Spot-check a few encoded rows; seeded so the preview shows the same rows on every run.
train_full.sample(5, random_state=SEED)

Unnamed: 0,edible-poisonous,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season,fold
39179,0,8.91,0,4,0,2,5,6.8,15.21,5,0,0,0,2,3
17953,1,3.99,0,3,0,3,5,4.93,3.55,3,1,7,2,0,2
9409,1,13.28,0,6,0,3,6,13.57,22.94,6,1,7,0,2,1
25831,1,3.81,0,9,0,2,5,4.07,6.25,5,0,0,0,0,3
21248,0,4.5,1,1,0,0,0,6.95,53.56,1,0,0,1,2,3


In [54]:
# Spot-check a few encoded rows; seeded so the preview shows the same rows on every run.
test.sample(5, random_state=SEED)

Unnamed: 0,edible-poisonous,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat
8829,0,4.74,3,2,0,1,2,3.82,6.83,5,0,0,0
18224,1,0.67,3,6,0,5,6,4.16,1.01,3,0,0,2
6645,0,13.97,0,1,1,4,5,9.69,18.28,5,1,0,0
10934,0,6.4,2,1,0,4,5,12.6,13.2,5,0,0,0
11853,0,8.81,0,5,0,4,4,5.07,12.25,5,1,6,4


In [55]:
# Write the prepared frames to the working directory, without the pandas index.
prepared_outputs = {
  'train_full.csv': train_full,
  'train_reduced.csv': train_reduced,
  'test.csv': test,
}
for filename, frame in prepared_outputs.items():
  frame.to_csv(filename, index=False)