## Categorical features - encoding and distances

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder 
from sklearn.metrics import pairwise_distances

from scipy.spatial.distance import hamming, euclidean, pdist, squareform

## Soybean small dataset

A dataset of soybean plant observations, including information on plants infested by one of four diseases.

- Dataset contains 47 instances.
- Each instance represents a single plant.
- Characterized by 35 attributes.
- Attributes are categorical.
- Attributes mostly capture various symptoms like leaf spots, root rot, mold growth, seed damage, etc.

Citation:

- Michalski, R. (1987). Soybean (Small). UCI Machine Learning Repository. https://doi.org/10.24432/C5DS3P.

First 3 features:

1. **Date**: The time when the soybean sample was collected. May be represented as a date or as the day of the year.
  
2. **Hail**: Indicates whether the plants have been affected by hail, generally a binary "yes" or "no."

3. **Germination**: Describes the rate of germination.

In [None]:
# Path to the soybean CSV, relative to the notebook's working directory.
soybean_path = 'data/soybean/soybean_data_use.csv'

# Load the file and keep a small working sample in a single step: rows up to
# and including label 10 (label-based slicing is inclusive) and the three
# features described above.
soy_df = pd.read_csv(soybean_path).loc[:10, ['date', 'hail', 'germination']]

soy_df

In [None]:
# Distinct categories observed in the 'date' column of the sample.
soy_df['date'].unique()

In [None]:
# Distinct categories observed in the 'hail' column of the sample.
soy_df['hail'].unique()

In [None]:
# Distinct categories observed in the 'germination' column of the sample.
soy_df['germination'].unique()

### Hamming distance

In [None]:
# Show the rows up to and including label 1 (label-based slices are
# inclusive), with all columns.
soy_df.loc[:1, :]

In [None]:
# Hamming distance between the rows labelled 0 and 1: the proportion of
# features on which the two rows disagree (so a multiple of 1/3 here, since
# three features are kept).
hamming(soy_df.loc[0].to_numpy(), soy_df.loc[1].to_numpy())

In [None]:
# Condensed vector of pairwise Hamming distances between all rows of the
# un-encoded frame.
# NOTE(review): pdist works on numeric arrays - confirm the raw columns are
# numeric, otherwise this call is expected to fail before encoding.
dst = pdist(soy_df.to_numpy(), 'hamming')
# Expand the condensed vector into the full symmetric square matrix and show
# it as a table.
dst_matrix = squareform(dst)
pd.DataFrame(data=dst_matrix)

In [None]:
# Display the working sample again for reference.
soy_df

In [None]:
# Learn the categories of each column, then replace every category with its
# numeric code. The fitted encoder is kept in `or_encoder`.
or_encoder = OrdinalEncoder().fit(soy_df)
soy_df_enc = or_encoder.transform(soy_df)
soy_df_enc

In [None]:
# Pairwise Hamming distances on the ordinal-encoded rows, in SciPy's
# condensed (one entry per unordered pair) form.
dst = pdist(soy_df_enc, 'hamming')
dst

In [None]:
# Turn the condensed distance vector into the full symmetric matrix and
# render it as a table.
dst_matrix = squareform(dst)
pd.DataFrame(data=dst_matrix)

In [None]:
# Same Hamming distance matrix, this time computed with scikit-learn, which
# returns the full square matrix directly.
dst_matrix1 = pairwise_distances(soy_df_enc, metric='hamming')
# Display the scikit-learn result itself, not the earlier SciPy matrix.
pd.DataFrame(dst_matrix1)

In [None]:
# Check that the SciPy (pdist + squareform) and scikit-learn
# (pairwise_distances) matrices have the same shape and exactly equal
# elements (no floating-point tolerance is applied).
np.array_equal(dst_matrix, dst_matrix1)

### Euclidean distance

In [None]:
# One-hot encode the categorical columns into a dense array
# (sparse_output=False). The fitted encoder is kept in `oh_encoder`.
oh_encoder = OneHotEncoder(sparse_output=False).fit(soy_df)
soy_df_oh_enc = oh_encoder.transform(soy_df)

In [None]:
# Number of distinct values in each column; the one-hot encoding is expected
# to produce one output column per distinct value.
soy_df.nunique()

In [None]:
# Inspect the one-hot encoded array.
soy_df_oh_enc

In [None]:
# (rows, one-hot columns); the column count is expected to equal the sum of
# the per-column unique counts shown by soy_df.nunique().
soy_df_oh_enc.shape

In [None]:
# Euclidean distance between the one-hot vectors of the first two rows.
euclidean(soy_df_oh_enc[0], soy_df_oh_enc[1])

In [None]:
# Condensed pairwise Euclidean distances between the one-hot encoded rows.
dst = pdist(soy_df_oh_enc, 'euclidean')
# Expand into the full symmetric matrix and render it as a table.
dst_matrix = squareform(dst)
pd.DataFrame(data=dst_matrix)

In [None]:
# Same Euclidean distance matrix computed with scikit-learn, which returns
# the full square matrix directly.
dst_matrix1 = pairwise_distances(soy_df_oh_enc, metric='euclidean')
# Display the scikit-learn result itself, not the earlier SciPy matrix.
pd.DataFrame(dst_matrix1)