# Data Processing

In [96]:
import numpy as np
import pandas as pd

In [97]:
df_orig = pd.DataFrame([['big', 2.00, 'black', 'coffee'],
                   ['medium', 1.90, 'black', 'coffee'],
                   ['small', 1.80, 'black', 'coffee'],
                   ['small', 1.20, 'brown', 'tea'],
                   ['medium', 1.30, 'brown', 'tea']], 
                   columns=['size', 'price', 'colour', 'drink'])
df_orig

Unnamed: 0,size,price,colour,drink
0,big,2.0,black,coffee
1,medium,1.9,black,coffee
2,small,1.8,black,coffee
3,small,1.2,brown,tea
4,medium,1.3,brown,tea


## Categorical Data
- **nominal**
- **ordinal**

## Encoding Norminal: `LabelEncoder`

In [120]:
df = df_orig.copy()

In [121]:
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['drink']) # y can now be used as labels
print(y[0:5])
class_le.classes_

[0 0 0 1 1]


array(['coffee', 'tea'], dtype=object)

## Encoding Norminal into One-Hot: `LabelEncoder`

In [133]:
df = df_orig.copy()
from sklearn.preprocessing import OneHotEncoder
colour_le = LabelEncoder()
colour_le.fit_transform(df['colour'].values)
colour_le.classes_

array(['black', 'brown'], dtype=object)

In [134]:
df['colour']

0    black
1    black
2    black
3    brown
4    brown
Name: colour, dtype: object

In [135]:
X = pd.get_dummies(df[['price', 'colour']])
X

Unnamed: 0,price,colour_black,colour_brown
0,2.0,1.0,0.0
1,1.9,1.0,0.0
2,1.8,1.0,0.0
3,1.2,0.0,1.0
4,1.3,0.0,1.0


## Hmm ...  Encoding Nominal: `OneHotEncoder` ??

In [136]:
df = df_orig.copy()

from sklearn.preprocessing import OneHotEncoder
colour_le = LabelEncoder()
df['colour'] = colour_le.fit_transform(df['colour'].values)
colour_le.classes_

array(['black', 'brown'], dtype=object)

In [137]:
df['colour']

0    0
1    0
2    0
3    1
4    1
Name: colour, dtype: int64

In [141]:
from sklearn.preprocessing import OneHotEncoder
#ohe = OneHotEncoder(categorical_features=[2], sparse=False) # feature column number 0. ['size'] won't work
#ohe.fit_transform(df['colour'].values)

ohe = OneHotEncoder(categorical_features=[0], sparse=True) # feature column number 0. ['size'] won't work
ohe.fit_transform(df['colour'].values).toarray()
X = pd.get_dummies(df[['price', 'colour']])
X



Unnamed: 0,price,colour
0,2.0,0
1,1.9,0
2,1.8,0
3,1.2,1
4,1.3,1


## Encoding Ordinal: `.map`
Order does has a meaning.

In [61]:
df = df_orig.copy()
size_map = {'big':3, 'medium':2, 'small':1}
df['size'] = df['size'].map(size_map)
df

Unnamed: 0,size,price,drink
0,3,2.0,coffee
1,2,1.9,coffee
2,1,1.8,coffee
3,1,1.2,tea
4,2,1.3,tea


## Below won't preserve the intended order!!

In [8]:
df = df_orig.copy()
np.unique(df['size'])

array(['big', 'medium', 'small'], dtype=object)

In [9]:
size_map = {category:idx for idx, category in enumerate(np.unique(df['size']))}
size_map

{'big': 0, 'medium': 1, 'small': 2}