## In this practice session, we will learn how to encode categorical features with higher cardinality

## Some of the techniques covered are as follows:

  * **Label Encoder**
  * **Ordinal Encoder**
  * **Label Binarizer**
  * **Count Encoding**

In [None]:
!python -m pip install pip --upgrade --user -q
!python -m pip install numpy pandas seaborn matplotlib scipy statsmodels sklearn --user -q

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the dataset from 'melb_data.csv' (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv('melb_data.csv')
df.head()

In [None]:
# (number of rows, number of columns) of the loaded data
df.shape

In [None]:
# Per-column summary: dtype and non-null count
df.info()

In [None]:
# Collect the names of all columns whose dtype is 'object'; these are
# treated as the categorical variables for the rest of the session.
s = df.dtypes == 'object'  # boolean mask indexed by column name
object_cols = [column for column, is_object in s.items() if is_object]

print("Categorical variables:", object_cols, sep="\n")

In [None]:
# Cardinality check: number of distinct values in each categorical column
df[object_cols].nunique()

In [None]:
# Keep only the three categorical columns used in the encoding examples.
# .copy() makes `features` an independent DataFrame, so the new columns added
# in later cells modify it directly instead of raising SettingWithCopyWarning
# (pandas < 3) on a subset of `df`.
features = df[['Type', 'Method', 'Regionname']].copy()
features.head()

In [None]:
# Frequency of each category in the Type column
features['Type'].value_counts()

In [None]:
# Frequency of each category in the Method column
features['Method'].value_counts()

In [None]:
# Frequency of each category in the Regionname column
features['Regionname'].value_counts()

In [None]:
mapping ={'h':1,
           'u':2,
           't':3
          }
features['type'] = features.Type.map(mapping) 

In [None]:
# Frequency of each manually mapped code in the new `type` column
features['type'].value_counts()

## Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# NOTE: scikit-learn documents LabelEncoder as a target-label encoder; it is
# used on a feature column here purely for demonstration.
le = LabelEncoder()
# .copy() so that adding the `Region` column below modifies an independent
# DataFrame and does not raise SettingWithCopyWarning.
df1 = features[['Regionname']].copy()
# Cast to str so every value is a plain string before encoding, then map each
# distinct region name to an integer code.
df1['Region'] = le.fit_transform(features['Regionname'].astype(str).tolist())

In [None]:
df1['Region'].value_counts()

In [None]:
# One-hot encoding: build one indicator column per distinct value of Method
df2 = pd.get_dummies(features['Method'])

In [None]:
# Display the one-hot encoded Method columns
df2

## Ordinal Encoder

In [None]:
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder()
# fit_transform expects a 2-D input (hence the double brackets) and returns an
# (n_rows, 1) array. Flatten it and attach it positionally: wrapping it in a
# new DataFrame would align on index labels and silently produce NaN whenever
# `features` does not have a default 0..n-1 index.
type_codes = oe.fit_transform(features[['Type']]).ravel()
# assign() returns a new DataFrame, avoiding SettingWithCopyWarning on a
# column subset of `df`.
features = features.assign(Type_ord=type_codes)



In [None]:
# Frequency of each ordinal code produced for Type
features.Type_ord.value_counts()

In [None]:
# Preview the original columns next to the encoded ones added so far
features.head()

## Label Binarizer

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Fit a LabelBinarizer on the Type column and transform it into a binary
# indicator array (one-vs-all encoding of the Type labels).
lb_style = LabelBinarizer()
lb_results = lb_style.fit_transform(features["Type"])

In [None]:
# Label the binarized columns with the fitted class names and count the
# distinct values held by each indicator column.
# NOTE(review): nunique() only shows how many distinct values each column
# holds; presumably .head() or .sum() was intended to inspect the encoding — confirm.
pd.DataFrame(lb_results, columns=lb_style.classes_).nunique()

## Count Encoding

In [None]:
df_frequency_map = features.Type.value_counts().to_dict()
features.Type = features.Type.map(df_frequency_map)

In [None]:
# Spot-check the count-encoded Type values at row positions 20, 21 and 22
features.Type.iloc[20:23]

In [None]:
# Preview the frame after count encoding has replaced the Type column
features.head()