# Label Encoder in Python

- Label Encoding will covert the labels into numeric form so machine can understand the data easily.

- If we had a dataset:
  - Cat   = 0
  - Dog   = 1
  - Bird  = 2
-This is how label encoder convert the whole categories available in data.
 

In [1]:
import numpy as np

In [2]:
a = np.array(['Cat','Dog','Bird'])

In [5]:
a

array(['Cat', 'Dog', 'Bird'], dtype='<U4')

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
le = LabelEncoder()

In [10]:
a = le.fit_transform(a)

In [11]:
a

array([1, 2, 0], dtype=int64)

In [12]:
le.inverse_transform(a)

array(['Cat', 'Dog', 'Bird'], dtype='<U4')

In [13]:
# Label Encoder doesn't work on DataFrame or multiple columns at a time

In [15]:
import pandas as pd

In [16]:
df = pd.DataFrame({'Animal':['Cat','Dog','Bird'],
                  'Color':['Black','Brown','White']})

In [17]:
df

Unnamed: 0,Animal,Color
0,Cat,Black
1,Dog,Brown
2,Bird,White


In [19]:
le.fit_transform(df)

ValueError: y should be a 1d array, got an array of shape (3, 2) instead.

In [27]:
#So for converting multiple columns with label encoder we can use apply method


df = df[['Animal','Color']].apply(le.fit_transform)

In [28]:
df

Unnamed: 0,Animal,Color
0,1,0
1,2,1
2,0,2


In [34]:
import seaborn as sns

In [35]:
df = sns.load_dataset('iris')

In [36]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [37]:
df.species = le.fit_transform(df.species)

In [38]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


# Ordinal Encoder

In [39]:
df = pd.DataFrame({'Animal':['Cat','Dog','Bird'],
                  'Color':['Black','Brown','White']})

In [40]:
df

Unnamed: 0,Animal,Color
0,Cat,Black
1,Dog,Brown
2,Bird,White


In [41]:
from sklearn.preprocessing import OrdinalEncoder

In [42]:
en = OrdinalEncoder()

In [43]:
df = en.fit_transform(df)

In [44]:
df

array([[1., 0.],
       [2., 1.],
       [0., 2.]])

In [45]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [54]:
df = sns.load_dataset('diamonds')

In [55]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [56]:
df[['cut','color']] = en.fit_transform(df[['cut','color']])

In [57]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2.0,1.0,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3.0,1.0,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1.0,1.0,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3.0,5.0,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,1.0,6.0,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,2.0,0.0,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,1.0,0.0,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,4.0,0.0,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,3.0,4.0,SI2,61.0,58.0,2757,6.15,6.12,3.74


# One Hot Encoding

In [74]:
a = np.array([['Cat'],['Dog'],['Bird']])

In [75]:
a

array([['Cat'],
       ['Dog'],
       ['Bird']], dtype='<U4')

In [76]:
from sklearn.preprocessing import OneHotEncoder

In [77]:
ohc = OneHotEncoder(sparse=False)

In [78]:
ohc

OneHotEncoder(sparse=False)

In [79]:
a = ohc.fit_transform(a)

In [80]:
a

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [106]:
df = sns.load_dataset('iris')

In [107]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [108]:
df.species.value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [109]:
ohc = OneHotEncoder(handle_unknown=False)

In [110]:
ohc

OneHotEncoder(handle_unknown=False)

In [118]:
df.species = ohc.fit_transform(df[['species']]).toarray()

ValueError: handle_unknown should be either 'error' or 'ignore', got False.

In [112]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [117]:
df.species = df.species.str.get_dummies()

ValueError: Columns must be same length as key

In [119]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [120]:
y = df.species

In [121]:
y

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object

In [122]:
ohe = OneHotEncoder(sparse=False)

In [124]:
ohe

OneHotEncoder(sparse=False)

In [127]:
y = y.reshape(len(y), 1)

AttributeError: 'Series' object has no attribute 'reshape'

In [129]:
df = pd.get_dummies(df, columns=['species'])

In [130]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_setosa,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0,0,1
146,6.3,2.5,5.0,1.9,0,0,1
147,6.5,3.0,5.2,2.0,0,0,1
148,6.2,3.4,5.4,2.3,0,0,1


In [131]:
df = sns.load_dataset('diamonds')

In [132]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [134]:
df[['cut','color']] = pd.DataFrame(ohe.fit_transform(df[['cut','color']]).toarray())

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [138]:
enc_data=pd.DataFrame(ohe.fit_transform(df[['cut','color']]).toarray())

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [139]:
from sklearn.preprocessing import LabelBinarizer

In [140]:
lb = LabelBinarizer()

In [141]:
df = sns.load_dataset('iris')

In [145]:
df.species = lb.fit_transform(df.species)

In [146]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0
146,6.3,2.5,5.0,1.9,0
147,6.5,3.0,5.2,2.0,0
148,6.2,3.4,5.4,2.3,0


In [150]:
df['species'] = OneHotEncoder().fit_transform(df[['species']]).toarray()

In [151]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,1.0
146,6.3,2.5,5.0,1.9,1.0
147,6.5,3.0,5.2,2.0,1.0
148,6.2,3.4,5.4,2.3,1.0
