# Handling Categorical data

In [2]:
import pandas as pd

# color, size, price, classlabel
df = pd.DataFrame([
        ['green', 'S', 10.5, 'class1'],
        ['red','L', 14, 'class2'],
        ['orange', 'M', 12.5, 'class3'],
        ['yellow','XL', 17.3, 'class2']])
df.columns = ['color', 'size', 'price', 'class']
df

Unnamed: 0,color,size,price,class
0,green,S,10.5,class1
1,red,L,14.0,class2
2,orange,M,12.5,class3
3,yellow,XL,17.3,class2


# If column labels are known and limited, then we can encode them as shown below (Map size column to numbers): 

In [3]:
size_mapping = {
    'S': 1,
    'M': 2,
    'L': 3,
    'XL': 4
}

df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,class
0,green,1,10.5,class1
1,red,3,14.0,class2
2,orange,2,12.5,class3
3,yellow,4,17.3,class2


# Reverse mapping of size column back to its values

In [4]:
# create inverse mapping of size
inv_size_mapping = {v:k for k,v in size_mapping.items()}
print "Reverse of size mapping:", inv_size_mapping

# apply reverse size mapping to get back original df
df['size'] = df['size'].map(inv_size_mapping)
df

Reverse of size mapping: {1: 'S', 2: 'M', 3: 'L', 4: 'XL'}


Unnamed: 0,color,size,price,class
0,green,S,10.5,class1
1,red,L,14.0,class2
2,orange,M,12.5,class3
3,yellow,XL,17.3,class2


# If we dont know how many labels are present in the column, then encode tham as shown below (class label is mapped):

In [5]:
import numpy as np

# iterate over unique class labels and assign numbers to them sequentially
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['class']))}
print class_mapping

# apply class_mapping on df
df['size'] = df['size'].map(size_mapping)
df['class'] = df['class'].map(class_mapping)
df

# reverse map class in df
inv_class_mapping = {v: k for k,v in class_mapping.items()}
df['class'] = df['class'].map(inv_class_mapping)
print df

{'class2': 1, 'class3': 2, 'class1': 0}
    color  size  price   class
0   green     1   10.5  class1
1     red     3   14.0  class2
2  orange     2   12.5  class3
3  yellow     4   17.3  class2


# Using label encoder to encode labels

In [10]:
from sklearn.preprocessing import LabelEncoder

# perform transformation on df1 (copy of df)
df1 = df

# transform class and color using labelencoder
class_le = LabelEncoder()
color_le = LabelEncoder() # have separate encoder for each column
df1['class'] = class_le.fit_transform(df1['class'].values)
df1['color'] = color_le.fit_transform(df1['color'].values)
print "\n\nclass and color transformed df::::\n", df1

# reverse transofrm class using label encoder
df1['class'] = class_le.inverse_transform(df1['class'])
print "\n\nclass reverse transformed df:\n", df1



class and color transformed df::::
   color  size  price  class
0      0     1   10.5      0
1      2     3   14.0      1
2      1     2   12.5      2
3      3     4   17.3      1


class reverse transformed df:
   color  size  price   class
0      0     1   10.5  class1
1      2     3   14.0  class2
2      1     2   12.5  class3
3      3     4   17.3  class2


# what's the problem with above method to encode features?
Although the color values don't come in any particular order, a learning algorithm will now assume that green is larger than blue, and red is larger than green. As this assumption is incorrect, we'll not get optimal results..Solution is:

# One-hot Encoding
Here, we would convert the color feature into three new features: blue, green, and red. Binary values can then be used to indicate the particular color of a sample; for example, a blue sample can be encoded as blue=1, green=0, red=0

In [11]:
from sklearn.preprocessing import OneHotEncoder

# transform class label to numeric and perform one-hot-encoding on it
df1['class'] = class_le.fit_transform(df1['class'].values)

# generates sparse matrix. So, use toarray()
sparse_ohe = OneHotEncoder(categorical_features=[0])
X = sparse_ohe.fit_transform(df1).toarray()
print X

# generates dense matrix. So, no need to use toarray.
dense_ohe = OneHotEncoder(categorical_features=[0],sparse=False)
X1 = dense_ohe.fit_transform(df1)
print X1


[[  1.    0.    0.    0.    1.   10.5   0. ]
 [  0.    0.    1.    0.    3.   14.    1. ]
 [  0.    1.    0.    0.    2.   12.5   2. ]
 [  0.    0.    0.    1.    4.   17.3   1. ]]
[[  1.    0.    0.    0.    1.   10.5   0. ]
 [  0.    0.    1.    0.    3.   14.    1. ]
 [  0.    1.    0.    0.    2.   12.5   2. ]
 [  0.    0.    0.    1.    4.   17.3   1. ]]


# Note: OneHotEncoding can be done only on numeric or boolean columns. 
If we want to convert string columns into numeric values, use get_dummies() method as shown below. 

In [98]:
pd.get_dummies(df[['price', 'color', 'size']])
df

Unnamed: 0,color,size,price,class
0,0,1,10.5,0
1,2,3,14.0,1
2,1,2,12.5,2
3,3,4,17.3,1
