In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
# Read only the six categorical feature columns X1..X6 from the CSV file
data = pd.read_csv(
    "mercedes_benz.csv",
    usecols=[f"X{i}" for i in range(1, 7)],  # 'X1', 'X2', ..., 'X6'
)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [11]:
# Report how many distinct labels each column contains.
# dropna=False counts a missing value as a label too, the same way len(unique()) does.
for col, values in data.items():
    print(col, ': ', values.nunique(dropna=False), ' labels')

X1 :  27  labels
X2 :  44  labels
X3 :  7  labels
X4 :  4  labels
X5 :  29  labels
X6 :  12  labels


In [13]:
# How wide does the frame become once every variable is one-hot encoded
# (with the first level of each variable dropped)?
data_dummies = pd.get_dummies(data, drop_first=True)
data_dummies.shape

(4209, 117)

In [14]:
# From just 6 categorical variables we end up with 117 new variables (columns)
# after one-hot encoding. What can we do instead?
#
# Look at how often each label of X2 occurs. The plan is to keep only the 10 most
# frequent labels, i.e. the first 10 rows of the output below.
#
# value_counts() already returns the counts sorted in descending order, so no
# additional sort_values() call is needed before taking the head.
data["X2"].value_counts().head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
i       25
k       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [15]:
# Labels of the 10 most frequent X2 categories, most frequent first.
# value_counts() sorts by count in descending order by default, so head(10) is the
# top 10; the index of that result holds the labels themselves.
top_10 = data["X2"].value_counts().head(10).index.tolist()
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [18]:
# One-hot encoder practice: a tiny single-column frame of country names
# ('germany' appears twice on purpose).

df = pd.Series(
    ['russia', 'germany', 'australia', 'korea', 'germany'],
    name='country',
).to_frame()
df

Unnamed: 0,country
0,russia
1,germany
2,australia
3,korea
4,germany


In [19]:
# One indicator column per country, each named "country_<value>"
pd.get_dummies(df, columns=['country'], prefix='country')

Unnamed: 0,country_australia,country_germany,country_korea,country_russia
0,0,0,0,1
1,0,1,0,0
2,1,0,0,0
3,0,0,1,0
4,0,1,0,0


In [22]:
# One-hot encoding vs. dummy variables
# By default get_dummies() does not do dummy encoding but one-hot encoding.
#
# To produce an actual dummy encoding from your data, pass drop_first=True
# (note that 'australia' is then missing from the columns).

import pandas as pd

# same example data as in the one-hot encoding cell above
df = pd.DataFrame(
    ['russia', 'germany', 'australia', 'korea', 'germany'],
    columns=['country'],
)
df

Unnamed: 0,country
0,russia
1,germany
2,australia
3,korea
4,germany


In [23]:
# Dummy encoding: same indicator columns as one-hot, minus the first category
pd.get_dummies(df, columns=['country'], prefix='country', drop_first=True)

Unnamed: 0,country_germany,country_korea,country_russia
0,0,0,1
1,1,0,0
2,0,0,0
3,0,1,0
4,1,0,0


In [24]:
# LabelEncoder: replace each category name with an integer code

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# One row per bridge type, in a single 'Bridge_Types' column
bridge_types = ('Arch','Beam','Truss','Cantilever','Tied Arch','Suspension','Cable')
bridge_df = pd.DataFrame({'Bridge_Types': list(bridge_types)})

labelencoder = LabelEncoder()

# Fit the encoder on the type names and keep the resulting codes in a second column
bridge_codes = labelencoder.fit_transform(bridge_df['Bridge_Types'])
bridge_df = bridge_df.assign(Bridge_Types_Cat=bridge_codes)
bridge_df

Unnamed: 0,Bridge_Types,Bridge_Types_Cat
0,Arch,0
1,Beam,1
2,Truss,6
3,Cantilever,3
4,Tied Arch,5
5,Suspension,4
6,Cable,2


In [25]:
# One-Hot Encoder
# Label encoding is straightforward, but it has the disadvantage that algorithms can
# misinterpret the numeric codes as having some sort of hierarchy/order in them.
# This ordering issue is addressed by a common alternative called 'One-Hot Encoding':
# each category value is converted into a new column that holds 1 or 0
# (notation for true/false).

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# creating instance of one-hot-encoder
enc = OneHotEncoder(handle_unknown='ignore')

# Encode the Bridge_Types_Cat column (the label-encoded values of Bridge_Types).
# The encoder output is converted to a dense array with .toarray(); reusing
# bridge_df's index makes the row alignment in the join below explicit.
enc_df = pd.DataFrame(
    enc.fit_transform(bridge_df[['Bridge_Types_Cat']]).toarray(),
    index=bridge_df.index,
)

# Attach the indicator columns to bridge_df. Indicator columns left over from an
# earlier run of this cell are dropped first; without that, join() raises
# ValueError because the column names overlap and no suffix is given.
bridge_df = bridge_df.drop(columns=enc_df.columns, errors='ignore').join(enc_df)
bridge_df

Unnamed: 0,Bridge_Types,Bridge_Types_Cat,0,1,2,3,4,5,6
0,Arch,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Beam,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,Truss,6,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Cantilever,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,Tied Arch,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,Suspension,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,Cable,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [26]:
# Dummy-values approach with pandas:
# get_dummies() is more flexible because it can encode as many category columns as
# you like and lets you choose how the new columns are labelled through a prefix.
# Proper naming makes the rest of the analysis a little easier.

import pandas as pd
import numpy as np

# start again from a frame holding only the bridge type names
bridge_types = ('Arch','Beam','Truss','Cantilever','Tied Arch','Suspension','Cable')
bridge_df = pd.DataFrame({'Bridge_Types': list(bridge_types)})

# one binary indicator column per bridge type, each prefixed with "Type_is"
dum_df = pd.get_dummies(bridge_df, columns=["Bridge_Types"], prefix="Type_is")

# put the indicator columns next to the original column (rows share the same index)
bridge_df = pd.concat([bridge_df, dum_df], axis=1)
bridge_df

Unnamed: 0,Bridge_Types,Type_is_Arch,Type_is_Beam,Type_is_Cable,Type_is_Cantilever,Type_is_Suspension,Type_is_Tied Arch,Type_is_Truss
0,Arch,1,0,0,0,0,0,0
1,Beam,0,1,0,0,0,0,0
2,Truss,0,0,0,0,0,0,1
3,Cantilever,0,0,0,1,0,0,0
4,Tied Arch,0,0,0,0,0,1,0
5,Suspension,0,0,0,0,1,0,0
6,Cable,0,0,1,0,0,0,0
