# Dealing With Categorical Values

## Importing Libraries

In [116]:
import pandas as pd
import numpy as np

## Importing Dataset

In [117]:
# Load the salary dataset from the CSV file next to this notebook
df = pd.read_csv(filepath_or_buffer="Salary_Dataset.csv")

In [118]:
# Preview up to the first 40 rows of the raw data
df.head(n=40)

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,No
1,Canada,46205.0,1.3,Yes
2,Canada,37731.0,1.5,No
3,Canada,43525.0,2.0,No
4,USA,39891.0,2.2,No
5,Dubai,56642.0,2.9,No
6,Canada,60150.0,3.0,Yes
7,Australia,54445.0,3.2,No
8,Dubai,64445.0,3.2,Yes
9,Dubai,57189.0,3.7,No


# Let's Perform Encoding

## Applying One-Hot Encoding

<h3> First Do it with pandas </h3>

In [119]:
# One-hot encode the 'country' column with pandas:
# get_dummies builds one indicator column per distinct country value.
country_dummy = pd.get_dummies(data=df['country'])
country_dummy


Unnamed: 0,Australia,Canada,Dubai,USA
0,False,False,True,False
1,False,True,False,False
2,False,True,False,False
3,False,True,False,False
4,False,False,False,True
5,False,False,True,False
6,False,True,False,False
7,True,False,False,False
8,False,False,True,False
9,False,False,True,False


In [120]:
# Append the dummy columns to the right of the original frame (column-wise concat).
# Note: df is overwritten here, so re-running this cell appends the dummy columns again.
df = pd.concat([df, country_dummy], axis="columns")
df

Unnamed: 0,country,Salary,YearsExperience,Purchased,Australia,Canada,Dubai,USA
0,Dubai,39343.0,1.1,No,False,False,True,False
1,Canada,46205.0,1.3,Yes,False,True,False,False
2,Canada,37731.0,1.5,No,False,True,False,False
3,Canada,43525.0,2.0,No,False,True,False,False
4,USA,39891.0,2.2,No,False,False,False,True
5,Dubai,56642.0,2.9,No,False,False,True,False
6,Canada,60150.0,3.0,Yes,False,True,False,False
7,Australia,54445.0,3.2,No,True,False,False,False
8,Dubai,64445.0,3.2,Yes,False,False,True,False
9,Dubai,57189.0,3.7,No,False,False,True,False


In [121]:
# Remove the original 'country' column now that its dummy columns are in place.
# Reassigning (instead of inplace=True) leaves the frame displayed by the previous
# cell untouched, and columns= states the axis explicitly.
df = df.drop(columns=['country'])
df

Unnamed: 0,Salary,YearsExperience,Purchased,Australia,Canada,Dubai,USA
0,39343.0,1.1,No,False,False,True,False
1,46205.0,1.3,Yes,False,True,False,False
2,37731.0,1.5,No,False,True,False,False
3,43525.0,2.0,No,False,True,False,False
4,39891.0,2.2,No,False,False,False,True
5,56642.0,2.9,No,False,False,True,False
6,60150.0,3.0,Yes,False,True,False,False
7,54445.0,3.2,No,True,False,False,False
8,64445.0,3.2,Yes,False,False,True,False
9,57189.0,3.7,No,False,False,True,False


In [122]:
# Reorder the columns so the country indicator columns come first.
# .copy() makes the result an independent frame, so later column assignments
# (e.g. encoding 'Purchased') do not raise SettingWithCopyWarning.
df = df[['Australia', 'Canada', 'Dubai', 'USA','Salary','YearsExperience','Purchased']].copy()
df

Unnamed: 0,Australia,Canada,Dubai,USA,Salary,YearsExperience,Purchased
0,False,False,True,False,39343.0,1.1,No
1,False,True,False,False,46205.0,1.3,Yes
2,False,True,False,False,37731.0,1.5,No
3,False,True,False,False,43525.0,2.0,No
4,False,False,False,True,39891.0,2.2,No
5,False,False,True,False,56642.0,2.9,No
6,False,True,False,False,60150.0,3.0,Yes
7,True,False,False,False,54445.0,3.2,No
8,False,False,True,False,64445.0,3.2,Yes
9,False,False,True,False,57189.0,3.7,No


<h3> Let's do it with Scikit-Learn </h3>
Also restart your kernel if you are using the same notebook.


In [123]:
# Let's start with label encoding.
# 'Purchased' has only two unique values, so mapping them to integer codes is enough.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df['Purchased'])
df['Purchased'] = le.transform(df['Purchased'])
df


Unnamed: 0,Australia,Canada,Dubai,USA,Salary,YearsExperience,Purchased
0,False,False,True,False,39343.0,1.1,0
1,False,True,False,False,46205.0,1.3,1
2,False,True,False,False,37731.0,1.5,0
3,False,True,False,False,43525.0,2.0,0
4,False,False,False,True,39891.0,2.2,0
5,False,False,True,False,56642.0,2.9,0
6,False,True,False,False,60150.0,3.0,1
7,True,False,False,False,54445.0,3.2,0
8,False,False,True,False,64445.0,3.2,1
9,False,False,True,False,57189.0,3.7,0


In [124]:
# Reload the CSV so the scikit-learn steps below start from the original, unencoded columns
df = pd.read_csv(filepath_or_buffer="Salary_Dataset.csv")

In [125]:
# Label-encode 'Purchased' on the freshly reloaded frame.
# LabelEncoder was imported in the earlier label-encoding cell.
le = LabelEncoder()
df['Purchased'] = le.fit(df['Purchased']).transform(df['Purchased'])


In [126]:

# Let's perform one-hot encoding on the 'country' column with scikit-learn.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Select the column by name rather than by position 0, so this step fails loudly
# instead of encoding the wrong column if the frame's layout changes or the cell is re-run.
# sparse_threshold=0 always returns a dense array, which pd.DataFrame below expects.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['country'])],
    remainder='passthrough',
    sparse_threshold=0,
)

# The encoder's output columns come first, followed by the passthrough columns.
# The new frame has positional integer column labels; names are assigned afterwards.
df = pd.DataFrame(ct.fit_transform(df))
df

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,1.0,0.0,39343.0,1.1,0.0
1,0.0,1.0,0.0,0.0,46205.0,1.3,1.0
2,0.0,1.0,0.0,0.0,37731.0,1.5,0.0
3,0.0,1.0,0.0,0.0,43525.0,2.0,0.0
4,0.0,0.0,0.0,1.0,39891.0,2.2,0.0
5,0.0,0.0,1.0,0.0,56642.0,2.9,0.0
6,0.0,1.0,0.0,0.0,60150.0,3.0,1.0
7,1.0,0.0,0.0,0.0,54445.0,3.2,0.0
8,0.0,0.0,1.0,0.0,64445.0,3.2,1.0
9,0.0,0.0,1.0,0.0,57189.0,3.7,0.0


In [127]:
# Name the columns. The one-hot columns take their labels from the categories the
# fitted encoder actually learned (rather than a hand-typed list that could drift
# from the data), followed by the passthrough columns in their original order.
# A length mismatch makes pandas raise instead of silently mislabelling columns.
df.columns = [
    *map(str, ct.named_transformers_['encoder'].categories_[0]),
    'Salary', 'YearsExperience', 'Purchased',
]
df.head(10)

Unnamed: 0,Australia,Canada,Dubai,USA,Salary,YearsExperience,Purchased
0,0.0,0.0,1.0,0.0,39343.0,1.1,0.0
1,0.0,1.0,0.0,0.0,46205.0,1.3,1.0
2,0.0,1.0,0.0,0.0,37731.0,1.5,0.0
3,0.0,1.0,0.0,0.0,43525.0,2.0,0.0
4,0.0,0.0,0.0,1.0,39891.0,2.2,0.0
5,0.0,0.0,1.0,0.0,56642.0,2.9,0.0
6,0.0,1.0,0.0,0.0,60150.0,3.0,1.0
7,1.0,0.0,0.0,0.0,54445.0,3.2,0.0
8,0.0,0.0,1.0,0.0,64445.0,3.2,1.0
9,0.0,0.0,1.0,0.0,57189.0,3.7,0.0


In [129]:
# Inspect the column dtypes and non-null counts of the encoded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Australia        30 non-null     float64
 1   Canada           30 non-null     float64
 2   Dubai            30 non-null     float64
 3   USA              30 non-null     float64
 4   Salary           30 non-null     float64
 5   YearsExperience  30 non-null     float64
 6   Purchased        30 non-null     float64
dtypes: float64(7)
memory usage: 1.8 KB
