# Dealing With Categorical Values

## Importing Libraries

In [9]:
import pandas as pd
import numpy as np

## Importing Dataset

In [10]:
# Read the salary CSV (path is relative to the working directory) into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="Salary_Dataset.csv")

In [11]:
# Preview the data: shows at most the first 40 rows.
dataset.head(n=40)

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,No
1,Canada,46205.0,1.3,Yes
2,Canada,37731.0,1.5,No
3,Canada,43525.0,2.0,No
4,USA,39891.0,2.2,No
5,Dubai,56642.0,2.9,No
6,Canada,60150.0,3.0,Yes
7,Australia,54445.0,3.2,No
8,Dubai,64445.0,3.2,Yes
9,Dubai,57189.0,3.7,No


# Let's Perform Encoding

## Applying One-Hot Encoding

<h3> First, do it with pandas </h3>

In [13]:
# One-hot encode 'country': one indicator column per distinct country value.
# dtype=int yields 0/1 integers directly instead of boolean indicators.
country_dummy = pd.get_dummies(dataset['country'], dtype=int)
country_dummy

Unnamed: 0,Australia,Canada,Dubai,USA
0,0,0,1,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,0,0,1
5,0,0,1,0
6,0,1,0,0
7,1,0,0,0
8,0,0,1,0
9,0,0,1,0


In [14]:
# Place the indicator columns to the right of the existing columns
# (column-wise concatenation; rows are matched on the index).
dataset = pd.concat(objs=[dataset, country_dummy], axis="columns")

In [15]:
# Put the one-hot country indicators first, followed by the remaining columns.
# Selecting the columns explicitly already leaves out the original 'country'
# column, so no separate in-place drop is needed; that also makes this cell
# safe to re-run (a second in-place drop of 'country' would raise KeyError).
indicator_columns = list(country_dummy.columns)
dataset = dataset[indicator_columns + ['YearsExperience', 'Salary', 'Purchased']]

In [16]:
# Show the frame as it stands after the encoding and column reordering above.
dataset

Unnamed: 0,Australia,Canada,Dubai,USA,YearsExperience,Salary,Purchased
0,0,0,1,0,1.1,39343.0,No
1,0,1,0,0,1.3,46205.0,Yes
2,0,1,0,0,1.5,37731.0,No
3,0,1,0,0,2.0,43525.0,No
4,0,0,0,1,2.2,39891.0,No
5,0,0,1,0,2.9,56642.0,No
6,0,1,0,0,3.0,60150.0,Yes
7,1,0,0,0,3.2,54445.0,No
8,0,0,1,0,3.2,64445.0,Yes
9,0,0,1,0,3.7,57189.0,No


<h3> Let's do it with Scikit-Learn </h3>

Also, restart your kernel if you are using the same notebook.


In [25]:
# Reload the raw CSV so this section starts again from the original columns.
dataset = pd.read_csv(filepath_or_buffer="Salary_Dataset.csv")

In [26]:
# Label-encode the 'Purchased' column first: it has only two unique values,
# 'Yes' and 'No', so each one is simply replaced by an integer code.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(dataset['Purchased'])
dataset['Purchased'] = le.transform(dataset['Purchased'])

In [27]:
# One-hot encode the 'country' column and pass all other columns through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    # Select the column by name rather than by position so the encoding does
    # not depend on 'country' being the first column of the file.
    transformers=[('encoder', OneHotEncoder(), ['country'])],
    remainder='passthrough',
    # Always produce a dense array. With the default threshold (0.3) the
    # stacked result turns into a scipy sparse matrix once the one-hot block
    # has many categories, and pd.DataFrame(...) does not handle that like a
    # regular array.
    sparse_threshold=0,
)
df = pd.DataFrame(ct.fit_transform(dataset))

In [28]:
# Name the columns of the encoded frame. The one-hot columns come first, one per
# category in the order stored by the fitted encoder, followed by the
# passed-through columns in their original order. Deriving the names from the
# fitted transformer and the dataset (instead of typing them by hand) prevents
# silently mislabeled columns if the categories or the column order change.
country_names = list(ct.named_transformers_['encoder'].categories_[0])
passthrough_names = [name for name in dataset.columns if name != 'country']
df.columns = country_names + passthrough_names
df.head(10)

Unnamed: 0,Australia,Canada,Dubai,USA,Salary,YearsExperience,Purchased
0,0.0,0.0,1.0,0.0,39343.0,1.1,0.0
1,0.0,1.0,0.0,0.0,46205.0,1.3,1.0
2,0.0,1.0,0.0,0.0,37731.0,1.5,0.0
3,0.0,1.0,0.0,0.0,43525.0,2.0,0.0
4,0.0,0.0,0.0,1.0,39891.0,2.2,0.0
5,0.0,0.0,1.0,0.0,56642.0,2.9,0.0
6,0.0,1.0,0.0,0.0,60150.0,3.0,1.0
7,1.0,0.0,0.0,0.0,54445.0,3.2,0.0
8,0.0,0.0,1.0,0.0,64445.0,3.2,1.0
9,0.0,0.0,1.0,0.0,57189.0,3.7,0.0


In [29]:
# The sorted distinct values of 'country' give the names of the one-hot
# columns, in the order in which they appear in the encoded output.
country_values = dataset['country'].unique()
print(sorted(country_values))
# The names of all columns of the dataset are available as dataset.columns.


['Australia', 'Canada', 'Dubai', 'USA']
