### Handling Categorical Data

- For Nominal Data - One hot Encoder

- For Ordinal Data - Ordinal Encoder

- Label Encoding - Use only for the output (target) values, not for input features.



In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split 

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the customer dataset and preview the first rows.
# NOTE: this is an absolute, machine-specific Windows path; point it at your
# own copy of customer.csv to run the notebook on another machine.
df = pd.read_csv(
    "D:\\Sandesh\\100-days-of-machine-learning\\day26-ordinal-encoding\\customer.csv"
)
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [3]:
# Keep only the categorical columns used in this demo (age and gender are dropped).
# The columns are selected by name rather than by position so the cell is
# idempotent: with a positional slice, re-running the cell would slice the
# already-reduced frame again and leave only `purchased`.
df = df[['review', 'education', 'purchased']]
df.head()


Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [4]:
# Show the dtype of every remaining column before encoding.
df.dtypes

review       object
education    object
purchased    object
dtype: object

In [5]:
from sklearn.model_selection import train_test_split

# The first two columns (review, education) are the features;
# the last column (purchased) is the target.
X = df.iloc[:, :2]
Y = df.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)

### Ordinal Data - Ordinal Encoding 

In [6]:
from sklearn.preprocessing import OrdinalEncoder

# List each column's categories from lowest to highest so the integer codes
# follow the intended rank instead of alphabetical order.
review_order = ['Poor', 'Average', 'Good']
education_order = ['School', 'UG', 'PG']
oe = OrdinalEncoder(categories=[review_order, education_order])

# Fit on the training split only, then apply the same mapping to both splits.
X_train = oe.fit_transform(X_train)
X_test = oe.transform(X_test)

In [7]:
# Show the category order the fitted encoder uses for each column.
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

### Label Encoding 

In [8]:
from sklearn.preprocessing import LabelEncoder

# Label encoding is applied to the target column only.
le = LabelEncoder()

# Learn the class-to-integer mapping from the training labels.
le.fit(y_train)

# Store the encoded labels in y_train / y_test -- not in X_train / X_test,
# which hold the ordinal-encoded features and must not be overwritten.
y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Show the classes in the order of their integer codes.
le.classes_

array(['No', 'Yes'], dtype=object)

### One-Hot Encoding 

In [9]:
# Load the used-cars dataset for the one-hot encoding examples.
# NOTE: `df` now refers to the cars data and no longer to the customer data.
# The path is absolute and machine-specific; adjust it to your copy of cars.csv.
df = pd.read_csv(
    "D:\\Sandesh\\100-days-of-machine-learning\\day27-one-hot-encoding\\cars.csv"
)
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [10]:
# Count how many rows fall into each fuel category.
df['fuel'].value_counts()

Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64

In [11]:
# Number of distinct brands, i.e. how many dummy columns plain one-hot encoding would create.
df['brand'].nunique()

32

### 1.OneHotEncoding Using Pandas 

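* The problem with pandas one-hot encoding (`pd.get_dummies`) is that it does not remember the categories and column order it produced, so the same columns are not guaranteed when it is applied again to new data.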


In [12]:
# One-hot encode `fuel` and `owner`; the remaining columns are passed through unchanged.
pd.get_dummies(df,columns=['fuel','owner'])


Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


### 2. K-1 OneHotEncoding 

In [13]:
# K-1 encoding: drop the first dummy column of each encoded feature
# (here fuel_CNG and owner_First Owner), since it is implied by the others.

pd.get_dummies(df,columns=['fuel','owner'],drop_first=True)


Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


### 3.OneHotEncoding using Sklearn 

In [14]:

# pandas' get_dummies keeps no record of the categories it saw, so use
# scikit-learn's encoder instead: it is fitted once and reused on new data.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# The first four columns (brand, km_driven, fuel, owner) are the features;
# the last column (selling_price) is the target.
X = df.iloc[:, :4]
Y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)


In [15]:
# drop='first' removes one dummy column per feature (k-1 encoding).
ohe = OneHotEncoder(drop='first')

# By default the encoder returns a sparse matrix, so .toarray() is called to
# get a dense NumPy array. Alternatively, dense output can be requested from
# the constructor (sparse=False, named sparse_output in newer scikit-learn
# releases), in which case no conversion is needed.
train_categorical = X_train[['fuel', 'owner']]
test_categorical = X_test[['fuel', 'owner']]

X_train_new = ohe.fit_transform(train_categorical).toarray()
X_test_new = ohe.transform(test_categorical).toarray()


In [16]:
# Ask the encoder for a dense array directly instead of calling .toarray().
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2
# and the old spelling was removed in 1.4, so try the new name first and fall
# back to the old one on older installations.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False)

# Fit on the training split only, then reuse the learned categories on the test split.
X_train_new = ohe.fit_transform(X_train[['fuel', 'owner']])
X_test_new = ohe.transform(X_test[['fuel', 'owner']])


In [17]:

# To put brand and km_driven back alongside the encoded columns, stack the
# two arrays horizontally with np.hstack.

np.hstack((X_train[['brand','km_driven']].values,X_train_new))


array([['Honda', 100000, 1.0, ..., 1.0, 0.0, 0.0],
       ['Maruti', 120000, 1.0, ..., 1.0, 0.0, 0.0],
       ['Renault', 100000, 1.0, ..., 1.0, 0.0, 0.0],
       ...,
       ['Hyundai', 70000, 0.0, ..., 0.0, 0.0, 0.0],
       ['Maruti', 110000, 1.0, ..., 1.0, 0.0, 0.0],
       ['Maruti', 65755, 0.0, ..., 0.0, 0.0, 0.0]], dtype=object)

### 4.OneHotEncoding with many categories 

In [28]:
# Number of distinct brands: too many to one-hot encode each one individually.
df['brand'].nunique()


32

In [29]:
# Frequency of every brand in the dataset.
counts = df['brand'].value_counts()

# Brands seen at most this many times are treated as rare.
threshold = 100

# Labels of the rare brands, to be merged into a single category.
is_rare = counts.le(threshold)
rpl = counts.loc[is_rare].index
rpl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object')

In [25]:
# Replace every rare brand listed in `rpl` with the single label "uncommon",
# then one-hot encode the reduced set of brand categories.
pd.get_dummies(df['brand'].replace(rpl,"uncommon"))

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0


### THE END 