In [64]:
import pandas as pd
import numpy as np

In [65]:
# Load the used-car dataset. Forward slashes are used in the relative path because
# they are accepted on Windows, Linux and macOS alike, whereas a backslash-separated
# path only resolves on Windows.
df = pd.read_csv('../Datasets/Cars.csv')
# Peek at five random rows (no seed is set, so the rows differ between runs).
df.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
1933,Maruti,132000,Petrol,Second Owner,135000
4814,Honda,50000,Diesel,Second Owner,1100000
3842,Toyota,25538,Petrol,First Owner,625000
7742,Hyundai,20000,Petrol,First Owner,275000
1144,Maruti,5621,Petrol,First Owner,650000


`brand` is a nominal categorical column.

In [66]:
# Show the distinct categories, followed by how many there are, for each
# column that is one-hot encoded below.
for column in ('fuel', 'owner'):
    print(df[column].unique())
    print(df[column].nunique())

['Diesel' 'Petrol' 'LPG' 'CNG']
4
['First Owner' 'Second Owner' 'Third Owner' 'Fourth & Above Owner'
 'Test Drive Car']
5


In [67]:
# Frequency of every category in the two columns to be encoded.
for column in ('fuel', 'owner'):
    print(df[column].value_counts())

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64
owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64


# 1. OneHotEncoding using PANDAS

- The function `pd.get_dummies()` creates a separate binary column for each unique value in the columns 'fuel' and 'owner'.
- The original categorical columns are replaced with these binary-encoded columns

**LIMITATIONS OF pd.get_dummies() FROM PANDAS** <br>
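- it transforms the entire dataset in one go and does not remember the categories it has seen, so a train set and a test set encoded separately can end up with different columns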
- it creates as many columns as there are categories even for sparse categories
- it cannot be integrated with Sklearn's PIPELINE

In [68]:
# One indicator column per category of 'fuel' and 'owner'; the two original
# columns are replaced by their indicator columns in the result.
dummy_columns = ['fuel', 'owner']
pd.get_dummies(df, columns=dummy_columns)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


# 2. K-1 OneHotEncoding

In [69]:
# Same encoding as before, but drop_first=True omits the first category of each
# encoded column, leaving k-1 indicator columns for a column with k categories.
dummy_columns = ['fuel', 'owner']
pd.get_dummies(df, columns=dummy_columns, drop_first=True)


Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


# 3. OneHotEncoding using SKLEARN

In [70]:
from sklearn.model_selection import train_test_split

# Select the features and the target by name instead of by position, so the
# split does not silently pick up the wrong columns if the CSV layout changes.
feature_columns = ['brand', 'km_driven', 'fuel', 'owner']
target_column = 'selling_price'

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df[target_column],
    test_size=0.2,
    random_state=0,
)

In [71]:
# Five random rows of the training features (unseeded, so they vary per run).
X_train.sample(n=5)

Unnamed: 0,brand,km_driven,fuel,owner
4693,Mahindra,90000,Diesel,Second Owner
973,Hyundai,120000,Diesel,First Owner
6956,Hyundai,25000,Petrol,First Owner
7115,Tata,120000,Diesel,Second Owner
1105,Toyota,79328,Diesel,Second Owner


- Applying OHE to fuel and owner
- We need to extract fuel and owner, apply OHE to them, and then combine the encoded columns with the remaining columns (brand and km_driven) to get the final encoded X_train

In [72]:
from sklearn.preprocessing import OneHotEncoder

# drop='first' leaves out the first category of every encoded feature
# (k-1 encoding).
ohe = OneHotEncoder(drop="first")

In [73]:
# Learn the categories from the training split only, then apply that same
# fitted encoding to both splits. The encoder output is converted to a dense
# NumPy array with .toarray().
ohe_columns = ['fuel', 'owner']
ohe.fit(X_train[ohe_columns])
X_train_encoded = ohe.transform(X_train[ohe_columns]).toarray()
X_test_encoded = ohe.transform(X_test[ohe_columns]).toarray()

In [74]:
# (rows, encoded columns) of the training split. With drop='first', the 4 fuel
# categories yield 3 columns and the 5 owner categories yield 4, i.e. 7 in total.
X_train_encoded.shape

(6502, 7)

In [75]:
# The columns that were not one-hot encoded, as a NumPy array.
X_train.loc[:, ['brand', 'km_driven']].values

array([['Hyundai', 60000],
       ['Tata', 150000],
       ['Hyundai', 110000],
       ...,
       ['Hyundai', 90000],
       ['Volkswagen', 90000],
       ['Hyundai', 110000]], dtype=object)

In [76]:
# Put the untouched columns and the one-hot encoded columns side by side.
passthrough_values = X_train[['brand', 'km_driven']].values
np.hstack([passthrough_values, X_train_encoded])

array([['Hyundai', 60000, 0.0, ..., 0.0, 0.0, 0.0],
       ['Tata', 150000, 1.0, ..., 0.0, 0.0, 1.0],
       ['Hyundai', 110000, 1.0, ..., 1.0, 0.0, 0.0],
       ...,
       ['Hyundai', 90000, 0.0, ..., 1.0, 0.0, 0.0],
       ['Volkswagen', 90000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Hyundai', 110000, 0.0, ..., 0.0, 0.0, 0.0]], dtype=object)

# 4. OneHotEncoding with TOP CATEGORIES

In [78]:
# Number of rows per brand, most frequent brand first.
brand_column = df['brand']
counts = brand_column.value_counts()
print(counts)

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64


In [81]:
# Total number of distinct brands in the dataset.
n_brands = df['brand'].nunique()
print(n_brands)

# Frequency cut-off used below to decide which brands count as rare.
threshold = 100

32


In [84]:
# Brands whose frequency does not exceed the threshold.
is_rare = counts <= threshold
repl = counts[is_rare].index
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [86]:
# Collapse every rare brand into the single label 'others', then one-hot
# encode the resulting brand column.
brand_grouped = df['brand'].replace(repl, 'others')
pd.get_dummies(brand_grouped)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,others
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False
