In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the car listings from the CSV file (path is relative to the notebook's working directory).
df = pd.read_csv('datasets/cars.csv')

In [3]:
# Keep an independent copy of the loaded frame (DataFrame.copy() is a deep copy by default).
# NOTE(review): `data` is not used in the cells visible here — confirm it is needed later.
data = df.copy()

In [4]:
# Preview the first three rows to see which columns are categorical.
df.head(3)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000


In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(8128, 5)

# One-hot encoding using pandas

In [6]:
# One-hot encode the two categorical columns: each distinct value of 'fuel' and
# 'owner' becomes its own indicator column, the remaining columns pass through.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


In [7]:
# Width check: in the saved output this is 3 untouched columns + 4 fuel
# indicators + 5 owner indicators = 12 columns.
pd.get_dummies(data=df, columns=['fuel', 'owner']).shape

(8128, 12)

# K-1 one-hot encoding using pandas (drop one indicator column per feature)

In [8]:
# Same encoding, but drop_first=True omits the first category of each feature
# (fuel_CNG and owner_First Owner in the saved output), leaving K-1 indicator
# columns per feature.
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


# One-hot encoding using scikit-learn

In [9]:
from sklearn.model_selection import train_test_split

# Split features (everything except 'selling_price') from the target and hold
# out a test set before any encoder is fitted. The default test_size (25%) is
# kept; random_state is pinned so the split, and every output derived from it,
# is identical on each Restart & Run All.
x_train,x_test,y_train,y_test = train_test_split(
    df.drop('selling_price',axis=1),
    df['selling_price'],
    random_state=42,
)

In [10]:
# First five training rows; the index shows the rows were shuffled by the split.
x_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
6214,Volkswagen,80000,Petrol,Second Owner
1034,Maruti,90000,Petrol,First Owner
620,Maruti,30000,Petrol,First Owner
291,Hyundai,35000,Petrol,First Owner
4817,Jaguar,45000,Diesel,First Owner


In [11]:
# First five held-out rows, same feature columns as the training frame.
x_test.head()

Unnamed: 0,brand,km_driven,fuel,owner
4702,Toyota,120000,Diesel,Second Owner
841,Maruti,70000,Petrol,Second Owner
1601,Renault,34000,Petrol,First Owner
5459,Hyundai,200000,Diesel,Second Owner
6246,Renault,15000,Petrol,First Owner


In [12]:
# (rows, feature columns) of the held-out split.
x_test.shape

(2032, 4)

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
# Learn the category lists from the training rows only, then apply the same
# fitted encoder to the test rows so both arrays share one column layout.
# handle_unknown='ignore' encodes a category that occurs only in the test split
# as all zeros for that feature instead of raising an error in transform().
ohe = OneHotEncoder(handle_unknown='ignore')

# The encoder returns a sparse matrix here, so .toarray() makes it dense.
x_train_encode = ohe.fit_transform(x_train[['fuel','owner']]).toarray()
x_test_encode = ohe.transform(x_test[['fuel','owner']]).toarray()

In [15]:
# Expect one column per learned category: 4 fuel + 5 owner = 9 in the saved output.
x_test_encode.shape

(2032, 9)

In [16]:
# The columns that were not encoded, as a NumPy array. Mixing strings ('brand')
# with integers ('km_driven') yields dtype=object, as the saved output shows.
x_train[['brand','km_driven']].values

array([['Volkswagen', 80000],
       ['Maruti', 90000],
       ['Maruti', 30000],
       ...,
       ['Hyundai', 23456],
       ['Honda', 127991],
       ['Maruti', 50000]], dtype=object)

In [17]:
# Place the untouched columns (brand, km_driven) and the one-hot block side by
# side; both inputs are 2-D, so this joins them column-wise.
x_train_final = np.concatenate(
    (x_train[['brand','km_driven']].values, x_train_encode),
    axis=1,
)

In [18]:
# Build the test matrix the same way as the training matrix: untouched columns
# first, one-hot columns after, joined column-wise.
x_test_final = np.concatenate(
    (x_test[['brand','km_driven']].values, x_test_encode),
    axis=1,
)

In [19]:
# 2 untouched columns + 9 one-hot columns = 11 in the saved output.
x_test_final.shape

(2032, 11)

In [20]:
# K-1 encoding with scikit-learn: drop='first' omits the first category of each feature.
# The result is converted with .toarray() because the encoder returns a sparse matrix;
# constructing it with sparse=False would return a dense array directly
# (NOTE(review): newer scikit-learn releases name this option sparse_output — confirm
# against the installed version).
# NOTE(review): this rebinds `ohe`, so the encoder fitted in the earlier cell is no longer reachable.
ohe = OneHotEncoder(drop='first')

x_train_encode2 = ohe.fit_transform(x_train[['fuel','owner']]).toarray()
x_test_encode2 = ohe.transform(x_test[['fuel','owner']]).toarray()

In [21]:
# One column fewer per feature than the full encoding: 3 fuel + 4 owner = 7 in the saved output.
x_train_encode2.shape

(6096, 7)

In [22]:
# The test encoding must have the same number of columns as the training encoding.
x_test_encode2.shape

(2032, 7)

In [23]:
# Training matrix for the K-1 variant: untouched columns followed by the
# reduced one-hot block, joined column-wise.
x_train_final2 = np.concatenate(
    (x_train[['brand','km_driven']].values, x_train_encode2),
    axis=1,
)

In [24]:
# 2 untouched columns + 7 one-hot columns = 9 in the saved output.
x_train_final2.shape

(6096, 9)

In [25]:
# Test matrix for the K-1 variant, assembled exactly like the training matrix.
x_test_final2 = np.concatenate(
    (x_test[['brand','km_driven']].values, x_test_encode2),
    axis=1,
)

In [26]:
# Column count should match x_train_final2 (9 in the saved output).
x_test_final2.shape

(2032, 9)

# OneHotEncoding with top categories

In [27]:
# Listings per brand, most frequent first; the long tail of rare brands is what
# makes a plain one-hot encoding of this column very wide.
df['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [28]:
# Keep the per-brand frequencies so rare brands can be selected by count.
counts = df['brand'].value_counts()

In [29]:
# Brands with at most 100 listings are treated as rare; collect their names so
# they can be merged into a single bucket before encoding.
repl = counts.loc[counts.le(100)].index

In [30]:
# Show the brand names selected as rare.
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object')

In [32]:
# Collapse every brand listed in `repl` into one 'uncommon' label, then one-hot
# encode the reduced set of labels.
brand = pd.get_dummies(
    df['brand'].replace(to_replace=repl, value='uncommon')
)

In [34]:
# 12 brands above the threshold + 1 'uncommon' column = 13 in the saved output.
brand.shape

(8128, 13)

In [36]:
# Number of distinct brands before grouping (32 in the saved output), for comparison with the 13 columns above.
len(counts.index)

32

In [39]:
# Boolean-mask filter: keep only the rows whose fuel type is 'Petrol'.
df[df['fuel'] == 'Petrol']

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
2,Honda,140000,Petrol,Third Owner,158000
4,Maruti,120000,Petrol,First Owner,130000
5,Hyundai,45000,Petrol,First Owner,440000
7,Maruti,5000,Petrol,Second Owner,45000
11,Maruti,100000,Petrol,Second Owner,92000
...,...,...,...,...,...
8118,Hyundai,25000,Petrol,First Owner,380000
8119,Maruti,80000,Petrol,First Owner,360000
8120,Hyundai,191000,Petrol,First Owner,120000
8121,Maruti,50000,Petrol,Second Owner,260000


In [40]:
# Re-evaluates the same selection that was stored in `repl`: brands with at most 100 listings.
counts[counts <= 100].index

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object')