In [2]:
import pandas as pd

In [43]:
import numpy as np

#### Ordinal encoding

Ordinal categorical data has categories with a natural order. For example, for education level we know that PG > UG > high school, so we encode the categories as integers that preserve this order.

In [3]:
# Load the customer dataset used for the ordinal / label encoding examples.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
customer_csv_path = r"C:\Users\SIDDHARTH\OneDrive\Documents\JOB PROFILES\Data Science\Machine Learning\data files\customer.csv"
df = pd.read_csv(customer_csv_path)

In [4]:
# Peek at five randomly chosen rows to get a feel for the columns.
df.sample(n=5)

Unnamed: 0,age,gender,review,education,purchased
49,25,Female,Good,UG,No
7,60,Female,Poor,School,Yes
17,22,Female,Poor,UG,Yes
48,39,Female,Good,UG,Yes
14,15,Male,Poor,PG,Yes


`gender` is nominal categorical data (defined in a later section), so for now we only consider the ordinal columns, i.e. `review` and `education`.

In [9]:
from sklearn.model_selection import train_test_split

# Features: the two ordinal columns; target: the `purchased` label.
# Columns are selected by name rather than by position so the split does not
# silently pick the wrong columns if the CSV layout changes.
# A fixed random_state makes the split (and every output below) reproducible
# under Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(
    df[['review', 'education']],
    df['purchased'],
    test_size=0.2,
    random_state=42,
)

In [11]:
# Show a single random training row to confirm only the ordinal columns were kept.
x_train.sample(n=1)

Unnamed: 0,review,education
48,Good,UG


In [12]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit low-to-high ordering for each ordinal column, listed in the same
# column order as x_train (review first, then education).
review_order = ['Poor', 'Average', 'Good']
education_order = ['School', 'UG', 'PG']
oe = OrdinalEncoder(categories=[review_order, education_order])

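The `categories` argument tells the encoder the hierarchy (order) of the categories in each column. If you don't pass this list, the encoder does not pick an order at random: it infers the categories from the training data and uses their sorted (alphabetical) order, which generally will not match the intended hierarchy.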

In [13]:
# Fit the ordinal encoder on the training split only.
oe.fit(X=x_train)

In [14]:
# Store the encoded arrays under new names instead of overwriting x_train /
# x_test. Overwriting the inputs made this cell non-idempotent: running it a
# second time fed already-encoded numbers back into the encoder.
x_train_enc = oe.transform(x_train)
x_test_enc = oe.transform(x_test)

#### Label encoder

This is explicitly meant for encoding the output (target) categorical data and should never be used for input features.

In [17]:
from sklearn.preprocessing import LabelEncoder

# Encoder for the target labels.
le = LabelEncoder()

In [18]:
# Learn the set of target classes from the training labels.
le.fit(y=y_train)

In [19]:
# Inspect the learned classes; a label's position in this array is its encoded integer.
le.classes_

array(['No', 'Yes'], dtype=object)

In [20]:
# Store the encoded targets under new names instead of overwriting y_train /
# y_test. Overwriting them made this cell non-idempotent: on a second run the
# already-encoded integers are not among the fitted labels ('No', 'Yes'), so
# the transform fails.
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

#### Nominal encoding / one-hot encoding

Nominal categorical data is data where there is no hierarchy among the categories, for example male/female.

In [21]:
# Load the used-cars dataset for the one-hot encoding examples.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
cars_csv_path = r"C:\Users\SIDDHARTH\OneDrive\Documents\JOB PROFILES\Data Science\Machine Learning\data files\cars.csv"
df2 = pd.read_csv(cars_csv_path)

In [23]:
# Preview the first five rows of the cars data.
df2.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [24]:
# Number of distinct brands: a high count means plain one-hot encoding would add many columns.
df2['brand'].nunique()

32

In [25]:
# Rows per brand, to see how many brands are rare.
df2['brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [26]:
# Number of distinct fuel types.
df2['fuel'].nunique()

4

#### One hot encoding using Pandas

In [28]:
# One-hot encode the nominal columns with pandas, keeping one indicator per category.
nominal_cols = ['fuel', 'owner']
pd.get_dummies(df2, columns=nominal_cols, dtype=int)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


Solving the multicollinearity problem, i.e. removing one column from each one-hot encoded feature so that the remaining columns are independent of each other. In ML models (especially linear models) the columns/dimensions should be independent of one another.

#### K-1 ONEHOTENCODING

In [29]:
# K-1 one-hot encoding: drop the first indicator of each feature to avoid redundant columns.
nominal_cols = ['fuel', 'owner']
pd.get_dummies(df2, columns=nominal_cols, drop_first=True, dtype=int)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


#### ONEHOT ENCODING using sklearn (recommended)

In [30]:
from sklearn.model_selection import train_test_split

# Features: every column except the target; target: selling_price.
# Selecting by name instead of by position keeps the split correct if the
# column layout changes.
# A fixed random_state makes the split reproducible under Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(
    df2.drop(columns=['selling_price']),
    df2['selling_price'],
    test_size=0.2,
    random_state=42,
)

In [38]:
from sklearn.preprocessing import OneHotEncoder

# K-1 one-hot encoder: the first category of each feature is dropped.
ohe = OneHotEncoder(drop="first")

In [40]:
# Fit the encoder on the training split's nominal columns, then convert the
# encoder's output to a regular dense array.
train_nominal = x_train[['fuel', 'owner']]
x_train_new = ohe.fit_transform(train_nominal).toarray()

In [41]:
# Encode the test split with the already-fitted encoder (transform only, no refit).
test_nominal = x_test[['fuel', 'owner']]
x_test_new = ohe.transform(test_nominal).toarray()

In [44]:
# Place the un-encoded columns next to the one-hot columns, column-wise.
train_untouched = x_train[['brand', 'km_driven']].values
np.hstack((train_untouched, x_train_new))

array([['Mahindra', 110000, 1.0, ..., 1.0, 0.0, 0.0],
       ['Toyota', 129000, 1.0, ..., 0.0, 0.0, 1.0],
       ['Tata', 50000, 1.0, ..., 0.0, 0.0, 0.0],
       ...,
       ['Hyundai', 100000, 0.0, ..., 0.0, 0.0, 1.0],
       ['Mahindra', 120000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Hyundai', 110000, 0.0, ..., 1.0, 0.0, 0.0]], dtype=object)

#### One-hot encoding with top categories

i.e. `brand` has a lot of categories, but many of them have very few entries. So we keep separate categories for the major brands and group the rare ones into a single new "others" category (named `uncommon` in the code below).

In [45]:
# Brands with at most this many rows are grouped into a single category below.
threshold = 100

In [47]:
# Number of rows per brand.
counts = df2['brand'].value_counts()

In [48]:
# Names of the brands whose row count does not exceed the threshold.
is_rare = counts <= threshold
repl = counts.loc[is_rare].index

In [51]:
# Collapse the rare brands into one 'uncommon' label, then one-hot encode the result.
brand_grouped = df2['brand'].replace(repl, 'uncommon')
pd.get_dummies(brand_grouped, dtype=int)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0


#### Column transformer

There is a problem with encoding this way: we have to handle each column separately and then join the results back into the dataset ourselves, which is very time-consuming. This is where the column transformer comes in handy.

In [55]:
# Load the toy COVID dataset for the ColumnTransformer example.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
covid_csv_path = r"C:\Users\SIDDHARTH\OneDrive\Documents\JOB PROFILES\Data Science\Machine Learning\data files\covid_toy.csv"
df3 = pd.read_csv(covid_csv_path)

In [56]:
from sklearn.compose import ColumnTransformer

In [57]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [58]:
# Preview the first five rows of the COVID toy data.
df3.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [59]:
from sklearn.model_selection import train_test_split

# Features: every column except the target; target: has_covid.
# A fixed random_state makes the split (and the transformed arrays shown below)
# reproducible under Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(
    df3.drop(columns=['has_covid']),
    df3['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [66]:
# Apply a different preprocessing step to each group of columns in one pass.
# Each entry is (step name, transformer, columns the transformer applies to).
transformer = ColumnTransformer(transformers = [
    # Fill in missing fever readings (SimpleImputer's default strategy is the column mean).
    ('tnf1',SimpleImputer(),['fever']),
    # cough is ordinal: Mild < Strong.
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]), ['cough']),
    # gender and city are nominal: one-hot encode, dropping the first level of each.
    # `sparse_output` replaces the old `sparse` argument, which was deprecated in
    # scikit-learn 1.2 and removed in 1.4.
    ('tnf3',OneHotEncoder(sparse_output=False, drop='first'),['gender','city'])
], remainder='passthrough')  # columns not listed above (here: age) are kept unchanged

The basic structure of the transformer is as follows: first, a list of tuples is passed to the `ColumnTransformer`, where each tuple has the form ('name of transformer', encoder/transformation to apply, column names).
This list is followed by the `remainder` argument, which says whether the columns that are not being transformed should be dropped from the dataset or kept as they are (`passthrough`).

In [67]:
# Fit every step on the training split and return the transformed training features.
transformer.fit_transform(x_train)



array([[100.        ,   1.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  13.        ],
       [104.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  12.        ],
       [100.        ,   0.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  27.        ],
       [103.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  16.        ],
       [101.        ,   0.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  15.        ],
       [101.        ,   1.        ,   0.        ,   1.        ,
          0.        ,   0.        ,  34.        ],
       [ 99.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   1.        ,  60.        ],
       [103.        ,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  83.        ],
       [101.        ,   1.        ,   1.        ,   0.        ,
          0.    

In [68]:
# Apply the already-fitted steps to the test split (no refitting on test data).
transformer.transform(x_test)

array([[101.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   1.        ,  81.        ],
       [101.        ,   1.        ,   0.        ,   1.        ,
          0.        ,   0.        ,  68.        ],
       [104.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   1.        ,  42.        ],
       [100.        ,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  55.        ],
       [102.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   0.        ,  64.        ],
       [103.        ,   0.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  48.        ],
       [100.91891892,   0.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  38.        ],
       [ 98.        ,   1.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  71.        ],
       [100.91891892,   0.        ,   1.        ,   0.        ,
          1.    