In [1]:
#encoding and its types
#encoding is the process of converting categorical or textual data into a numerical format so that a computer can understand and process it.
#Types of Encoding:

In [18]:
# 1.Label Encoding:
# It is a technique in Machine Learning used to convert categorical values into numerical values by assigning each unique category a unique integer.
#Usage: It is designed to work on one column at a time

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

dataset={
        "id":[1,2,3,4,5],
        "color":["red","yellow","orange","blue","green"],
        "size":["medium","small","large","extra_large","extra_small"],
        "price":[150,200,250,300,350]
}
df=pd.DataFrame(dataset)

In [20]:
print("before encoding:")
print(df)

before encoding:
   id   color         size  price
0   1     red       medium    150
1   2  yellow        small    200
2   3  orange        large    250
3   4    blue  extra_large    300
4   5   green  extra_small    350


In [22]:
# Create LabelEncoder objects
color_encoder=LabelEncoder()
size_encoder=LabelEncoder()

#Apply label encoding
df["color"]=color_encoder.fit_transform(df["color"])
df["size"]=size_encoder.fit_transform(df["size"])
print("after label encoding:")
print(df)

after label encoding:
   id  color  size  price
0   1      3     3    150
1   2      4     4    200
2   3      2     2    250
3   4      0     0    300
4   5      1     1    350


In [40]:
# 2.Ordinal Encoding
# OrdinalEncoder is an encoding technique in ML that converts multiple categorical columns into unique integers in one go, 
# and is especially useful when the categories have an inherent order

In [42]:
import numpy as np
import pandas as pd
df1=pd.read_csv("covid_toy.csv")

In [92]:
print("before encoding:")
df1.head(5)

before encoding:


Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,0.0,103.0,0.0,0.0,0.0
1,27,0.0,100.0,0.0,1.0,1.0
2,42,0.0,101.0,0.0,1.0,0.0
3,31,1.0,98.0,0.0,0.0,0.0
4,65,1.0,101.0,0.0,2.0,0.0


In [50]:
df1=df1.dropna()

In [52]:
from sklearn.preprocessing import OrdinalEncoder

In [76]:
oe=OrdinalEncoder(categories=[["Male","Female"],
                            ["Mild","Strong"],
                              ["Kolkata","Delhi","Mumbai","Bangalore"],
                              ["No","Yes"]
                             ])

#categories passed should be in sequential format

In [78]:
categorical_columns=["gender","cough","city","has_covid"]

In [84]:
df1[categorical_columns]=oe.fit_transform(df1[categorical_columns])

In [90]:
print("after encoding:")
print(df1)

after encoding:
    age  gender  fever  cough  city  has_covid
0    60     0.0  103.0    0.0   0.0        0.0
1    27     0.0  100.0    0.0   1.0        1.0
2    42     0.0  101.0    0.0   1.0        0.0
3    31     1.0   98.0    0.0   0.0        0.0
4    65     1.0  101.0    0.0   2.0        0.0
..  ...     ...    ...    ...   ...        ...
95   12     1.0  104.0    0.0   3.0        0.0
96   51     1.0  101.0    1.0   0.0        1.0
97   20     1.0  101.0    0.0   3.0        0.0
98    5     1.0   98.0    1.0   2.0        0.0
99   10     1.0   98.0    1.0   0.0        1.0

[90 rows x 6 columns]


In [94]:
# 3.OneHot Encoding:
# One-Hot Encoding is a method to convert categorical data into a binary (0/1) format where:
# Each unique category in a column becomes its own new column.

#Due to creation of so many columns and training datasets is large and overfittibg happens here so, for that purpose we use drop="first" to reduce,
# 1 column from each category


In [126]:
import numpy as np
import pandas as pd

In [128]:
dfhot=pd.read_csv("covid_toy.csv")

In [130]:
print("before encoding:")
print(dfhot.head(5))

before encoding:
   age  gender  fever cough     city has_covid
0   60    Male  103.0  Mild  Kolkata        No
1   27    Male  100.0  Mild    Delhi       Yes
2   42    Male  101.0  Mild    Delhi        No
3   31  Female   98.0  Mild  Kolkata        No
4   65  Female  101.0  Mild   Mumbai        No


In [132]:
from sklearn.preprocessing import OneHotEncoder

In [134]:
ohe=OneHotEncoder(drop="first",sparse_output=False,dtype=np.int32)

In [140]:
ohe.fit_transform(dfhot[["gender","cough","city","has_covid"]])
#if drop="first" was not mentioned, the result would have been (90,10) but now its (90,6) reducing chances of overfitting

array([[1, 0, 0, 1, 0, 0],
       [1, 0, 1, 0, 0, 1],
       [1, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1],
       [1, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 1],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 1],
       [0, 1, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0],
       [1, 1, 0, 1, 0, 1],
       [0, 0, 0, 1, 0, 1],
       [0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 1],
       [1, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 1],
       [0, 1, 0, 1, 0, 1],
       [0, 0, 1, 0, 0, 1],
       [0, 1, 0, 1, 0, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1],
       [0, 1, 1, 0, 0, 0],
       [1, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 1, 1],
       [1, 0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 1],
       [0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 1, 1],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
 