#  Encoding
In machine learning, encoding refers to the process of converting categorical data into a numerical representation that can be easily processed by machine learning models.
- Enable processing of categorical data: Machine learning models typically work with numerical data, so categorical data must be encoded into a numerical representation before it can be processed by the model.
- Improve model performance: Encoding categorical data can improve the performance of machine learning models. For example, one-hot encoding can help prevent the model from assuming a false order or relationship between categories, while ordinal encoding can help preserve the relationship between categories. Choosing the appropriate encoding method can make a significant difference in the accuracy of a machine learning model.

- Overall, encoding categorical data is an important step in machine learning that helps enable processing of categorical data, preserves information, avoids bias, and improves model performance.


## Types of Encoding

  -  Without Using Any Encoding Technique (manual value mapping)
  -  Label Encoding 
  -  One-Hot Encoding
  -  Ordinal Encoding


In [1]:
# import all packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
# NOTE: this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Read the supershops CSV (path relative to the working directory) into `data`
csv_path = 'supershops.csv'
data = pd.read_csv(csv_path)

In [3]:
# show data
data.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Marketing Spend  50 non-null     float64
 1   Administration   50 non-null     float64
 2   Transport        49 non-null     float64
 3   Area             50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [5]:
#check null values
data.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [6]:
# Mean of the Transport column, used as the fill value for its missing entry
missing = data['Transport'].mean()

In [7]:
# Replace the NaN entries of Transport with the previously computed mean
data['Transport'] = data['Transport'].fillna(value=missing)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Marketing Spend  50 non-null     float64
 1   Administration   50 non-null     float64
 2   Transport        50 non-null     float64
 3   Area             50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [9]:
# Check for missing values after filling with the mean
data.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [10]:
# Nine independent copies of the cleaned frame (data2 ... data10) so that each
# encoding experiment starts from the same un-encoded data.
(data2, data3, data4, data5, data6,
 data7, data8, data9, data10) = (data.copy() for _ in range(9))


# LabelEncoder

In [11]:
from sklearn.preprocessing import LabelEncoder
# One encoder instance shared by the label-encoding cells; every
# fit_transform call refits it, so it only remembers the last column fitted.
le = LabelEncoder()

In [12]:
# LabelEncoder works on a 1-D array of labels, so pass the Series
# (data2['Area']) rather than a one-column DataFrame (data2[['Area']]),
# which scikit-learn only accepts after raising a DataConversionWarning.
data2['Area'] = le.fit_transform(data2['Area'])

In [13]:
data2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


# Label encoding every non-numeric column with a loop

In [14]:
# Label-encode every non-numeric column of data3.
# select_dtypes(exclude='number') skips all numeric dtypes (integers as well
# as floats). The previous `dtype == np.number` comparison relied on a
# deprecated NumPy coercion (np.number is treated as float64), so integer
# columns were not recognised as numeric and would have been re-encoded.
for col in data3.select_dtypes(exclude='number').columns:
    # le is refitted on each column, so le.classes_ reflects only the last one
    data3[col] = le.fit_transform(data3[col])

In [15]:
data3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [16]:
data4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


# One hot encoding

In [17]:
data4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [18]:
# One-hot encode Area; drop_first=True omits the column of the first category
area_values = data4['Area']
dummy = pd.get_dummies(area_values, drop_first=True)
dummy.head()

Unnamed: 0,Dhaka,Rangpur
0,1,0
1,0,0
2,0,1
3,1,0
4,0,1


In [19]:
# Append the dummy columns to the right of data4 (column-wise concatenation);
# the original Area column is kept alongside them.
data4 = pd.concat(objs=[data4, dummy], axis='columns')

In [20]:
data4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,Dhaka,192261.83,1,0
1,162597.7,151377.59,443898.53,Ctg,191792.06,0,0
2,153441.51,101145.55,407934.54,Rangpur,191050.39,0,1
3,144372.41,118671.85,383199.62,Dhaka,182901.99,1,0
4,142107.34,91391.77,366168.42,Rangpur,166187.94,0,1


In [21]:
# One-hot encode every categorical column of data5 in a single call.
# With no `columns` argument, get_dummies encodes the object/string/category
# columns, removes the originals, and prefixes each new column with the name
# of its source column. The earlier hand-written loop hard-coded
# prefix='Area' for every column and found the columns with the deprecated
# `dtype == np.number` test, which does not treat integer columns as numeric.
# dtype=int keeps the dummies as 0/1 on pandas versions whose default dummy
# dtype is bool.
data5 = pd.get_dummies(data5, drop_first=True, dtype=int)

In [22]:
data5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1


# Ordinal Encoder

In [23]:
data6.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [24]:
from sklearn.preprocessing import OrdinalEncoder

In [25]:
# Raw Area column as an array (one entry per row, duplicates included)
order = data6['Area'].values

In [26]:
order

array(['Dhaka', 'Ctg', 'Rangpur', 'Dhaka', 'Rangpur', 'Dhaka', 'Ctg',
       'Rangpur', 'Dhaka', 'Ctg', 'Rangpur', 'Ctg', 'Rangpur', 'Ctg',
       'Rangpur', 'Dhaka', 'Ctg', 'Dhaka', 'Rangpur', 'Dhaka', 'Ctg',
       'Dhaka', 'Rangpur', 'Rangpur', 'Dhaka', 'Ctg', 'Rangpur', 'Dhaka',
       'Rangpur', 'Dhaka', 'Rangpur', 'Dhaka', 'Ctg', 'Rangpur', 'Ctg',
       'Dhaka', 'Rangpur', 'Ctg', 'Dhaka', 'Ctg', 'Ctg', 'Rangpur', 'Ctg',
       'Dhaka', 'Ctg', 'Dhaka', 'Rangpur', 'Ctg', 'Dhaka', 'Ctg'],
      dtype=object)

In [27]:
# Distinct Area labels in order of first appearance; this sequence is what the
# OrdinalEncoder is given as its category order.
# To impose a specific ranking, list it explicitly, e.g.
# order = ['Dhaka','Rangpur','Ctg']
order = data6['Area'].unique()

In [28]:
order

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [29]:
# `categories` is a list holding one category sequence (here: for Area only);
# a label's position in `order` becomes its encoded value.
ordinal = OrdinalEncoder(categories=[order])

In [30]:
ordinal

OrdinalEncoder(categories=[array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)])

In [31]:
# Double brackets pass a one-column DataFrame (2-D input);
# the result is a 2-D float array with one column.
encoded = ordinal.fit_transform(data6[['Area']])

In [32]:
encoded

array([[0.],
       [1.],
       [2.],
       [0.],
       [2.],
       [0.],
       [1.],
       [2.],
       [0.],
       [1.],
       [2.],
       [1.],
       [2.],
       [1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [2.],
       [0.],
       [1.],
       [0.],
       [2.],
       [2.],
       [0.],
       [1.],
       [2.],
       [0.],
       [2.],
       [0.],
       [2.],
       [0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.],
       [1.],
       [0.],
       [1.],
       [1.],
       [2.],
       [1.],
       [0.],
       [1.],
       [0.],
       [2.],
       [1.],
       [0.],
       [1.]])

In [33]:
# Overwrite Area with the encoded values (the single column of the 2-D result)
data6['Area'] = encoded[:, 0]

In [34]:
data6.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,0.0,192261.83
1,162597.7,151377.59,443898.53,1.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,0.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94


# Without Using Any Encoding Technique (manual mapping)


In [35]:
data7.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [36]:
# Hand-written label -> code mapping, applied with Series.map.
# map builds the numeric column directly, whereas replace() on an object
# column depends on pandas' deprecated implicit downcasting to end up with
# an integer dtype.
# Any label that is missing from the mapping becomes NaN.
area_codes = {'Dhaka': 1, 'Ctg': 2, 'Rangpur': 3}
data7['Area'] = data7['Area'].map(area_codes)

In [37]:
data7.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,2,191792.06
2,153441.51,101145.55,407934.54,3,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,3,166187.94
