# Importing Libraries

In [209]:
import pandas as pd
import numpy as np

# Fetching The Data

In [210]:
# Load the dataset; "Data.csv" is a relative path, so it is resolved against
# the current working directory of the kernel.
data = pd.read_csv("Data.csv")

In [211]:
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [212]:
# Positional split: every column except the last forms the feature matrix X,
# and the last column forms the target vector Y (both as NumPy arrays).
X, Y = data.iloc[:, :-1].values, data.iloc[:, -1].values

In [213]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [214]:
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# Encoding categorical data

### Encoding the Dependent Variable Using Built-in Libraries

In [114]:
from sklearn.preprocessing import LabelEncoder

In [115]:
# fit() returns the encoder itself, so construction and fitting chain into a
# single statement; the transformed labels are the cell's displayed output.
le = LabelEncoder().fit(Y)
le.transform(Y)

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

### Encoding the Dependent Variable Using My Methods

In [129]:
class MyLabelEncoder():
    """Case-insensitive label encoder that maps string labels to integer codes.

    Labels are lower-cased before comparison, so ``"Yes"`` and ``"yes"`` share
    one code. A label's code is its position in the sorted list of distinct
    lower-cased labels seen by ``fit``.

    Attributes:
        unique: Sorted list of the distinct lower-cased labels learned by
            ``fit``; empty until ``fit`` has been called.
    """

    def __init__(self):
        # Start with an empty vocabulary so that transform() before fit() fails
        # with the same ValueError as any other unseen label, rather than an
        # AttributeError about a missing attribute.
        self.unique = []

    def fit(self, data):
        """Learn the sorted set of distinct lower-cased labels in ``data``.

        Args:
            data: Iterable of string labels.
        """
        self.unique = sorted(set(map(str.lower, data)))

    def transform(self, data):
        """Return the integer code of every label in ``data``.

        Args:
            data: Iterable of string labels.

        Returns:
            list: One integer per input label, giving the position of the
            lower-cased label in ``self.unique``.

        Raises:
            ValueError: If a label is not part of the fitted vocabulary.
        """
        # Build the label -> code lookup once per call instead of scanning
        # the vocabulary list again for every single item.
        codes = {label: index for index, label in enumerate(self.unique)}
        new_data = []
        for item in data:
            key = item.lower()
            if key not in codes:
                raise ValueError("{!r} was not seen during fit".format(item))
            new_data.append(codes[key])
        return new_data

    def fit_transform(self, data):
        """Fit on ``data`` and return its encoded labels in one step."""
        self.fit(data)
        return self.transform(data)

In [130]:
myle = MyLabelEncoder()

In [131]:
myle.fit(Y)

In [132]:
myle.transform(Y)

[0, 1, 0, 0, 1, 1, 0, 1, 0, 1]

In [133]:
myle.fit_transform(Y)

[0, 1, 0, 0, 1, 1, 0, 1, 0, 1]

### Encoding the Independent Variable Using Built-in Libraries

In [135]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [143]:
# One-hot encode column 0 and pass every remaining column through unchanged.
# fit() returns the transformer, so it is chained onto the constructor.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder="passthrough",
).fit(X)
ct.transform(X)

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, nan],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, nan, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### Encoding the Independent Variable Using My Methods

In [268]:
class MyOneHotEncoder():
    """Case-insensitive one-hot encoder for selected columns of a 2-D array.

    Each encoded column is replaced by one 0.0/1.0 indicator per distinct
    lower-cased value learned in ``fit``. The indicator blocks come first, in
    the order the columns were requested, followed by all non-encoded columns
    in their original order.

    Attributes:
        unique_cols: Dict mapping each fitted column index to the sorted list
            of distinct lower-cased values found in that column.
    """

    def __init__(self):
        self.unique_cols = {}

    def fit(self, data, column):
        """Learn the distinct lower-cased values of every requested column.

        Args:
            data: 2-D array supporting ``data[:, col]`` indexing (for example
                a NumPy object array); the selected columns must hold strings.
            column: Iterable of non-negative column indices to encode.
        """
        self.unique_cols = {}
        for col in column:
            # Index by the column number itself. The previous
            # ``column[col]`` lookup only worked for ``column == [0]``.
            self.unique_cols[col] = sorted(set(map(str.lower, data[:, col])))

    def transform(self, data, column):
        """One-hot encode the requested columns of every row in ``data``.

        Args:
            data: Iterable of rows; each row must support integer indexing and
                ``len()``.
            column: Iterable of non-negative column indices to encode; each
                must have been passed to ``fit``.

        Returns:
            list: One list per input row, holding the indicator values for
            each encoded column followed by the untouched remaining values.
        """
        column = list(column)
        encoded = set(column)
        dataset = []
        for row in data:
            row_dataset = []
            for col in column:
                value = row[col].lower()
                row_dataset.extend(
                    1.0 if value == category else 0.0
                    for category in self.unique_cols[col]
                )
            # Pass through only columns that were not encoded, once per row,
            # so encoding several columns neither duplicates rows nor leaks
            # the raw categorical values into the output.
            row_dataset.extend(
                row[index] for index in range(len(row)) if index not in encoded
            )
            dataset.append(row_dataset)
        return dataset

    def fit_transform(self, data, column):
        """Fit on ``data`` and return its encoded rows in one step."""
        self.fit(data, column)
        return self.transform(data, column)

In [269]:
myohe = MyOneHotEncoder()

In [270]:
myohe.fit(X, [0])

In [271]:
# Encode column 0 of X, the same matrix the encoder was fitted on.
myohe.transform(X, [0])

[[1.0, 0.0, 0.0, 44.0, 72000.0],
 [0.0, 0.0, 1.0, 27.0, 48000.0],
 [0.0, 1.0, 0.0, 30.0, 54000.0],
 [0.0, 0.0, 1.0, 38.0, 61000.0],
 [0.0, 1.0, 0.0, 40.0, nan],
 [1.0, 0.0, 0.0, 35.0, 58000.0],
 [0.0, 0.0, 1.0, nan, 52000.0],
 [1.0, 0.0, 0.0, 48.0, 79000.0],
 [0.0, 1.0, 0.0, 50.0, 83000.0],
 [1.0, 0.0, 0.0, 37.0, 67000.0]]