# Data Transformation - Ordinal and One-Hot Encoding

## 0. Introduction

This notebook contains:
  1. Breast cancer dataset
  2. Ordinal Encoding using OrdinalEncoder
  3. One-Hot Encoding using OneHotEncoder

## 1. Breast Cancer dataset

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [10]:
# Location of the breast-cancer CSV used throughout this notebook.
path = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"

# The file has no header row, so header=None makes pandas label the columns
# with integers instead of consuming the first record as column names.
data = pd.read_csv(filepath_or_buffer=path, header=None)

# Preview the first five records.
data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'


In [19]:
# (rows, columns) of the frame as loaded, before any rows are removed.
data.shape

(286, 10)

In [18]:
# Number of missing (NaN) entries in each column.
data.isnull().sum()

0    0
1    0
2    0
3    0
4    8
5    0
6    0
7    1
8    0
9    0
dtype: int64

In [20]:
# Discard every row that has at least one missing value. The result is
# rebound to `data` because the later cells read the cleaned frame by that name.
data = data.dropna()

In [21]:
# (rows, columns) after the rows containing missing values were dropped.
data.shape

(277, 10)

## 2. Ordinal Encoding using OrdinalEncoder

In [22]:
# Inputs are every column except the last; the last column is the class label.
X = data.iloc[:, :-1]
# Encoding the target only maps the class strings to integers, so it is safe
# to do on the full column.
y = LabelEncoder().fit_transform(data.iloc[:, -1])

# Split BEFORE fitting the feature encoder so that the held-out rows never
# influence the preprocessing (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the category -> integer mapping from the training rows only, then reuse
# it for the test rows. A category that appears only in the test rows is mapped
# to -1 instead of raising (requires scikit-learn >= 0.24).
# NOTE(review): without an explicit `categories=` argument the codes follow the
# sorted order of the unique values, which for string ranges is lexicographic -
# confirm this matches the intended ordinal order of each column.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_train = ordinal_encoder.fit_transform(X_train)
X_test = ordinal_encoder.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((221, 9), (56, 9), (221,), (56,))

In [29]:
# Train a logistic-regression classifier with default settings on the
# ordinal-encoded training rows (fit returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows: fraction of predictions that match the true labels.
y_preds = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_preds)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.71


## 3. One-Hot Encoding using OneHotEncoder

In [30]:
# Start again from the cleaned frame: inputs are every column except the last,
# the last column is the class label.
X = data.iloc[:, :-1]
# Encoding the target only maps the class strings to integers, so it is safe
# to do on the full column.
y = LabelEncoder().fit_transform(data.iloc[:, -1])

# Split BEFORE fitting the feature encoder so that the held-out rows never
# influence the preprocessing (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the set of categories from the training rows only. With
# handle_unknown="ignore", a category seen only in the test rows is encoded as
# all zeros for that feature instead of raising. The number of output columns
# therefore reflects the categories present in the training rows.
onehot_encoder = OneHotEncoder(handle_unknown="ignore")
X_train = onehot_encoder.fit_transform(X_train)
X_test = onehot_encoder.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((221, 41), (56, 41), (221,), (56,))

In [33]:
# Train a logistic-regression classifier with default settings on the
# one-hot-encoded training rows (fit returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows: fraction of predictions that match the true labels.
y_preds = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_preds)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.73
