# Sales Encoding

In [17]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

Select the Imput File

In [18]:
input_file = "../data/sales.csv"

Create a DataFrame using an ifered schema 

In [19]:
df = pd.read_csv(input_file)

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   division            1000 non-null   object
 1   level of education  1000 non-null   object
 2   training level      1000 non-null   int64 
 3   work experience     1000 non-null   int64 
 4   salary              1000 non-null   int64 
 5   sales               1000 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 47.0+ KB
None


## Plot the data

In [20]:
df

Unnamed: 0,division,level of education,training level,work experience,salary,sales
0,computer software,associate's degree,2,13,139453,593170
1,peripherals,high school,0,10,99080,390508
2,office supplies,associate's degree,0,12,104289,419848
3,computer hardware,associate's degree,1,3,76513,196296
4,peripherals,high school,2,5,89442,433124
...,...,...,...,...,...,...
995,peripherals,some college,0,5,75142,219358
996,office supplies,some college,1,9,95290,360403
997,computer hardware,some college,1,7,93239,355771
998,computer hardware,some college,3,6,104913,497290


## Encode division (Label)

In [21]:
div_encoder = LabelEncoder()
df["division_le"] = div_encoder.fit_transform(df["division"])
df

Unnamed: 0,division,level of education,training level,work experience,salary,sales,division_le
0,computer software,associate's degree,2,13,139453,593170,1
1,peripherals,high school,0,10,99080,390508,3
2,office supplies,associate's degree,0,12,104289,419848,2
3,computer hardware,associate's degree,1,3,76513,196296,0
4,peripherals,high school,2,5,89442,433124,3
...,...,...,...,...,...,...,...
995,peripherals,some college,0,5,75142,219358,3
996,office supplies,some college,1,9,95290,360403,2
997,computer hardware,some college,1,7,93239,355771,0
998,computer hardware,some college,3,6,104913,497290,0


## Encode division (OneHot)

In [22]:
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto',sparse_output=False), [0])],   # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                                         # Leave the rest of the columns untouched
).set_output(transform="pandas")
ct.fit_transform(df)


Unnamed: 0,one_hot_encoder__division_computer hardware,one_hot_encoder__division_computer software,one_hot_encoder__division_office supplies,one_hot_encoder__division_peripherals,one_hot_encoder__division_printers,remainder__level of education,remainder__training level,remainder__work experience,remainder__salary,remainder__sales,remainder__division_le
0,0.0,1.0,0.0,0.0,0.0,associate's degree,2,13,139453,593170,1
1,0.0,0.0,0.0,1.0,0.0,high school,0,10,99080,390508,3
2,0.0,0.0,1.0,0.0,0.0,associate's degree,0,12,104289,419848,2
3,1.0,0.0,0.0,0.0,0.0,associate's degree,1,3,76513,196296,0
4,0.0,0.0,0.0,1.0,0.0,high school,2,5,89442,433124,3
...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,0.0,some college,0,5,75142,219358,3
996,0.0,0.0,1.0,0.0,0.0,some college,1,9,95290,360403,2
997,1.0,0.0,0.0,0.0,0.0,some college,1,7,93239,355771,0
998,1.0,0.0,0.0,0.0,0.0,some college,3,6,104913,497290,0


In [23]:
one_hot = OneHotEncoder(categories=[df["division"].unique().tolist()],sparse_output=False).set_output(transform='pandas')
df.join(one_hot.fit_transform(pd.DataFrame(df["division"])))
df

Unnamed: 0,division,level of education,training level,work experience,salary,sales,division_le
0,computer software,associate's degree,2,13,139453,593170,1
1,peripherals,high school,0,10,99080,390508,3
2,office supplies,associate's degree,0,12,104289,419848,2
3,computer hardware,associate's degree,1,3,76513,196296,0
4,peripherals,high school,2,5,89442,433124,3
...,...,...,...,...,...,...,...
995,peripherals,some college,0,5,75142,219358,3
996,office supplies,some college,1,9,95290,360403,2
997,computer hardware,some college,1,7,93239,355771,0
998,computer hardware,some college,3,6,104913,497290,0


## Ordinal Encoding of level of education

In [24]:
edu_encoder = OrdinalEncoder(categories=[["high school","some college", "associate's degree", "bachelor's degree", "master's degree"]]).set_output(transform="pandas")
df["edu_enc"] = edu_encoder.fit_transform(pd.DataFrame(df["level of education"]))    
df

Unnamed: 0,division,level of education,training level,work experience,salary,sales,division_le,edu_enc
0,computer software,associate's degree,2,13,139453,593170,1,2.0
1,peripherals,high school,0,10,99080,390508,3,0.0
2,office supplies,associate's degree,0,12,104289,419848,2,2.0
3,computer hardware,associate's degree,1,3,76513,196296,0,2.0
4,peripherals,high school,2,5,89442,433124,3,0.0
...,...,...,...,...,...,...,...,...
995,peripherals,some college,0,5,75142,219358,3,1.0
996,office supplies,some college,1,9,95290,360403,2,1.0
997,computer hardware,some college,1,7,93239,355771,0,1.0
998,computer hardware,some college,3,6,104913,497290,0,1.0
