In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer



In [2]:
# Load the raw dataset from a path relative to the notebook's working directory.
df = pd.read_csv("data/data.csv")
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,Id,Colour,Country
0,1,Red,USA
1,2,Blue,UK
2,3,Green,Canada
3,4,Blue,USA
4,5,Blue,USA


In [3]:
# (rows, columns) of the raw frame.
df.shape

(300, 3)

In [4]:
# LabelEncoder maps every distinct value to an integer code. It is meant for
# target labels, not for input features; it is applied to "Colour" here only
# to show what the resulting codes look like.
labelencoder = LabelEncoder().fit(df["Colour"])
labelencoder.transform(df["Colour"])

array([2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0,
       1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0,
       0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2,
       1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1,
       2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2,
       2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0,
       1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0,
       0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2,
       1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1,
       2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2,
       2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0,
       1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0,
       0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 0, 2,
       1, 1, 2, 2, 2, 0, 1, 0, 0, 2, 1, 1, 2, 2])

In [5]:
# Inspect column dtypes; the object-typed columns hold the string categories to encode.
df.dtypes

Id          int64
Colour     object
Country    object
dtype: object

In [6]:
# Distinct values present in the Country column.
df["Country"].unique()

array(['USA', 'UK', 'Canada'], dtype=object)

In [7]:
# One-hot encode both categorical columns in a single pass.
# Tip: OneHotEncoder(handle_unknown="ignore") avoids an error when new data
# contains a category that was not seen while fitting.
onehotencoder = OneHotEncoder()
encoded_sparse = onehotencoder.fit_transform(df[["Colour", "Country"]])
# The encoder output is converted to a dense array so it can be displayed
# and wrapped in a DataFrame later.
feature_array = encoded_sparse.toarray()
feature_array

array([[0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0., 0.],
       [0., 0., 1., 1., 0., 0.]])

In [8]:
# categories_ holds one array of category values per encoded column,
# in the same order the columns were passed to the encoder.
feature_lables = onehotencoder.categories_
feature_lables

[array(['Blue', 'Green', 'Red'], dtype=object),
 array(['Canada', 'UK', 'USA'], dtype=object)]

In [9]:
# Flatten the per-column category arrays into one 1-D array of labels, one per
# one-hot output column.
# np.concatenate is used instead of np.array(...).ravel() because the latter
# only flattens when every encoded column has the same number of categories;
# with unequal counts NumPy cannot build a rectangular array from the list.
# Reading straight from the encoder also keeps this cell safe to re-run.
feature_lables = np.concatenate(onehotencoder.categories_)
feature_lables

array(['Blue', 'Green', 'Red', 'Canada', 'UK', 'USA'], dtype=object)

In [10]:
# Wrap the dense one-hot matrix in a DataFrame, using the category values
# as column names.
features = pd.DataFrame(data=feature_array, columns=feature_lables)
features

Unnamed: 0,Blue,Green,Red,Canada,UK,USA
0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...
295,0.0,0.0,1.0,1.0,0.0,0.0
296,0.0,1.0,0.0,0.0,1.0,0.0
297,0.0,1.0,0.0,0.0,1.0,0.0
298,0.0,0.0,1.0,1.0,0.0,0.0


In [11]:
# Cross-check: distinct values of each categorical column plus the frame shape.
(df["Colour"].unique(), df["Country"].unique(), df.shape)

(array(['Red', 'Blue', 'Green'], dtype=object),
 array(['USA', 'UK', 'Canada'], dtype=object),
 (300, 3))

In [12]:
# Sanity check: expect one row per input row and one column per category.
features.shape

(300, 6)

In [13]:
# Place the one-hot columns next to the original columns (aligned on the index).
# The result is only displayed, not assigned, so df itself is left untouched.
pd.concat(objs=[df, features], axis="columns")

Unnamed: 0,Id,Colour,Country,Blue,Green,Red,Canada,UK,USA
0,1,Red,USA,0.0,0.0,1.0,0.0,0.0,1.0
1,2,Blue,UK,1.0,0.0,0.0,0.0,1.0,0.0
2,3,Green,Canada,0.0,1.0,0.0,1.0,0.0,0.0
3,4,Blue,USA,1.0,0.0,0.0,0.0,0.0,1.0
4,5,Blue,USA,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
295,296,Red,Canada,0.0,0.0,1.0,1.0,0.0,0.0
296,297,Green,UK,0.0,1.0,0.0,0.0,1.0,0.0
297,298,Green,UK,0.0,1.0,0.0,0.0,1.0,0.0
298,299,Red,Canada,0.0,0.0,1.0,1.0,0.0,0.0


In [14]:
# Show the original frame again: the concat above was not assigned back to df.
df

Unnamed: 0,Id,Colour,Country
0,1,Red,USA
1,2,Blue,UK
2,3,Green,Canada
3,4,Blue,USA
4,5,Blue,USA
...,...,...,...
295,296,Red,Canada
296,297,Green,UK
297,298,Green,UK
298,299,Red,Canada


In [15]:
# Confirm df still has its original shape.
df.shape

(300, 3)

In [16]:
# Fresh, unfitted encoder to plug into the column transformers below.
ohe = OneHotEncoder()

In [17]:
# Encode only "Colour". remainder="passthrough" keeps the other columns
# unchanged and places them after the encoded ones (remainder="drop" would
# discard them instead).
# Resulting column order: one-hot(Colour), Id, Country.
ct = make_column_transformer(
    (ohe, ["Colour"]),
    remainder="passthrough",
)
ct.fit_transform(df)

array([[0.0, 0.0, 1.0, 1, 'USA'],
       [1.0, 0.0, 0.0, 2, 'UK'],
       [0.0, 1.0, 0.0, 3, 'Canada'],
       ...,
       [0.0, 1.0, 0.0, 298, 'UK'],
       [0.0, 0.0, 1.0, 299, 'Canada'],
       [0.0, 0.0, 1.0, 300, 'Canada']], dtype=object)

In [18]:
from sklearn.compose import ColumnTransformer

# Same transformation as the make_column_transformer cell, spelled out with
# explicit step names and positional column indices.
ct = ColumnTransformer(
    transformers=[
        ("colour_transformed", ohe, [1]),  # column 1 is Colour
        ("name", "passthrough", [0, 2]),  # columns 0 and 2 are Id and Country
    ]
)
ct.fit_transform(df)

array([[0.0, 0.0, 1.0, 1, 'USA'],
       [1.0, 0.0, 0.0, 2, 'UK'],
       [0.0, 1.0, 0.0, 3, 'Canada'],
       ...,
       [0.0, 1.0, 0.0, 298, 'UK'],
       [0.0, 0.0, 1.0, 299, 'Canada'],
       [0.0, 0.0, 1.0, 300, 'Canada']], dtype=object)

In [19]:
# fit_transform returned a new array; check that df itself keeps its shape.
df.shape

(300, 3)

In [20]:
# NOTE: this rebinds `features` (previously the one-hot DataFrame built earlier
# in the notebook) to the array returned by the ColumnTransformer.
features = ct.fit_transform(df)
# One-hot Colour columns plus the passed-through Id and Country columns.
features.shape

(300, 5)