In [1]:
# in this notebook, managing 'categorical' data is discussed.

In [2]:
# Among all preprocessing steps,  our focus here is
     #    Standardization, or mean removal and variance scaling
     #    Non-linear transformation
     #    Normalization
     #  + Encoding categorical features
     #    Discretization
     #  + Imputation of missing values
     #    Generating polynomial features
     #  + Custom transformers

In [3]:
from sklearn.preprocessing import OrdinalEncoder

In [19]:
enc = OrdinalEncoder()

In [20]:
# Toy data: each row is [sex, score] -- one string column and one integer column.
X = [
    ['Male', 1],
    ['Female', 3],
    ['Female', 2],
    ['Male', 2],
    ['Female', 0],
    ['Male', 20],
    ['Male', 1],
    ['Male', 7],
    ['Male', 10],
]
enc.fit(X)

In [21]:
enc.categories_

[array(['Female', 'Male'], dtype=object),
 array([0, 1, 2, 3, 7, 10, 20], dtype=object)]

In [22]:
# The encoder is applied to every column, not only the categorical ones -- be careful!
# (Below, the integer column is re-coded too: e.g. the value 20 becomes 6.)
enc.fit_transform(X)

array([[1., 1.],
       [0., 3.],
       [0., 2.],
       [1., 2.],
       [0., 0.],
       [1., 6.],
       [1., 1.],
       [1., 4.],
       [1., 5.]])

In [26]:
enc.inverse_transform([[1, 2]])

array([['Male', 2]], dtype=object)

In [24]:
# As you can see, it transforms every column. But look at the data above:
# the second column is numeric and holds no categories.

In [181]:
X

[['Male', 1],
 ['Female', 3],
 ['Female', 2],
 ['Male', 2],
 ['Female', 0],
 ['Male', 20],
 ['Male', 1],
 ['Male', 7],
 ['Male', 10]]

In [33]:
# To get back only the categorical data: categorical (string) columns are
# reported with dtype 'object' when you run dataframe.info(),
# so just select all columns of dtype 'object'.
import pandas as pd
categorical_cols = pd.DataFrame(X).select_dtypes(include=["object"])


In [35]:
categorical_cols

Unnamed: 0,0
0,Male
1,Female
2,Female
3,Male
4,Female
5,Male
6,Male
7,Male
8,Male


In [38]:
# Now, let's specify which column is categorical and which is not.

In [39]:

pd.DataFrame(X, columns=['sex', 'score'])

Unnamed: 0,sex,score
0,Male,1
1,Female,3
2,Female,2
3,Male,2
4,Female,0
5,Male,20
6,Male,1
7,Male,7
8,Male,10


In [55]:
# Named-column version of the toy data, plus the names of its 'object' (string) columns.
tips = pd.DataFrame(data=X, columns=['sex', 'score'])
categorical_columns = tips.select_dtypes(include='object').columns.tolist()
categorical_columns

['sex']

In [56]:
# Every column that is not of dtype 'object' is treated as numeric.
numeric_columns = tips.select_dtypes(exclude='object').columns.tolist()
numeric_columns

['score']

In [112]:
# So, it means that each column may need a different transformer and imputer.
# We can do this with ColumnTransformer, which applies different transformations
# to different columns.
# The input is a list of (<name>, <transformer>, <column-names>) tuples
# specifying the transformer objects to be applied to subsets of the data.

In [113]:
from sklearn.compose import ColumnTransformer

In [114]:
from sklearn.preprocessing import  OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

In [115]:
# One-hot encode the categorical columns and standardize the numeric ones.
trans_all = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(), categorical_columns),
        ('numeric', StandardScaler(), numeric_columns),
    ],
)
trans_all

In [64]:
trans_all.fit_transform(tips)

array([[ 0.        ,  1.        , -0.67597529],
       [ 1.        ,  0.        , -0.34712245],
       [ 1.        ,  0.        , -0.51154887],
       [ 0.        ,  1.        , -0.51154887],
       [ 1.        ,  0.        , -0.84040171],
       [ 0.        ,  1.        ,  2.44812672],
       [ 0.        ,  1.        , -0.67597529],
       [ 0.        ,  1.        ,  0.31058324],
       [ 0.        ,  1.        ,  0.8038625 ]])

In [65]:
tips

Unnamed: 0,sex,score
0,Male,1
1,Female,3
2,Female,2
3,Male,2
4,Female,0
5,Male,20
6,Male,1
7,Male,7
8,Male,10


In [71]:
# Chaining more than one transformer on the numeric columns:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer

In [72]:


# Same layout as before, but the numeric columns now go through a two-step
# Pipeline (scale, then quantile-transform to a normal distribution).
trans_all = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(), categorical_columns),
        (
            'numeric',
            Pipeline(steps=[
                ('scale', StandardScaler()),
                ('normalizing', QuantileTransformer(output_distribution='normal')),
            ]),
            numeric_columns,
        ),
    ],
)
trans_all

In [73]:
trans_all.fit_transform (tips)



array([[ 0.        ,  1.        , -0.88714656],
       [ 1.        ,  0.        ,  0.31863936],
       [ 1.        ,  0.        , -0.15731068],
       [ 0.        ,  1.        , -0.15731068],
       [ 1.        ,  0.        , -5.19933758],
       [ 0.        ,  1.        ,  5.19933758],
       [ 0.        ,  1.        , -0.88714656],
       [ 0.        ,  1.        ,  0.67448975],
       [ 0.        ,  1.        ,  1.15034938]])

In [74]:
# Let's build a fancier transformer:

In [75]:
# SimpleImputer has to be in scope before the pipeline is built; importing it
# in this cell keeps the notebook runnable from top to bottom (the cell used
# to fail with "NameError: name 'SimpleImputer' is not defined").
from sklearn.impute import SimpleImputer

# Numeric pipeline: impute missing values (SimpleImputer with its default
# settings), standardize, then quantile-transform to a normal distribution.
numeric_transformer = Pipeline([
    ('handlingmissingvalues', SimpleImputer()),
    ('scale', StandardScaler()),
    ('normalizing', QuantileTransformer(output_distribution='normal')),
])

In [259]:
# Categorical pipeline: fill missing entries with the most frequent value,
# then one-hot encode.
categorical_tranformer = Pipeline(steps=[
    ('handlingmissingvalues', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder()),
])

In [260]:
# A fancier transformer: each column group gets its own
# impute-then-transform pipeline.
trans_all = ColumnTransformer(
    transformers=[
        ('categorical', categorical_tranformer, categorical_columns),
        ('numeric', numeric_transformer, numeric_columns),
    ],
)
trans_all

In [261]:
trans_all.fit_transform(tips)



array([[ 0.        ,  1.        , -0.88714656],
       [ 1.        ,  0.        ,  0.31863936],
       [ 1.        ,  0.        , -0.15731068],
       [ 0.        ,  1.        , -0.15731068],
       [ 1.        ,  0.        , -5.19933758],
       [ 0.        ,  1.        ,  5.19933758],
       [ 0.        ,  1.        , -0.88714656],
       [ 0.        ,  1.        ,  0.67448975],
       [ 0.        ,  1.        ,  1.15034938]])

In [77]:
# checking other encoders:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

In [80]:
def cat_encoder(algorithm=None):
    """Preprocess ``tips`` using the given encoder for the categorical columns.

    Parameters
    ----------
    algorithm : transformer, optional
        Encoder placed as the last step of the categorical pipeline
        (e.g. ``OrdinalEncoder()`` or ``OneHotEncoder()``). When omitted, a
        fresh ``OrdinalEncoder()`` is created for this call, so no estimator
        instance is shared between calls through the default argument.

    Returns
    -------
    The result of ``ColumnTransformer.fit_transform(tips)``; the categorical
    block comes first, followed by the numeric block.

    Notes
    -----
    Reads the notebook-level names ``tips``, ``categorical_columns`` and
    ``numeric_columns`` at call time, so the result depends on whatever those
    names are bound to when the function is called.
    """
    if algorithm is None:
        algorithm = OrdinalEncoder()

    # Numeric columns: impute, standardize, then map to a normal distribution.
    numeric_transformer = Pipeline([
        ('handlingmissingvalues', SimpleImputer()),
        ('scale', StandardScaler()),
        ('normalizing', QuantileTransformer(output_distribution='normal')),
    ])

    # Categorical columns: fill gaps with the most frequent value, then encode
    # with the caller-supplied encoder.
    categorical_tranformer = Pipeline([
        ('handlingmissingvalues', SimpleImputer(strategy='most_frequent')),
        ('encoding', algorithm),
    ])

    trans_all = ColumnTransformer([
        ('categorical', categorical_tranformer, categorical_columns),
        ('numeric', numeric_transformer, numeric_columns),
    ])

    return trans_all.fit_transform(tips)

In [81]:
cat_encoder(OrdinalEncoder())



array([[ 1.        , -0.88714656],
       [ 0.        ,  0.31863936],
       [ 0.        , -0.15731068],
       [ 1.        , -0.15731068],
       [ 0.        , -5.19933758],
       [ 1.        ,  5.19933758],
       [ 1.        , -0.88714656],
       [ 1.        ,  0.67448975],
       [ 1.        ,  1.15034938]])

In [82]:
cat_encoder(OneHotEncoder())



array([[ 0.        ,  1.        , -0.88714656],
       [ 1.        ,  0.        ,  0.31863936],
       [ 1.        ,  0.        , -0.15731068],
       [ 0.        ,  1.        , -0.15731068],
       [ 1.        ,  0.        , -5.19933758],
       [ 0.        ,  1.        ,  5.19933758],
       [ 0.        ,  1.        , -0.88714656],
       [ 0.        ,  1.        ,  0.67448975],
       [ 0.        ,  1.        ,  1.15034938]])

In [84]:

# The difference lies in the idea behind them.
# OrdinalEncoder is for converting features, while LabelEncoder is for converting the target variable.
# That's why OrdinalEncoder fits data of shape (n_samples, n_features), while
# LabelEncoder only fits data of shape (n_samples,)
# (in the past, LabelEncoder was often used inside a loop to do what is now the job of OrdinalEncoder).

# So the following call, based on the code above, is wrong:
# cat_encoder(LabelEncoder())

In [85]:
# LabelEncoder is fitted here on a flat (1-D) list of string labels.
le = LabelEncoder()
le.fit(["d","s","d", "ss", "s"])

In [86]:
list(le.classes_)

['d', 's', 'ss']

In [87]:
le.fit_transform(["d","s","d", "ss", "s"])

array([0, 1, 0, 2, 1])

In [88]:
le.fit_transform(tips["sex"])

array([1, 0, 0, 1, 0, 1, 1, 1, 1])

# Imputer

In [76]:
from sklearn.impute import SimpleImputer

In [89]:
# Replace missing values using a descriptive statistic (e.g. mean, 
# median, or most frequent) along each column, or using a constant value.

In [91]:
import numpy as np
# Mean imputer: entries equal to np.nan are treated as missing and are filled
# using the 'mean' strategy.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [92]:
# Source data for a small example frame; the last score is missing (NaN).
dic = dict(
    sex=["male", "female", "female", "male", "male", "female"],
    score=[1, 1, 10, 4, 2, np.nan],
)

In [93]:
df = pd.DataFrame(dic)
df

Unnamed: 0,sex,score
0,male,1.0
1,female,1.0
2,female,10.0
3,male,4.0
4,male,2.0
5,female,


In [99]:
# see the result of imputation:
imp_mean.fit_transform(df[["score"]])

array([[ 1. ],
       [ 1. ],
       [10. ],
       [ 4. ],
       [ 2. ],
       [ 3.6]])

# Ex.

In [100]:
# Penguins problem: transforming the data

In [103]:
# NOTE: this rebinds `tips` (until now the small sex/score frame built from X)
# to the penguins data loaded from a CSV path relative to the working directory.
tips = pd.read_csv('probelm_solved/penguins.csv')

In [104]:
tips.head()

Unnamed: 0.1,Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [105]:
# Drop the leftover index column that came with the CSV.
# errors="ignore" keeps this cell re-runnable: on a second run the column is
# already gone, and a plain drop would raise KeyError.
tips = tips.drop(columns="Unnamed: 0", errors="ignore")
tips.head(5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [106]:
# Target: the 'species' column; features: every remaining column.
labels = tips["species"]
features = tips.drop(columns="species")

In [107]:
features.shape, labels.shape

((344, 7), (344,))

In [108]:
# Names of the 'object' (string) feature columns.
categorical_columns = features.select_dtypes(include='object').columns.tolist()
categorical_columns

['island', 'sex']

In [109]:
# Names of all remaining (non-'object') feature columns.
numeric_columns = features.select_dtypes(exclude='object').columns.tolist()
numeric_columns

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'year']

In [110]:
# Numeric columns: mean-impute NaNs, standardize, then quantile-transform to a
# normal distribution.
numeric_transformer = Pipeline([
    ('handlingmissingvalues', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scale', StandardScaler()),
    # Cap n_quantiles at the number of rows: the library default is larger
    # than this dataset (344 rows), which makes QuantileTransformer emit a
    # warning and clip the value on every fit.
    ('normalizing', QuantileTransformer(n_quantiles=min(1000, len(features)),
                                        output_distribution='normal')),
])

# Categorical columns: fill gaps with the most frequent value, then encode
# each category as an integer code.
categorical_tranformer = Pipeline([
    ('handlingmissingvalues', SimpleImputer(strategy='most_frequent')),
    ('encoding', OrdinalEncoder()),
])

# Route each column group to its pipeline; the categorical output columns
# come first, followed by the numeric ones.
trans_all = ColumnTransformer([
    ('categorical', categorical_tranformer, categorical_columns),
    ('numeric', numeric_transformer, numeric_columns),
])

In [332]:
# Fit the ColumnTransformer on the penguin features and wrap the result in a
# DataFrame for display (columns 0-1: encoded categoricals, 2-6: numerics).
penguine_featu_transformed = trans_all.fit_transform(features)
pd.DataFrame(penguine_featu_transformed)



Unnamed: 0,0,1,2,3,4,5,6
0,2.0,1.0,-0.709307,0.695279,-1.678270,-0.370016,-5.199338
1,2.0,0.0,-0.653988,0.040205,-1.134560,-0.273776,-5.199338
2,2.0,0.0,-0.511031,0.292794,-0.180012,-1.356541,-5.199338
3,2.0,1.0,-0.029236,-0.080474,0.168882,0.161474,-5.199338
4,2.0,0.0,-1.245827,1.113937,-0.338888,-0.895982,-5.199338
...,...,...,...,...,...,...,...
339,1.0,1.0,2.376185,1.413753,0.331158,-0.032892,5.199338
340,1.0,0.0,-0.062158,0.346638,0.236028,-1.011132,5.199338
341,1.0,1.0,0.901454,0.401507,-0.338888,-0.335021,5.199338
342,1.0,1.0,1.294956,0.934872,0.490321,0.051178,5.199338


# CustomTransformer

In [116]:
from sklearn.preprocessing import FunctionTransformer

In [117]:
custom_transformer = FunctionTransformer(np.log)

In [119]:
# The wrapped function (np.log) is applied element-wise to the input list.
x = [1,2,3,3,4]
custom_transformer.fit_transform(x)

array([0.        , 0.69314718, 1.09861229, 1.09861229, 1.38629436])

# Ex.

In [146]:
# in the following, a padding function is used for customizing our
# transformer

In [147]:
def add_pad(mat, pad_x=1, pad_y=1):
    """Surround a 2-D array with a border of zeros.

    ``pad_x`` rows of zeros are added above and below ``mat`` and ``pad_y``
    columns of zeros on its left and right. Both must be non-negative
    (checked with an ``assert``). The result is a new array created with
    ``np.zeros`` (so it has NumPy's default float dtype); ``mat`` itself is
    not modified.
    """
    assert pad_x >= 0 and pad_y >= 0
    n_rows, n_cols = mat.shape[0], mat.shape[1]
    padded = np.zeros((n_rows + 2 * pad_x, n_cols + 2 * pad_y))
    # Copy the original values into the centre of the zero canvas.
    row_span = slice(pad_x, pad_x + n_rows)
    col_span = slice(pad_y, pad_y + n_cols)
    padded[row_span, col_span] = mat
    return padded

In [148]:
# Random 3x4 integer matrix drawn with low=1, high=3; no seed is set, so the
# values (and the output shown below) change on every run.
a = np.random.randint(low=1,high=3, size=(3,4))
add_pad(a, 1, 1)

array([[0., 0., 0., 0., 0., 0.],
       [0., 2., 2., 2., 2., 0.],
       [0., 1., 2., 1., 1., 0.],
       [0., 2., 1., 2., 1., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [143]:
# Wrap add_pad so it can be used like any other transformer; no extra
# arguments are passed, so add_pad runs with its defaults (pad_x=1, pad_y=1).
custom_transformer = FunctionTransformer(add_pad)
custom_transformer

In [145]:
a = np.random.randint(low=1,high=3, size=(3,4))
custom_transformer.fit_transform(a)

array([[0., 0., 0., 0., 0., 0.],
       [0., 1., 2., 2., 1., 0.],
       [0., 2., 1., 2., 1., 0.],
       [0., 1., 2., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [149]:
# It can be used for pipelining images when they need padding,
# or for any other function that must be applied to them before, within, or
# after pipelines...

In [150]:
# Just pass that function to 'FunctionTransformer'...