In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import (StandardScaler, MinMaxScaler, Normalizer,
                                   MaxAbsScaler, RobustScaler, Binarizer, QuantileTransformer)

import pandas as pd

In [5]:
# Shell escape listing the working directory (`dir` is the Windows command).
!dir

 Volume in drive C is Windows8_OS
 Volume Serial Number is 1409-1A11

 Directory of C:\Users\rmdelgad\Documents\repos\sklearn-bamboo\bamboo

04/07/2018  09:28 AM    <DIR>          .
04/07/2018  09:28 AM    <DIR>          ..
04/07/2018  08:35 AM    <DIR>          .ipynb_checkpoints
04/07/2018  09:28 AM             4,576 alcohol.csv
04/07/2018  09:05 AM             1,930 preprocessing.py
04/07/2018  09:12 AM             3,172 Untitled.ipynb
04/07/2018  08:19 AM                 0 __init__.py
               4 File(s)          9,678 bytes
               3 Dir(s)  350,573,215,744 bytes free


In [81]:
class NumericRescalerDF(BaseEstimator, TransformerMixin):
    """Apply a scikit-learn rescaler to selected columns of a pandas DataFrame.

    Subclasses set ``_transformer_class`` to the scikit-learn transformer to
    wrap. The wrapped transformer is fitted on the ``.values`` of the selected
    columns, and its output is written back into a copy of the input frame,
    so all other columns pass through unchanged.

    Parameters
    ----------
    subset_columns : sequence of column labels, default ()
        Columns to rescale. When empty, every column of the frame passed to
        ``fit`` is used.
    *args, **kwargs
        Forwarded unchanged to ``_transformer_class`` when it is instantiated.

    Attributes
    ----------
    subset_columns_ : list
        Columns resolved during the most recent ``fit`` call.
    """

    _transformer_class = None

    def __init__(self, subset_columns=(), *args, **kwargs):
        # Stored untouched: ``clone`` checks that constructor arguments come
        # back from ``get_params`` as the very same objects.
        self.subset_columns = subset_columns
        if self._transformer_class is None:
            # Same exception type as calling ``None`` would raise, but with a
            # message that explains what went wrong.
            raise TypeError(
                '%s does not define _transformer_class; instantiate a concrete '
                'subclass instead.' % type(self).__name__)
        self._transformer = self._transformer_class(*args, **kwargs)

    def get_params(self, deep=True):
        """Return ``subset_columns`` plus the wrapped transformer's parameters.

        ``BaseEstimator.get_params`` inspects the ``__init__`` signature and
        rejects ``*args``, so the lookup is delegated to the wrapped
        transformer, which is where the extra arguments ended up anyway.
        """
        params = {'subset_columns': self.subset_columns}
        params.update(self._transformer.get_params(deep=deep))
        return params

    def set_params(self, **params):
        """Set ``subset_columns`` here and forward the rest to the wrapped transformer."""
        if 'subset_columns' in params:
            self.subset_columns = params.pop('subset_columns')
        if params:
            self._transformer.set_params(**params)
        return self

    def _resolved_columns(self):
        """Columns chosen by the last ``fit``, else the constructor argument as a list."""
        fitted = getattr(self, 'subset_columns_', None)
        return list(self.subset_columns) if fitted is None else fitted

    def _is_all_numeric(self, X, columns=None):
        """Return True when every requested column of ``X`` has a numeric dtype.

        ``columns`` defaults to ``self.subset_columns``.
        """
        cols = list(self.subset_columns if columns is None else columns)
        # 'number' is pandas' built-in selector for all numeric dtypes.
        numeric_cols = X[cols].select_dtypes(include='number').columns
        return set(numeric_cols) == set(cols)

    def fit(self, X, y=None):
        """Fit the wrapped transformer on the selected columns of ``X``.

        Raises
        ------
        TypeError
            If any selected column is not numeric.
        """
        # A list (not a tuple) so pandas selects several columns rather than
        # looking up one tuple-valued key.
        columns = list(self.subset_columns)
        if not columns:
            columns = list(X.columns)
        subset_df = X[columns]
        if not self._is_all_numeric(subset_df, columns):
            raise TypeError('The columns to transform must all be numeric.')

        self._transformer.fit(subset_df.values)
        # Kept separate from the constructor argument so a refit on another
        # frame resolves its own columns.
        self.subset_columns_ = columns
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns rescaled."""
        columns = self._resolved_columns()
        X_copy = X.copy()
        X_copy[columns] = self._transformer.transform(X_copy[columns].values)
        return X_copy

    def inverse_transform(self, X):
        """Return a copy of ``X`` with the selected columns mapped back.

        Relies on the wrapped transformer providing ``inverse_transform``.
        """
        columns = self._resolved_columns()
        X_copy = X.copy()
        X_copy[columns] = self._transformer.inverse_transform(X_copy[columns].values)
        return X_copy

In [82]:
class MinMaxScalerDF(NumericRescalerDF):
    """NumericRescalerDF variant whose wrapped transformer is sklearn's MinMaxScaler."""
    _transformer_class = MinMaxScaler

class StandardScalerDF(NumericRescalerDF):
    """NumericRescalerDF variant whose wrapped transformer is sklearn's StandardScaler."""
    _transformer_class = StandardScaler

class NormalizerDF(NumericRescalerDF):
    """NumericRescalerDF variant whose wrapped transformer is sklearn's Normalizer.

    NOTE(review): sklearn's Normalizer does not appear to offer
    ``inverse_transform``, so the inherited method would fail here - confirm.
    """
    _transformer_class = Normalizer

class MaxAbsScalerDF(NumericRescalerDF):
    """NumericRescalerDF variant whose wrapped transformer is sklearn's MaxAbsScaler."""
    _transformer_class = MaxAbsScaler

class RobustScalerDF(NumericRescalerDF):
    """NumericRescalerDF variant whose wrapped transformer is sklearn's RobustScaler."""
    _transformer_class = RobustScaler

class BinarizerDF(NumericRescalerDF):
    """NumericRescalerDF variant whose wrapped transformer is sklearn's Binarizer.

    NOTE(review): sklearn's Binarizer does not appear to offer
    ``inverse_transform``, so the inherited method would fail here - confirm.
    """
    _transformer_class = Binarizer

class QuantileTransformerDF(NumericRescalerDF):
    """NumericRescalerDF variant whose wrapped transformer is sklearn's QuantileTransformer."""
    _transformer_class = QuantileTransformer


In [52]:
# Load the sample dataset from the notebook's working directory.
df = pd.read_csv('alcohol.csv')
# Columns passed as `subset_columns` to the *DF transformers in the cells below.
subset = ['beer_servings', 'spirit_servings', 'wine_servings', 'total_litres_of_pure_alcohol']
# Preview the first rows.
df.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,Afghanistan,0,0,0,0.0
1,Albania,89,132,54,4.9
2,Algeria,25,0,14,0.7
3,Andorra,245,138,312,12.4
4,Angola,217,57,45,5.9


In [53]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 5 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
dtypes: float64(1), int64(3), object(1)
memory usage: 7.6+ KB


In [54]:
# Fit a min-max scaler on the columns listed in `subset` only.
mmx = MinMaxScalerDF(subset_columns=subset)
mmx.fit(df)

In [55]:
# Apply the fitted scaler; transform works on a copy, so `df` itself is not modified.
df_trans = mmx.transform(df)

In [56]:
# Preview the rescaled frame.
df_trans.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,Afghanistan,0.0,0.0,0.0,0.0
1,Albania,0.236702,0.30137,0.145946,0.340278
2,Algeria,0.066489,0.0,0.037838,0.048611
3,Andorra,0.651596,0.315068,0.843243,0.861111
4,Angola,0.577128,0.130137,0.121622,0.409722


In [57]:
# Standardise the same columns; fit returns the estimator itself, so the
# fit and transform steps are chained into one expression.
std = StandardScalerDF(subset_columns=subset)
df_std = std.fit(df).transform(df)

In [83]:
# Smoke test: run fit followed by transform through every DataFrame wrapper.
df_transformers = (
    MinMaxScalerDF,
    StandardScalerDF,
    NormalizerDF,
    MaxAbsScalerDF,
    RobustScalerDF,
    BinarizerDF,
    QuantileTransformerDF,
)

for trans in df_transformers:
    trns = trans(subset_columns=subset)
    # `df_trans` is overwritten on every pass; only the last result remains.
    df_trans = trns.fit(df).transform(df)

In [84]:
# Same smoke test as the previous cell, but through the one-step fit_transform call.
for trans in df_transformers:
    trns = trans(subset_columns=subset)
    # `df_trans` is overwritten on every pass; only the last result remains.
    df_trans = trns.fit_transform(df)