In [None]:
"""
pipeline的方式来转，这个更加规整，一般应该有这种方式。
定义一系列的transformer，然后将数据集一次性传进去，最后直接获得一个dataframe，然后转为ndarray作为模型输入
对于dl的模型，最后的分类器可能不放到pipeline里。
对于sklearn的模型，可以放到里面一起去。
"""
import pandas as pd
import numpy as np


# Build a small demo DataFrame used to exercise the pipeline approach.
df2 = pd.DataFrame(
    {
        "city": ["tokyo", "beijing", "london", "seattle", "san fransciso", "newyork"],
        "boolean": ["yes", "no", "no", "no", "no", "yes"],
        "ordinal": ["somewhat like", "like", "somewhat like", "like", "somewhat like", "dislike"],
        "quantitative": [1, 11, -.5, 10, 8.3, 20],
        "gdp": [1122, 139, 1500, 1234, 555, 99999],
        "day": [1, 2, 3, 4, 5, 6],
    }
)
# Keep an untouched copy: df2 is transformed step by step in the following
# cells, while the copy is what gets fed to the Pipeline later on.
df2_copy = df2.copy()
# A bare expression is only rendered when it is the last statement of the
# cell, so the display has to come after the assignment above.
df2

In [None]:
# Nominal (categorical) features: one-hot encode them, naming each new
# column "<source column>__<value>".
df2 = pd.get_dummies(
    data=df2,
    prefix_sep="__",
    columns=["city", "boolean"],
)
df2

In [None]:
# Ordinal feature: the levels have a natural order, so each label is replaced
# by its position in `ordering` (worst -> best).
# Open question from the author: in practice one-hot encoding might work just
# as well here, especially for deep-learning models.
ordering = ["dislike", "somewhat like", "like"]
# list.index is applied to every label; an unknown label raises ValueError.
df2["ordinal"] = df2["ordinal"].map(ordering.index)
df2

In [None]:
# Binning: bucket the continuous column into 3 bins and keep only the integer
# bin codes (labels=False) instead of interval labels.
df2["quantitative"] = pd.cut(
    x=df2["quantitative"],
    labels=False,
    bins=3,
)
df2

In [None]:
# Display the copy taken before any of the manual transformations above;
# it is the input later handed to the pipeline.
df2_copy

In [None]:
# pipeline方式来转换，更好的组织 
# 实际生产中应该采取这种方式
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

class MyDummy(TransformerMixin):
    """One-hot encode selected columns with ``pd.get_dummies``.

    Parameters
    ----------
    cols : list of str, optional
        Columns to encode, forwarded as ``columns=`` to ``pd.get_dummies``.
        When None, pandas chooses the columns to encode by dtype.
    prefix_sep : str, default "_"
        Separator placed between the source column name and the category
        value in the generated column names. The default matches pandas'
        own default; pass ``"__"`` to get names such as ``city__tokyo``.

    Notes
    -----
    The transformer is stateless: the set of output columns depends only on
    the categories present in the frame passed to ``transform``.
    """

    def __init__(self, cols=None, prefix_sep="_"):
        self.cols = cols
        self.prefix_sep = prefix_sep

    def fit(self, *_):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the configured columns one-hot encoded."""
        # get_dummies already returns a new frame and leaves X untouched,
        # so no defensive copy is required.
        return pd.get_dummies(X, columns=self.cols, prefix_sep=self.prefix_sep)
    
class MyOrdinal(TransformerMixin):
    """Replace the labels of an ordinal column by their rank in ``ordering``.

    Parameters
    ----------
    ordering : list, optional
        Labels sorted from lowest to highest rank. Required by ``transform``.
    col : str, default "ordinal"
        Name of the column to encode.
    """

    def __init__(self, ordering=None, col="ordinal"):
        self.ordering = ordering
        self.col = col

    def fit(self, *_):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ordinal column holds integer ranks.

        Raises
        ------
        ValueError
            If no ``ordering`` was configured, or if the column contains a
            label that is not part of ``ordering``.
        """
        if self.ordering is None:
            raise ValueError("MyOrdinal requires an `ordering` list of labels")
        temp = X.copy()
        # list.index gives the rank of each label and raises ValueError for
        # labels missing from the ordering instead of silently producing NaN.
        temp[self.col] = X[self.col].map(self.ordering.index)
        return temp
    
class MyCut(TransformerMixin):
    """Bin a numeric column with ``pd.cut`` and keep the integer bin codes.

    Parameters
    ----------
    col : str, default "quantitative"
        Name of the column to bin.
    bins : int or sequence, default 3
        Forwarded to ``pd.cut`` as ``bins``.

    Notes
    -----
    Bin edges are derived from whatever frame is passed to ``transform``;
    nothing is learned in ``fit``.
    """

    def __init__(self, col="quantitative", bins=3):
        self.col = col
        self.bins = bins

    def fit(self, *_):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the column replaced by its bin index."""
        temp = X.copy()
        # labels=False yields integer bin codes rather than Interval labels.
        temp[self.col] = pd.cut(X[self.col], bins=self.bins, labels=False)
        return temp

class MyStandard(TransformerMixin):
    """Standardize one column with ``StandardScaler``.

    The scaler returns a plain ndarray, so the scaled values are written back
    into a copy of the input DataFrame by position.

    Parameters
    ----------
    col : str, default "gdp"
        Name of the column to standardize.
    """

    def __init__(self, col="gdp"):
        self.col = col
        # Fitted StandardScaler; stays None until fit() has seen data.
        self._scaler = None

    def fit(self, X=None, *_):
        """Learn the scaling statistics of the column from ``X``.

        Calling ``fit()`` without data is still accepted and learns nothing.
        """
        if X is not None:
            self._scaler = StandardScaler().fit(X[[self.col]])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the column standardized.

        The statistics learned in ``fit`` are used, so unseen data is scaled
        consistently with the training data. If ``fit`` never saw any data,
        the scaler is fitted on ``X`` itself, which keeps the previous
        "transform without fit" usage working.
        """
        scaler = self._scaler
        if scaler is None:
            scaler = StandardScaler().fit(X[[self.col]])
        temp = X.copy()
        # Assign the raw values positionally: wrapping them in a new
        # Series/DataFrame would align on a fresh RangeIndex and produce
        # NaN or shifted rows whenever X has a non-default index.
        temp[self.col] = np.asarray(scaler.transform(X[[self.col]])).ravel()
        return temp
    
class MyDayTransfromer(TransformerMixin):
    """Collapse a day number into two labels: "first" or "second".

    Days 1, 2 and 3 become "first"; every other value becomes "second".
    (The misspelled class name is kept so existing references keep working.)

    Parameters
    ----------
    col : str, default "day"
        Name of the column to relabel.
    """

    def __init__(self, col="day"):
        self.col = col

    def fit(self, *_):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the day column relabelled."""
        # Work on a copy so the caller's frame is left untouched, like the
        # other transformers in this notebook.
        temp = X.copy()
        temp[self.col] = X[self.col].apply(self.reset_day)
        return temp

    def reset_day(self, day):
        """Map a single day value to "first" (1, 2 or 3) or "second"."""
        return "first" if day in (1, 2, 3) else "second"
    

# Instantiate every step once ...
my_dummy = MyDummy(cols=["city", "boolean"])
my_ordinal = MyOrdinal(ordering=ordering)
my_cut = MyCut()
my_standard = MyStandard()
my_day_transformer = MyDayTransfromer()

# ... then chain them; the steps run in the order listed here.
pipe = Pipeline(
    steps=[
        ("dummify", my_dummy),
        ("ordinal", my_ordinal),
        ("cut", my_cut),
        ("standard", my_standard),
        ("day_transformer", my_day_transformer),
    ]
)
# Show the frame that is about to be fed into the pipeline.
df2_copy

In [None]:
# Run all pipeline steps on the untouched copy in a single call.
df3 = pipe.fit_transform(df2_copy)
df3
# NOTE(review): the original note claimed df2 and df3 are identical. As written
# they are not: the manual cells built dummy columns with prefix_sep="__" and
# never scaled "gdp" or relabelled "day", whereas the pipeline also runs the
# "standard" and "day_transformer" steps. Only the "ordinal" and "quantitative"
# encodings are expected to match.