# Pipeline

In [37]:
import pandas as pd
import numpy as np


# Toy dataset: categorical columns k0/k1/k2, numeric columns v1/v2,
# a free-text column s1 and a YES/NO target. None marks a missing entry.
data = pd.DataFrame(
    data=[
        ["<10", 'a', 1, 'b', 2, "long text", "YES"],
        [">10", 'b', 2, 'c', 3, "not very long text", "NO"],
        ["<10", None, 2, 'c', None, "very long text", "YES"],
        [">10", 'a', 2, 'c', 4, "very short text", "YES"],
        [">10", 'a', 2, 'c', 7, "very short text", "YES"],
        [">10", 'a', 5, 'c', 4, "very short text", "YES"],
        ["<10", 'a', 11, 'c', 4, "very short text", "YES"],
        [">10", 'a', 12, 'c', 4, "very short text", "YES"],
        ["<10", 'b', 2, 'a', None, "short text", "NO"],
        ["<10", 'a', 7, 'c', 8, "very short text", "YES"],
        [">10", 'a', 1, 'c', 5, "very short text", "YES"],
        ["<10", 'b', 5, 'a', None, "short text", "NO"],
    ],
    columns=['k0', 'k1', 'v1', 'k2', 'v2', 's1', 'target'],
)

# Cast the text column to str so every entry is a Python string.
data = data.astype({'s1': str})
data.head()

Unnamed: 0,k0,k1,v1,k2,v2,s1,target
0,<10,a,1,b,2.0,long text,YES
1,>10,b,2,c,3.0,not very long text,NO
2,<10,,2,c,,very long text,YES
3,>10,a,2,c,4.0,very short text,YES
4,>10,a,2,c,7.0,very short text,YES


In [38]:
data.dtypes

k0         object
k1         object
v1          int64
k2         object
v2        float64
s1         object
target     object
dtype: object

# Zad
Sprawdzamy, czy mamy jakieś missing data.

In [39]:
data.isnull().sum()

k0        0
k1        1
v1        0
k2        0
v2        3
s1        0
target    0
dtype: int64

# Zad
Teraz rzućmy okiem na wszystkie atrybuty kategoryczne:

In [40]:
data["k0"].value_counts()

>10    6
<10    6
Name: k0, dtype: int64

In [41]:
data["k1"].value_counts()

a    8
b    3
Name: k1, dtype: int64

In [42]:
data["k2"].value_counts()

c    9
a    2
b    1
Name: k2, dtype: int64

In [43]:
data["s1"].value_counts()

very short text       7
short text            2
very long text        1
not very long text    1
long text             1
Name: s1, dtype: int64

# Zad

Podzielmy atrybuty na część 
* objaśniającą $X$
* objaśnianą $y$

In [44]:
X = data.drop(['target'], axis=1)
y = data['target'].values
print(X.shape)
print(y.shape)

(12, 6)
(12,)


# Zad 
Przyjrzyjmy się $y$. Musi to być kolumna numeryczna z labelami.

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

In [45]:
print(y)

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

print("X.shape: {} y.shape: {}".format(X.shape, y.shape))

print(y)

['YES' 'NO' 'YES' 'YES' 'YES' 'YES' 'YES' 'YES' 'NO' 'YES' 'YES' 'NO']
X.shape: (12, 6) y.shape: (12,)
[1 0 1 1 1 1 1 1 0 1 1 0]


In [46]:
# yy = np.array(['YES', 'NO', 'YES', 'NO', "NO"], dtype=object)
# yy
# label_encoder.transform(yy)

# Zad
Podzielmy zbiór na train/test

In [47]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Teraz zbudujmy nasze **pipeline** preprocessingu. 

# Zad

* Inaczej będziemy postępować z danymi: 
  * kategorycznymi, 
  * numerycznymi.
  
Wykorzystamy **DataframeSelector** aby wybrać określone atrybuty z DataFrame.

* since Scikit-Learn doesn't handle DataFrames yet

## Zaczniemy od wybrania kolumn numerycznych

In [48]:
from sklearn.base import BaseEstimator, TransformerMixin

# A class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the requested columns of its input.

    Parameters
    ----------
    attribute_names
        Column selection handed to ``X[...]`` in ``transform``; the notebook
        passes lists of column labels such as ``['v1', 'v2']``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured ``attribute_names``."""
        wanted = self.attribute_names
        return X[wanted]

# Zad. 
Sprawdźmy jak działa:
* wybierz kategorie ['v1']
* wybierz kategorie ['v1', 'v2']

Zauważ, że nasz Pipeline może sam usuwać niektóre kolumny poprzez niewybieranie ich.


In [49]:
from sklearn.pipeline import Pipeline
pipeline_1 = Pipeline([
        ("select_cat", DataFrameSelector(['v1']))
    ])

pipeline_1.fit_transform(X_train)

Unnamed: 0,v1
1,2
11,5
4,2
7,12
3,2
6,11


In [50]:
from sklearn.pipeline import Pipeline
pipeline_1 = Pipeline([
        ("select_cat", DataFrameSelector(['v1', 'v2']))
    ])

pipeline_1.fit_transform(X_train)

Unnamed: 0,v1,v2
1,2,3.0
11,5,
4,2,7.0
7,12,4.0
3,2,4.0
6,11,4.0


# Zad
Zbudujmy ostateczny **pipeline** dla atrybutów numerycznych:
* wybierz kategorie ['v1', 'v2']
* uzupełnia missing data (imputacja medianą)
```python
from sklearn.impute import SimpleImputer
```

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# imputer = SimpleImputer(strategy="median")

num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(['v1', 'v2'])),
        ("imputer", SimpleImputer(strategy="median")),
    ])

In [52]:
num_pipeline.fit_transform(X_train)

array([[ 2.,  3.],
       [ 5.,  4.],
       [ 2.,  7.],
       [12.,  4.],
       [ 2.,  4.],
       [11.,  4.]])

## Kolumny kategoryczne

# Zad. 
Sprawdźmy jak działa wybór kolumn:
* wybierz kategorie ['k0', 'k1', 'k2']
* wybierz kategorie ['k0', 'k1', 'k2', 's1']

Zauważ, że nasz Pipeline sam usuwa niektóre kolumny poprzez niewybieranie ich.


In [53]:
pipeline_1 = Pipeline([
        ("select_cat", DataFrameSelector(['k0', 'k1', 'k2']))
    ])

pipeline_1.fit_transform(X_train)

Unnamed: 0,k0,k1,k2
1,>10,b,c
11,<10,b,a
4,>10,a,c
7,>10,a,c
3,>10,a,c
6,<10,a,c


In [54]:
pipeline_1 = Pipeline([
        ("select_cat", DataFrameSelector(['k0', 'k1', 'k2', 's1']))
    ])

pipeline_1.fit_transform(X_train)

Unnamed: 0,k0,k1,k2,s1
1,>10,b,c,not very long text
11,<10,b,a,short text
4,>10,a,c,very short text
7,>10,a,c,very short text
3,>10,a,c,very short text
6,<10,a,c,very short text


# Zad.
Będziemy także potrzebować imputera do kategorycznych kolumn napisowych (zwykły Imputer nie działa na tych kolumnach).

### a)
wylicz najczęściej występującą wartość w każdej kolumnie np.array


In [55]:
[X_train[c].value_counts().index[0] for c in X_train]

['>10', 'a', 2, 'c', 4.0, 'very short text']

### b)
Wypełnij missing data najczęściej występującymi elementami.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html

In [56]:
# Fill every gap with the most frequent value of its column. The fill Series
# is indexed by X_train's own columns (not X's) so the labels always match
# the frame whose values were counted and which is being filled.
X_train.fillna(
    pd.Series(
        [X_train[c].value_counts().index[0] for c in X_train],
        index=X_train.columns,
    )
)

Unnamed: 0,k0,k1,v1,k2,v2,s1
1,>10,b,2,c,3.0,not very long text
11,<10,b,5,a,4.0,short text
4,>10,a,2,c,7.0,very short text
7,>10,a,12,c,4.0,very short text
3,>10,a,2,c,4.0,very short text
6,<10,a,11,c,4.0,very short text


### c)
Zbuduj transformer wypełniający missing data najczęściej występującymi elementami.


In [57]:
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with the most frequent value of each column.

    ``fit`` records, per column, the first entry of ``value_counts()``;
    ``transform`` hands that Series to ``DataFrame.fillna``, which fills
    each column with the value stored under its label.

    Attributes
    ----------
    most_frequent_ : pandas.Series
        Fill value per column label. A column that had no non-missing value
        during ``fit`` gets no entry and is therefore left unfilled.
    """

    def fit(self, X, y=None):
        """Learn the most frequent value of every column of ``X``.

        Returns
        -------
        self
        """
        labels, fill_values = [], []
        for column in X.columns:
            counts = X[column].value_counts()
            # value_counts() drops missing values, so an all-missing column
            # gives an empty result; taking index[0] of it would raise
            # IndexError. Skip such columns instead of failing the whole fit.
            if counts.empty:
                continue
            labels.append(column)
            fill_values.append(counts.index[0])

        if labels:
            self.most_frequent_ = pd.Series(fill_values, index=labels)
        else:
            # Explicit dtype keeps the empty Series well-defined.
            self.most_frequent_ = pd.Series([], index=[], dtype=object)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with gaps filled from the values learned in ``fit``."""
        return X.fillna(self.most_frequent_)

### d)
Zobaczmy jak działa nasz Pipeline

In [58]:
pipeline_3 = Pipeline([
        ("select_cat", DataFrameSelector(['k0', 'k1', 'k2', 's1'])),
        ("imputer", MostFrequentImputer())
    ])

In [59]:
X_train

Unnamed: 0,k0,k1,v1,k2,v2,s1
1,>10,b,2,c,3.0,not very long text
11,<10,b,5,a,,short text
4,>10,a,2,c,7.0,very short text
7,>10,a,12,c,4.0,very short text
3,>10,a,2,c,4.0,very short text
6,<10,a,11,c,4.0,very short text


In [60]:
pipeline_3.fit_transform(X_train)

Unnamed: 0,k0,k1,k2,s1
1,>10,b,c,not very long text
11,<10,b,a,short text
4,>10,a,c,very short text
7,>10,a,c,very short text
3,>10,a,c,very short text
6,<10,a,c,very short text


# Dane kategoryczne
Niektóre wartości kategoryczne są w naturalnej kolejności/porządku. Można te dane sortować/porządkować w kolejności rosnącej/malejącej np. w kolumnie <tt> grade </tt> przydzielona jest ocena od A do G gdzie A oznacza mniej ryzykowną inwestycję niż B.

$A <B <C <D <E <F <G$

Niektórych wartości nominalnych nie można uporządkować, np. kolumna <tt> purpose</tt>.
Nie można napisać:

$car < wedding < education < moving < house$


Teraz możemy zbudować pipeline dla atrybutów kategorycznych.

We can convert each categorical value to a one-hot vector using a OneHotEncoder. Since Scikit-Learn 0.20 this class handles string categorical inputs directly (see PR https://github.com/scikit-learn/scikit-learn/issues/10521), so we import it straight from sklearn.preprocessing.

# Zad
Zamieńmy kolumny ['k1', 'k2'] na One Hot Encoding

In [61]:
import sklearn
print(sklearn.__version__)

# from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

1.0


In [62]:
X_train.head()

Unnamed: 0,k0,k1,v1,k2,v2,s1
1,>10,b,2,c,3.0,not very long text
11,<10,b,5,a,,short text
4,>10,a,2,c,7.0,very short text
7,>10,a,12,c,4.0,very short text
3,>10,a,2,c,4.0,very short text


In [63]:
pipeline_4 = Pipeline([
        ("select_cat", DataFrameSelector(['k1', 'k2'])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False, handle_unknown = 'ignore')),
    ])

pipeline_4.fit_transform(X_train)

array([[0., 1., 0., 1.],
       [0., 1., 1., 0.],
       [1., 0., 0., 1.],
       [1., 0., 0., 1.],
       [1., 0., 0., 1.],
       [1., 0., 0., 1.]])

# Zad
Zamieńmy kolumnę ['k0'] na odpowiednio:

* '<10': 1
* '>10': 2

In [64]:
X_train

Unnamed: 0,k0,k1,v1,k2,v2,s1
1,>10,b,2,c,3.0,not very long text
11,<10,b,5,a,,short text
4,>10,a,2,c,7.0,very short text
7,>10,a,12,c,4.0,very short text
3,>10,a,2,c,4.0,very short text
6,<10,a,11,c,4.0,very short text


### a) 
Zamień elementy z 
```python
X_train 
```
za pomocą słownika
```python
dic = {'<10': 1, '>10': 2}
```

In [65]:
dic = {'<10': 1, '>10': 2}
X_train.replace(dic)

Unnamed: 0,k0,k1,v1,k2,v2,s1
1,2,b,2,c,3.0,not very long text
11,1,b,5,a,,short text
4,2,a,2,c,7.0,very short text
7,2,a,12,c,4.0,very short text
3,2,a,2,c,4.0,very short text
6,1,a,11,c,4.0,very short text


# Zad
Wykonaj transformer

In [66]:
class DictionaryEncoder(BaseEstimator, TransformerMixin):
    """Replace values through a fixed mapping and return the underlying array.

    Parameters
    ----------
    dictionary
        Mapping passed unchanged to ``X.replace`` in ``transform``
        (e.g. ``{'<10': 1, '>10': 2}`` in this notebook).
    """

    def __init__(self, dictionary):
        self.dictionary = dictionary

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Apply the mapping via ``X.replace`` and return ``.values``."""
        replaced = X.replace(self.dictionary)
        return replaced.values

In [67]:
pipeline_5 = Pipeline([
        ("select_cat", DataFrameSelector(['k0'])),
        ("dictionary_encoder", DictionaryEncoder({'<10': 1, '>10': 2})),
    ])

pipeline_5.fit_transform(X_train)

array([[2],
       [1],
       [2],
       [2],
       [2],
       [1]], dtype=int64)

# Zad
Niektóre atrybuty są ciężkie do przerobienia na format liczbowy np. teksty.

* Zamieńmy kolumnę  ['s1'] na kolumnę mówiącą ile zawiera słów.


In [68]:
[len(text) for text in  data.s1]

[9, 18, 14, 15, 15, 15, 15, 15, 10, 15, 15, 10]

In [69]:
# Two-row example frame used to try out word counting on every column.
data_temp = pd.DataFrame(
    [["<10", "long text"],
     [">10", "very long text"]],
    columns=['k0', 'k1'],
)

# One output column per input column, holding the number of
# whitespace-separated tokens in each cell.
X_res = pd.DataFrame({
    name: [len(cell.split()) for cell in data_temp[name]]
    for name in data_temp.columns
})
X_res

Unnamed: 0,k0,k1
0,1,2
1,1,3


In [70]:
X_train

Unnamed: 0,k0,k1,v1,k2,v2,s1
1,>10,b,2,c,3.0,not very long text
11,<10,b,5,a,,short text
4,>10,a,2,c,7.0,very short text
7,>10,a,12,c,4.0,very short text
3,>10,a,2,c,4.0,very short text
6,<10,a,11,c,4.0,very short text


In [73]:
class WordNumberEncoder(BaseEstimator, TransformerMixin):
    """Replace every text cell by its number of whitespace-separated tokens."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return an array with one word-count column per column of ``X``.

        Each cell must provide ``.split()``; the count is ``len(cell.split())``.
        """
        word_counts = pd.DataFrame({
            column: [len(cell.split()) for cell in X[column]]
            for column in X.columns
        })
        return word_counts.values

In [74]:
pipeline_6 = Pipeline([
        ("select_cat", DataFrameSelector(['s1'])),
        ("dictionary_encoder", WordNumberEncoder()),
    ])

pipeline_6.fit_transform(X_train)  

array([[4],
       [2],
       [3],
       [3],
       [3],
       [3]], dtype=int64)

# Zad
Na koniec połączmy powyższe podejścia

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html

In [76]:
# Numeric columns: select v1/v2 and fill gaps with the column median.
num_pipeline = Pipeline(steps=[
    ("select_numeric", DataFrameSelector(['v1', 'v2'])),
    ("imputer", SimpleImputer(strategy="median")),
])

# Nominal columns k1/k2: most-frequent imputation, then one-hot encoding.
# NOTE(review): `sparse=False` is the spelling for the scikit-learn 1.0
# printed earlier in this notebook; newer releases name the argument
# `sparse_output` -- confirm before upgrading.
cat_one_hot_pipeline = Pipeline(steps=[
    ("select_cat", DataFrameSelector(['k1', 'k2'])),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# Column k0: map '<10' -> 1 and '>10' -> 2.
cat_dictionary_pipeline = Pipeline(steps=[
    ("select_cat", DataFrameSelector(['k0'])),
    ("dictionary_encoder", DictionaryEncoder({'<10': 1, '>10': 2})),
])

# Text column s1: replace each text by its word count.
cat_word_number_pipeline = Pipeline(steps=[
    ("select_cat", DataFrameSelector(['s1'])),
    ("dictionary_encoder", WordNumberEncoder()),
])

In [77]:
from sklearn.pipeline import FeatureUnion

preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_one_hot_pipeline", cat_one_hot_pipeline),
        ("cat_dictionary_pipeline", cat_dictionary_pipeline),
        ("cat_word_number_pipeline", cat_word_number_pipeline)
    ])

Teraz mamy fajny **pipeline** przetwarzania wstępnego, który pobiera dane wejściowe i zwraca dane wyjściowe złożone z liczb, które możemy podać do dowolnego modelu uczenia maszynowego.

In [78]:
preprocess_pipeline.fit_transform(X_train)

array([[ 2.,  3.,  0.,  1.,  0.,  1.,  2.,  4.],
       [ 5.,  4.,  0.,  1.,  1.,  0.,  1.,  2.],
       [ 2.,  7.,  1.,  0.,  0.,  1.,  2.,  3.],
       [12.,  4.,  1.,  0.,  0.,  1.,  2.,  3.],
       [ 2.,  4.,  1.,  0.,  0.,  1.,  2.,  3.],
       [11.,  4.,  1.,  0.,  0.,  1.,  1.,  3.]])

# Zad
Wykonaj reprezentację kolumny ['s1'] za pomocą 

* CountVectorizer
```python
from sklearn.feature_extraction.text import CountVectorizer
```

* TfidfVectorizer
```python
from sklearn.feature_extraction.text import TfidfVectorizer 
```

 

In [41]:
X_train

Unnamed: 0,k0,k1,v1,k2,v2,s1
1,>10,b,2,c,3.0,not very long text
11,<10,b,5,a,,short text
4,>10,a,2,c,7.0,very short text
7,>10,a,12,c,4.0,very short text
3,>10,a,2,c,4.0,very short text
6,<10,a,11,c,4.0,very short text


In [45]:
from sklearn.feature_extraction.text import CountVectorizer
class ToListEncoder(BaseEstimator, TransformerMixin):
    """Turn the first column of the input into a plain Python list.

    Used here to feed a single text column to a text vectorizer.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the first entry of ``X.values.T.tolist()``."""
        transposed = X.values.T
        return transposed.tolist()[0]


pipeline_7 = Pipeline([
        ("select_cat", DataFrameSelector(['s1'])),
        ("to_numpy", ToListEncoder()),
        ("dictionary_encoder", CountVectorizer()),
    ])

X_tr = pipeline_7.fit_transform(X_train)
X_tr
X_tr.todense()

matrix([[1, 1, 0, 1, 1],
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 1],
        [0, 0, 1, 1, 1],
        [0, 0, 1, 1, 1],
        [0, 0, 1, 1, 1]], dtype=int64)

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same definition as the ToListEncoder in the CountVectorizer cell; repeated
# here so this cell does not depend on that one having been run.
class ToListEncoder(BaseEstimator, TransformerMixin):
    """Turn the first column of the input into a plain Python list."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the first entry of ``X.values.T.tolist()``."""
        per_column = X.values.T.tolist()
        return per_column[0]


pipeline_8 = Pipeline([
        ("select_cat", DataFrameSelector(['s1'])),
        ("to_numpy", ToListEncoder()),
        ("dictionary_encoder", TfidfVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b')),
    ])

X_tr = pipeline_8.fit_transform(X_train)
X_tr
X_tr.todense()

matrix([[0.91399636, 0.        , 0.40572238],
        [0.        , 0.75577461, 0.65483184],
        [0.        , 0.75577461, 0.65483184],
        [0.        , 0.75577461, 0.65483184],
        [0.        , 0.75577461, 0.65483184],
        [0.        , 0.75577461, 0.65483184]])

# Zad

Robimy StratifiedKFold i znajdujemy optymalne parametry dla

* SVM liniowego

### Oczywiście na 5 punktach to nie ma sensu ale pokażemy jak to się robi


In [80]:
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=2)

In [81]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
pipe = Pipeline([
    ('preprocessing', preprocess_pipeline), 
    ('classifier', SVC(kernel='linear'))])


param_grid = {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_1 = GridSearchCV(pipe, param_grid, cv=kfold)

grid_1.fit(X_train, y_train)
grid_1.best_params_

{'classifier__C': 0.001}