In [None]:
%matplotlib inline


# Concatenating multiple feature extraction methods


In many real-world examples, there are many ways to extract features from a
dataset. Often it is beneficial to combine several methods to obtain good
performance. This example shows how to use ``FeatureUnion`` to combine
features obtained by PCA and univariate selection.

Combining features using this transformer has the benefit that it allows
cross-validation and grid searches over the whole process.

The combination used in this example is not particularly helpful on this
dataset and is only used to illustrate the usage of ``FeatureUnion``.


In [2]:
# Author: Andreas Mueller <amueller@ais.uni-bonn.de>
#
# License: BSD 3 clause

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC

import helper.SeriesHelper as series_helper
from preprocessing.Normalize import Normalize

In [3]:
# Load the data matrix produced by the project's Normalize step and derive the
# model inputs from it.
normal_matrix = Normalize().get_normalized_data()
# Feature matrix as a plain array (normal_matrix exposes to_numpy(), so it is
# presumably a pandas object -- confirm against Normalize).
X = normal_matrix.to_numpy()
# Target values extracted by the project helper; presumably one relapse label
# per row of X -- verify against helper.SeriesHelper.
y = series_helper.get_relapse_value_from_series_matrix(normal_matrix)

# Create the hold-out split right after X and y exist: the cells below fit and
# score on X_train/X_test, and without this they only work if a later cell was
# executed first (NameError on Restart & Run All). The fixed seed makes the
# partition, and every score computed from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [100]:
# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=200)

# Maybe some original features were good, too?
selection = SelectKBest(k=300)

# Build estimator from PCA and Univariate selection:
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)], n_jobs=6)

# Use combined features to transform dataset. fit_transform fits and projects
# in a single pass instead of fitting and then transforming the same data again:
X_features = combined_features.fit_transform(X_train, y_train)
print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Do grid search over k, n_components and C:
pipeline = Pipeline([("features", combined_features), ("svm", svm)])

# NOTE(review): PCA rejects n_components larger than min(n_samples, n_features)
# of the data it is fitted on, and SelectKBest rejects k larger than the number
# of features. Inside the search each candidate is fitted on a CV training fold,
# which is smaller than X_train, so the largest values below may fail on this
# dataset -- confirm against the fold size before launching the full search.
param_grid = dict(
    features__pca__n_components=[100, 200, 300, 400, 500, 600, 700, 1000],
    features__univ_select__k=[100, 200, 300, 400, 500, 600, 700, 800, 1000],
    svm__C=[0.01, 0.1, 0.5, 1, 5, 10],
)

# Exhaustive search over the grid; verbose=10 prints progress for every fit.
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, n_jobs=6)
grid_search.fit(X_train, y_train)
print("***********Best Estimator :: ***********************")
print(grid_search.best_estimator_)

Combined space has 500 features
Fitting 3 folds for each of 432 candidates, totalling 1296 fits
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   14.0s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   18.2s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   55.1s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.6min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  2.7min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  3.2min
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed:  3.7min
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:  4.2min
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed:  5.0min
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:  5.8min
[Parallel(n_jobs=6)]: Done 133 tasks      | elapsed:  6.8min


In [96]:
# Score of the refit best pipeline on the training split (mean accuracy, since
# no custom scoring was passed to GridSearchCV).
grid_search.score(X_train,y_train)

0.7066167290886392

In [97]:
# Score of the refit best pipeline on the held-out test split; compare with the
# training score to gauge overfitting.
grid_search.score(X_test,y_test)

0.6717171717171717

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing. The seed is fixed because without
# random_state every execution draws a different partition, so the scores
# reported in this notebook could not be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [28]:
# NOTE(review): X contains the rows of X_train that the search was fitted on,
# so this figure blends training and test accuracy and is not an estimate of
# generalization performance.
print(grid_search.best_estimator_.score(X,y))

0.6950710108604845


In [86]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [93]:
# Two-stage classifier: keep the features an L2-penalized linear SVM weights
# most heavily, then classify the reduced data with k-nearest neighbours.
feature_selector = SelectFromModel(LinearSVC(penalty="l2", dual=False))
neighbors_classifier = KNeighborsClassifier()

clf = Pipeline(steps=[
    ('feature_selection', feature_selector),
    ('classification', neighbors_classifier),
])

# fit() returns the pipeline itself, so the cell displays its configuration.
clf.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None,
                                                     dual=False,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     loss='squared_hinge',
                                                     max_iter=1000,
                                                     multi_class='ovr',
                                                     penalty='l2',
                                                     random_state=None,
                                                     tol=0.0001, verbose=0),
                                 max_features=None, norm_order=1, prefit=False,
                                 threshold=None)),
                ('classification',
                 KNeighborsClassifier(algorithm='aut

In [94]:
# Mean accuracy of the feature-selection + k-NN pipeline on the training split.
clf.score(X_train,y_train)

0.7578027465667915

In [95]:
# Mean accuracy of the feature-selection + k-NN pipeline on the held-out test split.
clf.score(X_test,y_test)

0.6035353535353535

In [None]:

import multiprocessing
from itertools import product

def merge_names(a, b):
    """Return ``a`` and ``b`` rendered as a single ``'<a> & <b>'`` string."""
    template = '{} & {}'
    return template.format(a, b)

# Demo: pair every name with every name (including itself) across a small
# process pool.
# NOTE(review): worker processes must be able to import merge_names from
# __main__; with the "spawn" start method a function defined in a notebook cell
# may not be importable there -- confirm on the target platform.
if __name__ == '__main__':
    names = ['Brown', 'Wilson', 'Bartlett', 'Rivera', 'Molloy', 'Opie']
    # All ordered pairs of names; consumed lazily by starmap.
    name_pairs = product(names, repeat=2)
    with multiprocessing.Pool(processes=3) as pool:
        # starmap unpacks each (a, b) pair into merge_names(a, b).
        results = pool.starmap(merge_names, name_pairs)
    print(results)

# Output: ['Brown & Brown', 'Brown & Wilson', 'Brown & Bartlett', ...
