# Building function compositions for data processing

Creation of three basic functions and how to compose a pipeline

In [2]:
import numpy as np

Let's define a function to add 3 to each element of the array:

In [3]:
def add3(input_array):
    return map(lambda x: x+3, input_array)

Let's define a second function to multiply 2 with each element of the array:

In [4]:
def mul2(input_array):
    return map(lambda x: x*2, input_array)

Let's define a third function to subtract 5 from each element of the array:

In [5]:
def sub5(input_array):
    return map(lambda x: x-5, input_array)

Let's define a function composer that takes functions as input arguments and returns a compsed funciton. This composed funciton is basically a function athat applies all the input funciotns in sequence:

In [6]:
def function_composer(*args):
    return reduce(lambda f, g: lambda x: f(g(x)), args)

We use the `reduce` function to combine all the input functions by successively applying the functions in sequence.

We are not ready to play with this function composer. Let's define some data and a sequence of operations:

In [7]:
arr = np.array([2,5, 4, 7])

Operation: `add3(mul2(sub5(arr))))`

In [9]:
arr1 = add3(arr)
arr1

[5, 8, 7, 10]

In [10]:
arr2 = mul2(arr1)
arr2

[10, 16, 14, 20]

In [11]:
arr3 = sub5(arr2)
arr3

[5, 11, 9, 15]

Let's use the function composer to achieve the same thing in a single line:

In [12]:
func_composed = function_composer(sub5, mul2, add3)

In [14]:
func_composed(arr)

[5, 11, 9, 15]

# Building machine learning pipelines

scikit-learn library - has provisions to build machine learning pipelines

In this recipe, we will be building a pipeline to take the input feature vector, select the top k features, and then classify them useing a random forest classifier.

In [1]:
# 1. Create a new Python file, and import the following packages:
from sklearn.datasets import samples_generator
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

In [2]:
# 2. Let's generate some data to play with:
X, y = samples_generator.make_classification(n_informative=4, n_features=20, n_redundant=0, random_state=5)

In [3]:
X

array([[ 1.71357595,  1.43919732, -0.52094767, ...,  0.27490402,
         0.51469366,  2.03968004],
       [-1.06488863, -0.39326245, -0.91041446, ...,  0.04637225,
        -0.28661553,  0.69574385],
       [-0.89820155,  0.01047972, -0.87577198, ..., -0.86316078,
         1.54619738, -1.78406817],
       ..., 
       [ 0.61753879, -0.86202115,  0.20390332, ..., -1.13686311,
         0.15705362, -1.3560183 ],
       [ 0.21642588, -1.4321928 ,  1.95621557, ...,  1.17068241,
         0.40719796, -0.19133817],
       [ 0.83342107,  0.93122077, -0.55606396, ...,  1.87859673,
        -1.75380958,  0.51710249]])

In [4]:
y

array([1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1])

This line generated 20 dimensional feature vectors because this is the default value. you can change it using the `n_features` parameter in the previous line.

In [5]:
# Our first step of the pipeline is to select the k best features and before 
# the datapoint is used further. In this case, let's set k to 10:
selector_k_best = SelectKBest(f_regression, k=10)

In [6]:
# Random forest classifier
classifier = RandomForestClassifier(n_estimators=50, max_depth=4)

In [7]:
# build the machine learning pipeline
pipeline_classifier = Pipeline([('selector', selector_k_best), ('rf', classifier)])

In [8]:
pipeline_classifier.set_params(selector__k=6, rf__n_estimators=25)

Pipeline(steps=[('selector', SelectKBest(k=6, score_func=<function f_regression at 0x10b6b6848>)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=25, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [9]:
pipeline_classifier.fit(X, y)

Pipeline(steps=[('selector', SelectKBest(k=6, score_func=<function f_regression at 0x10b6b6848>)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=25, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [10]:
prediction = pipeline_classifier.predict(X)

In [11]:
prediction

array([1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1])

In [12]:
score = pipeline_classifier.score(X, y)

In [13]:
score

0.95999999999999996

In [14]:
# Print the selected features chosen by the selector
features_status = pipeline_classifier.named_steps['selector'].get_support()

In [15]:
selected_features = []

In [16]:
for count, item in enumerate(features_status):
    if item:
        selected_features.append(count)

In [17]:
print "\nSelected features (0-indexed): ", ', '.join([str(x) for x in selected_features])


Selected features (0-indexed):  0, 5, 9, 10, 11, 15


In [18]:
selected_features

[0, 5, 9, 10, 11, 15]