In [1]:
import sklearn.datasets
import pandas

In [2]:
from sklearn_pandas import DataFrameMapper, make_dataframe_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score

Gather a Tidy Dataframe
----

In [3]:
# Load the bundled iris dataset and arrange it as a single labelled frame:
# one column per measurement plus a "class" column holding the species name.
iris_data = sklearn.datasets.load_iris()

iris = pandas.DataFrame(iris_data["data"], columns=iris_data["feature_names"])

# Index the array of names with the target codes to get one label per row.
class_names = iris_data["target_names"]
iris["class"] = class_names[iris_data["target"]]

iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


Assemble a Simple Learning Pipeline
--------

A DataFramePipeline begins with a DataFrameMapper, which specifies how features **`X`** and targets **`y`** are extracted from an input frame. It ends with an estimator object.

In this case, extract each available feature without transformation and specify the class label as the target.

In [4]:
# Both pipelines use the same column mapping: every measurement column is
# extracted without transformation as a feature, and "class" is the target.
# Each pipeline is given its own DataFrameMapper instance.

# random_state is pinned so the forest's cross-validation scores and feature
# importances are reproducible across kernel restarts (the forest is the only
# stochastic estimator here when left unseeded).
forest_pipeline = make_dataframe_pipeline([
    DataFrameMapper(iris_data["feature_names"], "class"),
    RandomForestClassifier(n_estimators=200, random_state=0),
])

logistic_pipeline = make_dataframe_pipeline([
    DataFrameMapper(iris_data["feature_names"], "class"),
    LogisticRegression(),
])

Cross Validate
-----

Cross validation requires the target **`y`** in order to perform train-test splits. Use the pipeline's DataFrameMapper to extract the target array from the input frame.

In [9]:
# Score every pipeline with 5-fold cross validation on accuracy. The full
# frame is passed as X; y is pulled out of the same frame by each pipeline's
# own mapper. One column of fold scores per pipeline.
cross_val_result = pandas.DataFrame.from_dict({
    label: cross_val_score(
        estimator=pipeline,
        X=iris,
        y=pipeline._dataframe_mapper.extract_y(iris),
        cv=5,
        scoring="accuracy",
    )
    for label, pipeline in (
        ("forest", forest_pipeline),
        ("logistic", logistic_pipeline),
    )
})

cross_val_result.describe()

Unnamed: 0,forest,logistic
count,5.0,5.0
mean,0.96,0.96
std,0.027889,0.043461
min,0.933333,0.9
25%,0.933333,0.933333
50%,0.966667,0.966667
75%,0.966667,1.0
max,1.0,1.0


Extract Feature Metadata
----

The DataFrameMapper may be used to associate estimator metadata with feature source information. In this case, the `feature_importances_` vector is associated with the source column names.

In [11]:
# Fit on the full frame, then label each importance score with the name of
# the input column it was computed for.
forest_pipeline.fit(iris)

fitted_forest = forest_pipeline._final_estimator
source_columns = forest_pipeline._dataframe_mapper.X_columns_

pandas.Series(
    fitted_forest.feature_importances_,
    index=source_columns,
    name="feature_importances",
)

sepal length (cm)    0.111118
sepal width (cm)     0.028009
petal length (cm)    0.455807
petal width (cm)     0.405066
Name: feature_importances, dtype: float64