In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  StandardScaler, LabelBinarizer
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn2pmml import make_pmml_pipeline, sklearn2pmml

In [None]:
# Read the comma-separated census income file into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='census_income.csv',
    sep=',',
)

Select the feature columns and split the data into training and test sets:

In [None]:
# Predictor columns taken from the census data.
features = ['age','workclass','sex','hours-per-week']

# Target is 1 where the 'outcome' string contains '>50K', otherwise 0.
# 25% of the rows are held out for testing. random_state is fixed so the
# same rows land in train/test on every Restart & Run All, which keeps
# everything computed downstream reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    data['outcome'].str.contains('>50K').astype(int),
    test_size=0.25,
    random_state=42)

Define pre-processing steps for categorical and numerical variables and specify pipeline:

In [None]:
# Partition the feature columns by dtype. select_dtypes is used instead of
# comparing against 'object' so that text columns stored with a dedicated
# string or categorical dtype are still routed to the categorical branch
# rather than to the median imputer.
num_cols = X_train.select_dtypes(include='number').columns.tolist()
cat_cols = X_train.select_dtypes(exclude='number').columns.tolist()

# Numeric columns: fill gaps with the column median, then standardise.
# Non-numeric columns: binarise the labels with LabelBinarizer.
# df_out=True makes the mapper hand a DataFrame to the next pipeline step.
numeric_steps = [
    ([col], [SimpleImputer(strategy='median'), StandardScaler()])
    for col in num_cols
]
categorical_steps = [([col], LabelBinarizer()) for col in cat_cols]

mapper = DataFrameMapper(numeric_steps + categorical_steps, df_out=True)

# NOTE(review): k=10 presumes the mapper emits at least 10 columns -- confirm
# against the encoded width for the chosen features.
# random_state pins the classifier's weight initialisation and shuffling so
# repeated runs of the notebook fit the same model.
pipeline = Pipeline([
    ('mapper', mapper),
    ('selector', SelectKBest(k=10)),
    ('classifier', MLPClassifier(random_state=42))
])

Fit the pipeline to the training data:

In [None]:
# Train every stage of the pipeline (mapper, selector, classifier) on the
# training split; the fitted pipeline is the cell's displayed result.
pipeline.fit(X=X_train, y=y_train)

Apply the fitted pipeline to the test set to obtain predictions:

In [None]:
# Run the held-out rows through the fitted pipeline to get class predictions.
predictions = pipeline.predict(X=X_test)

Convert pipeline to PMML and store to disk for use elsewhere:

In [None]:
# Convert the scikit-learn pipeline object into the form sklearn2pmml exports.
# NOTE(review): presumably relies on `pipeline` already being fitted by the
# earlier cell -- confirm the notebook is run top to bottom.
PMML_pipeline = make_pmml_pipeline(pipeline)

# Write the PMML document to "PMML_pipeline.pmml"; the path is relative, so
# the file lands in the kernel's current working directory.
# NOTE(review): sklearn2pmml is understood to invoke a Java-based converter --
# confirm a Java runtime is available in the execution environment.
sklearn2pmml(PMML_pipeline, "PMML_pipeline.pmml")