## Sample pipeline
This sample pipeline is drawn from a competition posted on [Kaggle](https://www.kaggle.com/ragharamya/loanprediction). It demonstrates a shared preprocessing step followed by several candidate classifiers that are explored in parallel.

In [1]:
# Turn off the Jedi backend for IPython tab completion in this kernel.
%config IPCompleter.use_jedi = False

In [2]:
import pandas as pd

# Load both competition files. Loan_ID is removed from the training frame
# right after reading it.
train = pd.read_csv('../resources/data/train_ctrUa4K.csv').drop(columns='Loan_ID')
test = pd.read_csv('../resources/data/test_lAUu6dG.csv')

# Show the column dtypes of the training frame.
train.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [3]:
# Prepare the dataset for training: separate the features from the
# Loan_Status target, then hold out 20% of the rows.
from sklearn.model_selection import train_test_split

X = train.drop('Loan_Status', axis=1)
y = train['Loan_Status']

# Pin the seed so the split, and therefore every downstream fit and
# prediction, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

## SKLearn pipeline
Below, we show how scikit-learn is used to build one pipeline per classifier and then fit each pipeline in turn to explore multiple models.

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric branch: median imputation followed by standard scaling.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
numeric_transformer = Pipeline([('imputer', imputer), ('scaler', scaler)])

# Categorical branch: fill gaps with the constant 'missing', then one-hot
# encode with handle_unknown='ignore'.
cat_imputer = SimpleImputer(fill_value='missing', strategy='constant')
cat_onehot = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline([('imputer', cat_imputer), ('onehot', cat_onehot)])

In [5]:
from sklearn.compose import ColumnTransformer

# Route columns by dtype: int64/float64 columns go to the numeric branch,
# object columns (excluding the Loan_Status target) to the categorical branch.
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = (
    train.drop(columns=['Loan_Status'])
    .select_dtypes(include=['object'])
    .columns
)

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Single end-to-end pipeline: the shared preprocessor feeding a random forest
# built with its default arguments.
rf_steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
]
rf = Pipeline(steps=rf_steps)

In [7]:
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier

# Candidate models to explore. The order matters because later cells index
# into this list by position.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(C=0.025, kernel="rbf", probability=True),
    NuSVC(probability=True),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
]

In [18]:
# Fit one pipeline (shared preprocessor + classifier) per candidate model and
# collect its predictions on the training set. classifier_results[i] holds the
# predictions of classifiers[i].
classifier_results = []
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    # Keep the predictions so the results list is actually populated.
    classifier_results.append(pipe.predict(X_train))

## CodeFlare pipelines
Below, we show how the same exploration can be done with the CodeFlare pipelines approach.

In [19]:
import ray
# Shut Ray down first, before the ray.init() call in the next cell.
ray.shutdown()

In [20]:
# Start Ray with default arguments; the value it returns is displayed below.
ray.init()

2021-06-20 11:53:04,117	INFO services.py:1269 -- View the Ray dashboard at http://127.0.0.1:8265


{'node_ip_address': '9.211.53.245',
 'raylet_ip_address': '9.211.53.245',
 'redis_address': '9.211.53.245:6379',
 'object_store_address': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928',
 'metrics_export_port': 64339,
 'node_id': '8c45750634c387b09b787fb7a0aa191df63f6d5274861bad27c00cb3'}

In [21]:
import codeflare.pipelines.Datamodel as dm

In [31]:
# Create the CodeFlare pipeline object that the nodes and edges are added to.
pipeline = dm.Pipeline()

In [32]:
# Root node: the shared preprocessing step.
node_a = dm.EstimatorNode('preprocess', preprocessor)

# One estimator node per candidate classifier, named node_0 .. node_4 after
# the classifier's position in `classifiers`.
classifier_nodes = [
    dm.EstimatorNode(f'node_{idx}', classifiers[idx]) for idx in range(5)
]
node_0, node_1, node_2, node_3, node_4 = classifier_nodes

# Fan out: connect the preprocessing node to every classifier node.
for classifier_node in classifier_nodes:
    pipeline.add_edge(node_a, classifier_node)

In [33]:
# create input
pipeline_input = dm.PipelineInput()
xy = dm.Xy(X_train, y_train)
pipeline_input.add_xy_arg(node_a, xy)

In [34]:
import codeflare.pipelines.Runtime as rt

In [35]:
from codeflare.pipelines.Runtime import ExecutionType

In [36]:
# Execute the pipeline in FIT mode on the prepared input.
pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)
# Look up the XY references the run produced for node_0.
node_0_output = pipeline_output.get_xyrefs(node_0)

In [37]:
# Display the XY references produced for node_0. `outputs` is never defined in
# this notebook, so use the variable assigned from get_xyrefs(node_0).
node_0_output

[<codeflare.pipelines.Datamodel.XYRef at 0x7f97087200d0>]

In [39]:
# Fetch the X payload behind node_0's first XY reference from Ray.
# `outputs` is never defined in this notebook; node_0_output holds the
# references returned by get_xyrefs(node_0).
X_out = ray.get(node_0_output[0].get_Xref())

In [40]:
# Display the array fetched for node_0.
X_out

array(['Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N',
       'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y