1- import pytest
2-
31import codeflare .pipelines .Datamodel as dm
42import codeflare .pipelines .Runtime as rt
53
64import numpy as np
7- from sklearn .preprocessing import FunctionTransformer
85from sklearn .preprocessing import MinMaxScaler
96import os
7+ import pandas as pd
8+ from sklearn .pipeline import Pipeline
9+ from sklearn .impute import SimpleImputer
10+ from sklearn .preprocessing import StandardScaler , OneHotEncoder
11+ from sklearn .ensemble import RandomForestClassifier , GradientBoostingClassifier
12+
13+ import ray
1014
1115
1216class FeatureUnion (dm .AndTransform ):
@@ -47,8 +51,7 @@ def test_save_load():
4751 r_fh = open (fname , 'rb' )
4852 saved_pipeline = dm .Pipeline .load (r_fh )
4953 pre_edges = saved_pipeline .get_pre_edges (node_c )
50- assert (len (pre_edges ) == 2 )
51-
54+ assert (len (pre_edges ) == 2 )
5255 os .remove (fname )
5356
5457
@@ -58,4 +61,76 @@ def test_runtime_save_load():
5861 captured accurately
5962 :return:
6063 """
61-
64+ train = pd .read_csv ('../../../resources/data/train_ctrUa4K.csv' )
65+ train = train .drop ('Loan_ID' , axis = 1 )
66+
67+ X = train .drop ('Loan_Status' , axis = 1 )
68+ y = train ['Loan_Status' ]
69+ from sklearn .model_selection import train_test_split
70+ X_train , X_test , y_train , y_test = train_test_split (X , y , test_size = 0.2 )
71+ imputer = SimpleImputer (strategy = 'median' )
72+ scaler = StandardScaler ()
73+
74+ numeric_transformer = Pipeline (steps = [
75+ ('imputer' , imputer ),
76+ ('scaler' , scaler )])
77+
78+ cat_imputer = SimpleImputer (strategy = 'constant' , fill_value = 'missing' )
79+ cat_onehot = OneHotEncoder (handle_unknown = 'ignore' )
80+
81+ categorical_transformer = Pipeline (steps = [
82+ ('imputer' , cat_imputer ),
83+ ('onehot' , cat_onehot )])
84+ numeric_features = train .select_dtypes (include = ['int64' , 'float64' ]).columns
85+ categorical_features = train .select_dtypes (include = ['object' ]).drop (['Loan_Status' ], axis = 1 ).columns
86+ from sklearn .compose import ColumnTransformer
87+ preprocessor = ColumnTransformer (
88+ transformers = [
89+ ('num' , numeric_transformer , numeric_features ),
90+ ('cat' , categorical_transformer , categorical_features )])
91+
92+ classifiers = [
93+ RandomForestClassifier (),
94+ GradientBoostingClassifier ()
95+ ]
96+ pipeline = dm .Pipeline ()
97+ node_pre = dm .EstimatorNode ('preprocess' , preprocessor )
98+ node_rf = dm .EstimatorNode ('random_forest' , classifiers [0 ])
99+ node_gb = dm .EstimatorNode ('gradient_boost' , classifiers [1 ])
100+
101+ pipeline .add_edge (node_pre , node_rf )
102+ pipeline .add_edge (node_pre , node_gb )
103+
104+ import ray
105+ ray .shutdown ()
106+ ray .init ()
107+ pipeline_input = dm .PipelineInput ()
108+ xy = dm .Xy (X_train , y_train )
109+ pipeline_input .add_xy_arg (node_pre , xy )
110+
111+ pipeline_output = rt .execute_pipeline (pipeline , rt .ExecutionType .FIT , pipeline_input )
112+ node_rf_xyrefs = pipeline_output .get_xyrefs (node_rf )
113+
114+ # save this pipeline for random forest and load and then predict on test data
115+ fname = 'random_forest.cfp'
116+ w_fh = open (fname , 'wb' )
117+ rt .save (pipeline_output , node_rf_xyrefs [0 ], w_fh )
118+ w_fh .close ()
119+
120+ # load it
121+ r_fh = open (fname , 'rb' )
122+ saved_pipeline = dm .Pipeline .load (r_fh )
123+ nodes = saved_pipeline .get_nodes ()
124+ # this should not exist in the saved pipeline
125+ assert (node_gb .get_node_name () not in nodes .keys ())
126+
127+ # should be predictable as well
128+ predict_pipeline_input = dm .PipelineInput ()
129+ predict_pipeline_input .add_xy_arg (node_pre , dm .Xy (X_test , y_test ))
130+ try :
131+ predict_pipeline_output = rt .execute_pipeline (saved_pipeline , rt .ExecutionType .PREDICT , predict_pipeline_input )
132+ predict_pipeline_output .get_xyrefs (node_rf )
133+ except Exception :
134+ assert False
135+
136+ os .remove (fname )
0 commit comments