In [1]:
# Turn off the Jedi backend for IPython tab completion in this kernel session.
%config IPCompleter.use_jedi = False

In [2]:
import pandas as pd
# Read the training CSV and discard the Loan_ID column in the same expression;
# the test CSV is loaded unchanged.
train = pd.read_csv('../resources/data/train_ctrUa4K.csv').drop(columns='Loan_ID')
test = pd.read_csv('../resources/data/test_lAUu6dG.csv')

# Show the dtype of every remaining training column as the cell output.
train.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [3]:
train.shape

(614, 12)

In [4]:
from sklearn.model_selection import train_test_split

# Separate the predictor columns from the Loan_Status target.
X = train.drop(columns='Loan_Status')
y = train['Loan_Status']

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split -- and therefore every model fitted on it further down -- reproducible
# under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric branch: replace missing values with the column median, then standardize.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
numeric_transformer = Pipeline([
    ('imputer', imputer),
    ('scaler', scaler),
])

# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
cat_imputer = SimpleImputer(fill_value='missing', strategy='constant')
cat_onehot = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline([
    ('imputer', cat_imputer),
    ('onehot', cat_onehot),
])

In [6]:
from sklearn.compose import ColumnTransformer

# Partition the columns by dtype. Loan_Status is the target (an object column),
# so it is removed from the categorical column index.
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = (
    train.select_dtypes(include=['object']).columns.drop('Loan_Status')
)

# Route each group of columns through its matching transformer pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [7]:
import time

# Time a standalone fit of the preprocessing stage on the training split.
# fit() returns the fitted transformer itself (not transformed data), so the
# result is intentionally not bound to a data-like name such as Xt.
# perf_counter() is the clock intended for measuring short intervals; unlike
# time.time() it is unaffected by system clock adjustments.
start = time.perf_counter()
preprocessor.fit(X_train)
end = time.perf_counter()
print('Time taken: ' + str(end - start))

Time taken: 0.016479015350341797


In [8]:
from sklearn.ensemble import RandomForestClassifier
# End-to-end model: the shared preprocessor followed by a random forest with
# default hyper-parameters.
rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
])

In [9]:
from sklearn.base import ClassifierMixin
from sklearn.base import BaseEstimator

class ScaleTestEstimator(ClassifierMixin, BaseEstimator):
    """Classifier wrapper that refits its inner classifier several times per ``fit``.

    Repeating the fit multiplies the training cost while leaving the estimator
    interface untouched, which gives a heavier workload for the timing
    comparisons made in this notebook.

    Parameters
    ----------
    num_iters : int, default=100
        How many times the wrapped classifier is fitted on each ``fit`` call.
        Must be at least 1.
    classifier : estimator, default=None
        The classifier to wrap. It must be provided before ``fit`` is called.
        It is fitted in place (not cloned), so its fitted attributes can be
        read from ``self.classifier`` after ``fit``.
    """

    def __init__(self, num_iters=100, classifier: ClassifierMixin = None):
        # Parameters are stored verbatim (no validation here) so that
        # get_params / set_params / clone from BaseEstimator round-trip them.
        self.num_iters = num_iters
        self.classifier = classifier

    def fit(self, X, y):
        """Fit the wrapped classifier ``num_iters`` times on ``(X, y)``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If no classifier was supplied or ``num_iters`` is smaller than 1
            (in which case the wrapped classifier would never be fitted).
        """
        if self.classifier is None:
            raise ValueError("ScaleTestEstimator requires a classifier to fit")
        if self.num_iters < 1:
            raise ValueError(
                "num_iters must be >= 1, got %r" % (self.num_iters,))
        for _ in range(self.num_iters):
            self.classifier.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped classifier's predictions for ``X``."""
        return self.classifier.predict(X)

    def score(self, X, y, sample_weight=None):
        """Return the wrapped classifier's own ``score`` for ``(X, y)``."""
        # Forward by keyword so the weight cannot land in a different
        # positional slot of the wrapped estimator's score signature.
        return self.classifier.score(X, y, sample_weight=sample_weight)

In [10]:
# Fit the preprocessor on the training split and keep the transformed features.
Xt = preprocessor.fit_transform(X_train)

In [11]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Candidate models, each with default settings unless stated otherwise.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(C=0.025, kernel="rbf", probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]

In [12]:
# Display the first entry of the candidate list.
classifiers[0]

KNeighborsClassifier(n_neighbors=3)

In [13]:
# Wrap three tree-based models so that every fit() call trains the model 50
# times; the wrappers are built in the order decision tree, random forest,
# gradient boosting.
c_a, c_b, c_c = (
    ScaleTestEstimator(50, model_cls())
    for model_cls in (
        DecisionTreeClassifier,
        RandomForestClassifier,
        GradientBoostingClassifier,
    )
)
classifiers = [c_a, c_b, c_c]

In [14]:
import sklearn.base as base

In [15]:
# Check that scikit-learn recognises the wrapper as a classifier.
base.is_classifier(c_a)

True

In [16]:
# Check that the wrapper can be cloned from its constructor parameters.
base.clone(c_a)

ScaleTestEstimator(classifier=DecisionTreeClassifier(), num_iters=50)

In [17]:
import time

# perf_counter() is the clock intended for interval timing; it is not affected
# by system clock adjustments the way time.time() is.
start = time.perf_counter()

# Fresh wrappers so the timing is not influenced by earlier fits.
c_a = ScaleTestEstimator(50, DecisionTreeClassifier())
c_b = ScaleTestEstimator(50, RandomForestClassifier())
c_c = ScaleTestEstimator(50, GradientBoostingClassifier())
classifiers = [c_a, c_b, c_c]

# Fit each wrapped model behind the shared preprocessor, one after another,
# and keep its training-set predictions (one entry per classifier, in the
# order of `classifiers`) instead of discarding them.
classifier_results = []
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    classifier_results.append(pipe.predict(X_train))

end = time.perf_counter()
tt = end - start
print('time taken: ' + str(tt))

time taken: 12.496086835861206


In [18]:
# Feature importances of the decision tree wrapped by c_a; the wrapper fits its
# classifier in place, so the fitted attribute is available on c_a.classifier.
c_a.classifier.feature_importances_

array([0.21490202, 0.06557283, 0.23722477, 0.03139458, 0.31057042,
       0.00455733, 0.        , 0.00756582, 0.02029537, 0.00757656,
       0.        , 0.01031702, 0.00563211, 0.00079753, 0.00417938,
       0.        , 0.0051656 , 0.        , 0.01886081, 0.01271623,
       0.02171503, 0.        , 0.01450846, 0.00644814])

In [19]:
import ray
# Shut Ray down first so the ray.init() in the next cell starts from a clean state.
ray.shutdown()

In [20]:
# Initialise Ray with default arguments; the returned connection info is shown
# as the cell output.
ray.init()

2021-05-25 15:58:35,795	INFO services.py:1269 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m


{'node_ip_address': '9.163.5.112',
 'raylet_ip_address': '9.163.5.112',
 'redis_address': '9.163.5.112:47719',
 'object_store_address': '/tmp/ray/session_2021-05-25_15-58-33_878617_17386/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-05-25_15-58-33_878617_17386/sockets/raylet',
 'webui_url': '127.0.0.1:8266',
 'session_dir': '/tmp/ray/session_2021-05-25_15-58-33_878617_17386',
 'metrics_export_port': 64892,
 'node_id': '48193ca06c6cfff4e2763a58cdef41cf1cebcad123a7d0f560d7dadd'}

In [21]:
import codeflare.pipelines.Datamodel as dm

In [22]:
# New codeflare pipeline object; nodes and edges are attached in the following cells.
pipeline = dm.Pipeline()

In [23]:
# Wrap the preprocessor and the three ScaleTestEstimator instances as named
# pipeline nodes (the first argument is the node's name).
node_a = dm.EstimatorNode('preprocess', preprocessor)
node_b = dm.EstimatorNode('c_a', c_a)
node_c = dm.EstimatorNode('c_b', c_b)
node_d = dm.EstimatorNode('c_c', c_c)

In [24]:
# Fan out from the preprocess node to each of the three classifier nodes.
pipeline.add_edge(node_a, node_b)
pipeline.add_edge(node_a, node_c)
pipeline.add_edge(node_a, node_d)

In [25]:
# Create the pipeline input: the training split (X_train, y_train) is attached
# to the preprocess node, which is the entry point of the graph built above.
pipeline_input = dm.PipelineInput()
xy = dm.Xy(X_train, y_train)
pipeline_input.add_xy_arg(node_a, xy)

In [26]:
import codeflare.pipelines.Runtime as rt

In [27]:
from codeflare.pipelines.Runtime import ExecutionType

In [28]:
# Time the FIT execution of the whole pipeline graph through the codeflare
# runtime. perf_counter() is the clock intended for interval timing; it is not
# affected by system clock adjustments the way time.time() is.
# NOTE(review): if get_xyrefs() hands back references that are not yet
# materialised, the elapsed time may not cover all remote work -- confirm.
start = time.perf_counter()

pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)

# Outputs recorded for each of the three classifier nodes.
node_b_output = pipeline_output.get_xyrefs(node_b)
node_c_output = pipeline_output.get_xyrefs(node_c)
node_d_output = pipeline_output.get_xyrefs(node_d)

end = time.perf_counter()
print('Time taken: ' + str(end - start))

Time taken: 10.172047138214111


In [29]:
# Display the outputs recorded for the c_a node.
node_b_output

[<codeflare.pipelines.Datamodel.XYRef at 0x7f87f00fd0d0>]

In [30]:
# Select the pipeline associated with the first output of node_b (the c_a branch).
selected_pipeline = rt.select_pipeline(pipeline_output, node_b_output[0])

In [31]:
# Print the textual form of the selected pipeline.
print(selected_pipeline)

preprocess=
c_a=preprocess 

