In [None]:
%config IPCompleter.use_jedi = False

In [None]:
import pandas as pd
# Read both CSV files, strip the Loan_ID column from the training frame,
# and show the dtypes of the remaining columns as the cell output.
train = pd.read_csv('../resources/data/train_ctrUa4K.csv').drop(columns='Loan_ID')
test = pd.read_csv('../resources/data/test_lAUu6dG.csv')
train.dtypes

In [None]:
# Display the (rows, columns) shape of the training frame.
train.shape

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictor columns from the Loan_Status target.
X = train.drop('Loan_Status', axis=1)
y = train['Loan_Status']

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split identical on every "Restart & Run All", so timings and model results
# further down are comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric branch: fill missing values with the column median, then standardise.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
numeric_transformer = Pipeline([('imputer', imputer), ('scaler', scaler)])

# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode. handle_unknown='ignore' means categories not seen during fit
# do not raise at transform time.
cat_imputer = SimpleImputer(fill_value='missing', strategy='constant')
cat_onehot = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline([('imputer', cat_imputer), ('onehot', cat_onehot)])

In [None]:
from sklearn.compose import ColumnTransformer

# Column groups are chosen by dtype. Loan_Status is removed from the
# object-typed group because it is the prediction target, not a feature.
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = train.select_dtypes(include=['object']).columns.drop('Loan_Status')

# Route each column group through its own transformer pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [None]:
import time

# Time only the fitting of the preprocessor. perf_counter is a monotonic,
# high-resolution clock intended for measuring elapsed intervals, unlike
# time.time(), which can jump if the system clock is adjusted.
start = time.perf_counter()
preprocessor.fit(X_train)
end = time.perf_counter()

# fit() returns the fitted transformer itself, so the transformed training
# matrix is produced by a separate transform() call outside the timed region.
Xt = preprocessor.transform(X_train)
print('Time taken: ' + str(end - start))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# End-to-end model: shared preprocessing followed by a default random forest.
rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier()),
    ]
)

In [None]:
from sklearn.base import ClassifierMixin
from sklearn.base import BaseEstimator

class ScaleTestEstimator(ClassifierMixin, BaseEstimator):
    """Classifier wrapper that refits the wrapped estimator several times.

    Each call to ``fit`` trains ``classifier`` on the same data ``num_iters``
    times in a row, which multiplies the cost of a fit without changing the
    estimator being used. ``predict`` and ``score`` delegate directly to the
    wrapped classifier.

    Parameters
    ----------
    num_iters : int, default=100
        Number of consecutive times the wrapped classifier is fitted.
    classifier : estimator, default=None
        Estimator exposing ``fit``, ``predict`` and ``score``. It is fitted
        in place (not cloned), so its fitted attributes remain reachable
        through ``self.classifier``.

    Attributes
    ----------
    classes_ : array-like
        Copied from the wrapped classifier after ``fit`` when it exposes one.
    """

    # Class-level fallbacks; __init__ always overrides them per instance.
    num_iters = 100
    classifier: ClassifierMixin = None

    def __init__(self, num_iters=100, classifier: ClassifierMixin = None):
        # No validation here: parameters are stored unchanged so that
        # get_params()/clone() round-trip them; validation happens in fit().
        self.num_iters = num_iters
        self.classifier = classifier

    def fit(self, X, y):
        """Fit the wrapped classifier ``num_iters`` times on ``(X, y)``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If no classifier was supplied or ``num_iters`` is smaller than 1;
            otherwise the wrapper would be returned without ever being fitted.
        """
        if self.classifier is None:
            raise ValueError("ScaleTestEstimator requires a classifier to fit.")
        if self.num_iters < 1:
            raise ValueError(
                "num_iters must be >= 1, got " + str(self.num_iters))

        for _ in range(self.num_iters):
            self.classifier.fit(X, y)

        # Expose the learned class labels on the wrapper when available.
        if hasattr(self.classifier, 'classes_'):
            self.classes_ = self.classifier.classes_
        return self

    def predict(self, X):
        """Return the wrapped classifier's predictions for ``X``."""
        return self.classifier.predict(X)

    def score(self, X, y, sample_weight=None):
        """Return the wrapped classifier's score on ``(X, y)``."""
        return self.classifier.score(X, y, sample_weight=sample_weight)

In [None]:
# Fit the preprocessor on the training features and keep the transformed output.
Xt = preprocessor.fit_transform(X_train)

In [None]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Candidate classifiers; arguments not given explicitly use library defaults.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(C=0.025, kernel="rbf", probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]

In [None]:
# Display the first entry of the classifiers list.
classifiers[0]

In [None]:
# Wrap three tree-based classifiers so that every fit call trains them 50 times.
c_a, c_b, c_c = (
    ScaleTestEstimator(50, estimator_cls())
    for estimator_cls in (
        DecisionTreeClassifier,
        RandomForestClassifier,
        GradientBoostingClassifier,
    )
)
classifiers = [c_a, c_b, c_c]

In [None]:
import sklearn.base as base

In [None]:
# Check whether scikit-learn recognises the wrapper as a classifier.
base.is_classifier(c_a)

In [None]:
# Check that scikit-learn can clone the wrapper; the clone is shown as the cell output.
base.clone(c_a)

In [None]:
import time
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# Rebuild the wrappers so this run starts from unfitted estimators.
c_a = ScaleTestEstimator(50, DecisionTreeClassifier())
c_b = ScaleTestEstimator(50, RandomForestClassifier())
c_c = ScaleTestEstimator(50, GradientBoostingClassifier())
classifiers = [c_a, c_b, c_c]

# One entry per classifier, in the same order as `classifiers`.
classifier_results = []
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    # Keep the training-set predictions instead of discarding them, so the
    # results list declared above is actually populated.
    classifier_results.append(pipe.predict(X_train))

end = time.perf_counter()
tt = end - start
print('time taken: ' + str(tt))

In [None]:
# Feature importances of the decision tree wrapped by c_a (requires c_a to have been fitted).
c_a.classifier.feature_importances_

In [None]:
import ray
# Shut down any Ray runtime that may still be attached to this kernel.
ray.shutdown()

In [None]:
# Start (or connect to) Ray using its default arguments.
ray.init()

In [None]:
import codeflare.pipelines.Datamodel as dm

In [None]:
# Create the CodeFlare pipeline object that the following cells populate.
pipeline = dm.Pipeline()

In [None]:
# One node per estimator: the shared preprocessor plus the three wrapped classifiers.
node_a = dm.EstimatorNode('preprocess', preprocessor)
node_b, node_c, node_d = (
    dm.EstimatorNode(label, estimator)
    for label, estimator in (('c_a', c_a), ('c_b', c_b), ('c_c', c_c))
)

In [None]:
# Fan out: the preprocessing node feeds each of the classifier nodes.
for classifier_node in (node_b, node_c, node_d):
    pipeline.add_edge(node_a, classifier_node)

In [None]:
# Bundle the training features and labels, then register them as the
# input that enters the pipeline at the preprocessing node.
xy = dm.Xy(X_train, y_train)
pipeline_input = dm.PipelineInput()
pipeline_input.add_xy_arg(node_a, xy)

In [None]:
import codeflare.pipelines.Runtime as rt

In [None]:
from codeflare.pipelines.Runtime import ExecutionType

In [None]:
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# Run the whole graph in FIT mode, starting from the registered input.
pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)

# Collect the outputs produced at each classifier node.
# NOTE(review): if get_xyrefs returns unresolved Ray object references, the
# elapsed time below may not include the actual fitting work - confirm.
node_b_output = pipeline_output.get_xyrefs(node_b)
node_c_output = pipeline_output.get_xyrefs(node_c)
node_d_output = pipeline_output.get_xyrefs(node_d)

end = time.perf_counter()
print('Time taken: ' + str(end - start))

In [None]:
# Display the outputs collected for node_b.
node_b_output

In [None]:
# Select the pipeline associated with node_b's first output (see rt.select_pipeline).
selected_pipeline = rt.select_pipeline(pipeline_output, node_b_output[0])

In [None]:
# Print the string form of the selected pipeline.
print(selected_pipeline)