In [1]:
# Switch off Jedi-backed tab completion in IPython's completer.
# NOTE(review): presumably a workaround for slow or failing completion in this
# environment -- confirm it is still needed before keeping it.
%config IPCompleter.use_jedi = False

In [2]:
import numpy as np

In [17]:
# Search space keyed with the '<step>__<parameter>' naming scheme:
# two parameters for the 'pca' step and one for the 'logistic' step.
param_grid = dict(
    pca__n_components=[5, 15, 30, 45, 64],
    pca__m_components=[6, 10],
    # Four values of C, log-spaced from 1e-4 to 1e4.
    logistic__C=np.logspace(start=-4, stop=4, num=4),
)

In [18]:
import codeflare.pipelines.Datamodel as dm

In [19]:
# Build a codeflare PipelineParam from the sklearn-style grid. Judging by the
# map displayed in the next cell, the grid is regrouped per pipeline node:
# each '<node>__<i>' entry holds one parameter combination for that node only.
pipeline_params = dm.PipelineParam.from_param_grid(param_grid)

In [20]:
# Display the node-name -> parameter-dict mapping held by the PipelineParam
# (an internal-looking attribute, read here for inspection only).
pipeline_params.__node_name_param_map__

{'pca__0': {'m_components': 6, 'n_components': 5},
 'pca__1': {'m_components': 6, 'n_components': 15},
 'pca__2': {'m_components': 6, 'n_components': 30},
 'pca__3': {'m_components': 6, 'n_components': 45},
 'pca__4': {'m_components': 6, 'n_components': 64},
 'pca__5': {'m_components': 10, 'n_components': 5},
 'pca__6': {'m_components': 10, 'n_components': 15},
 'pca__7': {'m_components': 10, 'n_components': 30},
 'pca__8': {'m_components': 10, 'n_components': 45},
 'pca__9': {'m_components': 10, 'n_components': 64},
 'logistic__0': {'C': 0.0001},
 'logistic__1': {'C': 0.046415888336127774},
 'logistic__2': {'C': 21.54434690031882},
 'logistic__3': {'C': 10000.0}}

In [8]:
from sklearn.model_selection import ParameterGrid

In [9]:
# For comparison with PipelineParam: sklearn's ParameterGrid over the same
# grid, which yields one dict per combination across *all* steps at once.
pg = ParameterGrid(param_grid)

In [10]:
# Print every parameter combination, one dict per line
# (print() already applies str() to its argument).
for x in pg:
    print(x)

{'logistic__C': 0.0001, 'pca__m_components': 6, 'pca__n__components': 5}
{'logistic__C': 0.0001, 'pca__m_components': 6, 'pca__n__components': 15}
{'logistic__C': 0.0001, 'pca__m_components': 6, 'pca__n__components': 30}
{'logistic__C': 0.0001, 'pca__m_components': 6, 'pca__n__components': 45}
{'logistic__C': 0.0001, 'pca__m_components': 6, 'pca__n__components': 64}
{'logistic__C': 0.0001, 'pca__m_components': 10, 'pca__n__components': 5}
{'logistic__C': 0.0001, 'pca__m_components': 10, 'pca__n__components': 15}
{'logistic__C': 0.0001, 'pca__m_components': 10, 'pca__n__components': 30}
{'logistic__C': 0.0001, 'pca__m_components': 10, 'pca__n__components': 45}
{'logistic__C': 0.0001, 'pca__m_components': 10, 'pca__n__components': 64}
{'logistic__C': 0.046415888336127774, 'pca__m_components': 6, 'pca__n__components': 5}
{'logistic__C': 0.046415888336127774, 'pca__m_components': 6, 'pca__n__components': 15}
{'logistic__C': 0.046415888336127774, 'pca__m_components': 6, 'pca__n__components'

In [2]:
import pandas as pd
# Read the training and test CSVs from the shared resources folder.
train = pd.read_csv('../resources/data/train_ctrUa4K.csv')
test = pd.read_csv('../resources/data/test_lAUu6dG.csv')

# Loan_ID is removed from the training frame before any modelling.
train = train.drop(columns='Loan_ID')

# Show the column dtypes; the next cells pick numeric vs. categorical
# features based on them.
train.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [3]:
# Separate the feature columns from the target column 'Loan_Status'.
X = train.drop('Loan_Status', axis=1)
y = train['Loan_Status']
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state is pinned so that the
# split (and everything computed from it) is identical on every
# Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# --- Numeric columns: median imputation followed by standard scaling ---
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

numeric_transformer = Pipeline([
    ('imputer', imputer),
    ('scaler', scaler),
])

# --- Categorical columns: fill gaps with the literal 'missing', then
# one-hot encode; handle_unknown='ignore' keeps transform() from failing on
# categories that were not seen during fit ---
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
cat_onehot = OneHotEncoder(handle_unknown='ignore')

categorical_transformer = Pipeline([
    ('imputer', cat_imputer),
    ('onehot', cat_onehot),
])

In [5]:
from sklearn.compose import ColumnTransformer

# Feature groups are chosen by dtype from the training frame. The target
# column 'Loan_Status' has object dtype, so it is removed explicitly from the
# categorical group.
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = (
    train.select_dtypes(include='object')
    .drop(columns=['Loan_Status'])
    .columns
)

# Route each feature group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [10]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Candidate models compared by the pipeline. Both are randomised ensemble
# methods, so random_state is pinned to make their fitted models and scores
# repeatable across notebook runs.
classifiers = [
        RandomForestClassifier(random_state=42),
        GradientBoostingClassifier(random_state=42)
    ]

In [6]:
import ray
# Shut down first, then initialise with default arguments.
# NOTE(review): presumably the shutdown is there so that re-running this cell
# starts a fresh local Ray runtime rather than tripping over one that is
# already initialised in this kernel -- confirm.
ray.shutdown()
ray.init()

2021-06-01 13:39:30,834	INFO services.py:1269 -- View the Ray dashboard at http://127.0.0.1:8265


{'node_ip_address': '192.168.1.37',
 'raylet_ip_address': '192.168.1.37',
 'redis_address': '192.168.1.37:6379',
 'object_store_address': '/tmp/ray/session_2021-06-01_13-39-29_310261_97693/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-06-01_13-39-29_310261_97693/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-06-01_13-39-29_310261_97693',
 'metrics_export_port': 58917,
 'node_id': '6cc44d621d9e02fc79a4932617ee8638fcb2d987c7eec90785d75a91'}

In [7]:
import codeflare.pipelines.Datamodel as dm

In [11]:
# Branching pipeline: a single preprocessing node whose output feeds two
# alternative classifier nodes.
pipeline = dm.Pipeline()
node_pre = dm.EstimatorNode('preprocess', preprocessor)
node_rf = dm.EstimatorNode('random_forest', classifiers[0])
node_gb = dm.EstimatorNode('gradient_boost', classifiers[1])

# Wire preprocess -> random_forest, then preprocess -> gradient_boost.
for classifier_node in (node_rf, node_gb):
    pipeline.add_edge(node_pre, classifier_node)

In [12]:
import codeflare.pipelines.Runtime as rt
from codeflare.pipelines.Runtime import ExecutionType

In [14]:
# Wrap the training features and labels in a codeflare Xy holder and register
# it as the input for node_pre, the node both pipeline edges start from.
pipeline_input = dm.PipelineInput()
xy = dm.Xy(X_train, y_train)
pipeline_input.add_xy_arg(node_pre, xy)

In [15]:
from sklearn.model_selection import KFold, StratifiedKFold

In [16]:
# Two-fold cross-validation splitter handed to the grid search below.
kf = KFold(n_splits=2)

In [17]:
# Run codeflare's grid search over the pipeline with the KFold splitter.
# NOTE(review): the recorded run of this cell fails. The traceback shows a
# KeyError raised from codeflare's Runtime.split, where the fold's row indices
# reach pandas DataFrame.__getitem__ and are looked up as *column* labels
# ("None of [...] are in the [columns]"). X_train is a pandas DataFrame here;
# presumably split needs positional (iloc-style) row selection or array
# input -- confirm against the codeflare version in use before relying on
# `result`.
result = rt.grid_search(kf, pipeline, pipeline_input)

RayTaskError(KeyError): ray::split() (pid=97764, ip=192.168.1.37)
  File "python/ray/_raylet.pyx", line 505, in ray._raylet.execute_task
  File "/Users/rganti/PycharmProjects/ray-pipeline/venv/lib/python3.7/site-packages/codeflare_pipelines-1.0.0-py3.7.egg/codeflare/pipelines/Runtime.py", line 222, in split
  File "/Users/rganti/PycharmProjects/ray-pipeline/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 3030, in __getitem__
    indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
  File "/Users/rganti/PycharmProjects/ray-pipeline/venv/lib/python3.7/site-packages/pandas/core/indexing.py", line 1266, in _get_listlike_indexer
    self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
  File "/Users/rganti/PycharmProjects/ray-pipeline/venv/lib/python3.7/site-packages/pandas/core/indexing.py", line 1308, in _validate_read_indexer
    raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Int64Index([246, 247, 248, 249, 250, 251, 252, 253, 254, 255,\n            ...\n            481, 482, 483, 484, 485, 486, 487, 488, 489, 490],\n           dtype='int64', length=245)] are in the [columns]"