In [111]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder as OHEncoder
from sklearn.compose import ColumnTransformer

import json
import time

from multiprocessing import Pool

from typing import List as List_t, Dict as Dict_t

In [4]:
pipelines_loc = "/nfs1/dsbox-repo/runs/wade-run/pipelines.jl"

## Read total number of pipelines

In [19]:
%%capture output
!wc -l  "/nfs1/dsbox-repo/runs/wade-run/pipelines.jl"

In [21]:
n_pipelines = int(output.stdout.split(' ')[0])
n_pipelines

217235

## Get Pipelines Information and Store in DataFrame

In [31]:
def process_pipe(pipe_str)-> Dict_t:
    pjsn = json.loads(pipe_str)
    info_list = {'dataset': pjsn['problem'], 'task': pjsn['problem_taskType']} 
    prims = dict([(step['primitive']['python_path'], step['hyperparams'] if 'hyperparams' in step else 1.0) for step in pjsn['steps']])
    
    model = [m['primitives'] for m in pjsn['template']['steps'] if m['name'] == 'model_step']
    model = [m for p in model if isinstance(m, str) else m['primitive']]

In [120]:
def clean_hyper(raw: Dict_t) -> Dict_t:
    for hp in sort(raw.keys()):
        
def extract_pipeline_info(fileloc) -> pd.DataFrame:
#     pipelines = iter()
#     info_list = map(
#         lambda pjsn: 
#             {'dataset': pjsn['problem'],  
#              'task': pjsn['problem_taskType'], 
#              **dict([(
#                  step['primitive']['python_path'], 
#                  step['hyperparams'] if 'hyperparams' in step else 1.0
#              ) for step in pjsn['steps']])
#             },
#         iter_pipe(pipelines_loc)
#     )
    
    model_primitives = map(
        lambda pjsn:
            pjsn['template']['steps']['model_step']
    )
#     for pjsn in iter_pipe(pipelines_loc):
#         entry = {}
#         entry['dataset'] = pjsn['problem']
#         for step in pjsn['steps']:
#             entry[step['primitive']['python_path']] = step['hyperparams'] if 'hyperparams' in step else 1
#         info_list.append(entry)
    return pd.DataFrame(list(info_list)).fillna(0)    

In [139]:
%%time
df_pipes = extract_pipeline_info(pipelines_loc)
df_pipes = df_pipes[df_pipes['task'].isin(['classification', 'regression'])] # ignoring pipelines for tasks other than classification and regression
df_pipes = df_pipes.loc[:,(df_pipes != 0).any()]                             # remove columns that all values are zero (primitives that are not used at all)   

CPU times: user 53.6 s, sys: 2.23 s, total: 55.8 s
Wall time: 42.9 s


In [132]:
df_pipes.shape

(216483, 42)

In [136]:
df_pipes.head()

Unnamed: 0,d3m.primitives.data.CastToType,d3m.primitives.data.ExtractColumnsBySemanticTypes,d3m.primitives.datasets.DatasetToDataFrame,d3m.primitives.dsbox.CleaningFeaturizer,d3m.primitives.dsbox.CorexText,d3m.primitives.dsbox.DataFrameToTensor,d3m.primitives.dsbox.Denormalize,d3m.primitives.dsbox.DoNothing,d3m.primitives.dsbox.Encoder,d3m.primitives.dsbox.IQRScaler,...,d3m.primitives.sklearn_wrap.SKPCA,d3m.primitives.sklearn_wrap.SKRandomForestClassifier,d3m.primitives.sklearn_wrap.SKRandomForestRegressor,d3m.primitives.sklearn_wrap.SKRidge,d3m.primitives.sklearn_wrap.SKSGDClassifier,d3m.primitives.sklearn_wrap.SKSGDRegressor,d3m.primitives.sklearn_wrap.SKSelectFwe,d3m.primitives.sri.baseline.MeanBaseline,dataset,task
0,"{'type_to_cast': {'type': 'VALUE', 'data': 'fl...","{'semantic_types': {'type': 'VALUE', 'data': [...",1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,"{'n_components': {'type': 'VALUE', 'data': 15}}","{'bootstrap': {'type': 'VALUE', 'data': True},...",0,0.0,0,0,0,0.0,LL0_1479_hill_valley,classification
1,"{'type_to_cast': {'type': 'VALUE', 'data': 'fl...","{'semantic_types': {'type': 'VALUE', 'data': [...",1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,"{'n_components': {'type': 'VALUE', 'data': 10}}",0,0,0.0,0,0,0,0.0,LL0_1479_hill_valley,classification
2,"{'type_to_cast': {'type': 'VALUE', 'data': 'fl...","{'semantic_types': {'type': 'VALUE', 'data': [...",1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,"{'n_components': {'type': 'VALUE', 'data': 10}}",0,0,0.0,0,0,0,0.0,LL0_1479_hill_valley,classification
3,"{'type_to_cast': {'type': 'VALUE', 'data': 'fl...","{'semantic_types': {'type': 'VALUE', 'data': [...",1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,"{'n_components': {'type': 'VALUE', 'data': 10}}",0,0,0.0,0,0,0,0.0,LL0_1479_hill_valley,classification
4,"{'type_to_cast': {'type': 'VALUE', 'data': 'fl...","{'semantic_types': {'type': 'VALUE', 'data': [...",1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,"{'n_components': {'type': 'VALUE', 'data': 10}}",0,0,0.0,0,0,0,0.0,LL0_1479_hill_valley,classification


In [138]:
type(df_pipes.iloc[0,0])

str

## Encoding the primitives

We are planning to consider each primitive-hyperparamter pair as a distinct primitive. So in this step we encode each of the columns using a OH encoding. To make sure we do not encode primitives that do not have any hyperparamter (or their hypers not used in our pipelines) we exclude the columns that do not have any hyperparames set in them from encoding.

In [124]:
enc_columns = df_pipes.dtypes[df_pipes.dtypes == np.dtype('O')].index.tolist()
enc_columns.remove('dataset')
enc_columns.remove('task')
enc_columns

['d3m.primitives.data.CastToType',
 'd3m.primitives.data.ExtractColumnsBySemanticTypes',
 'd3m.primitives.dsbox.CorexText',
 'd3m.primitives.sklearn_wrap.SKBernoulliNB',
 'd3m.primitives.sklearn_wrap.SKExtraTreesClassifier',
 'd3m.primitives.sklearn_wrap.SKExtraTreesRegressor',
 'd3m.primitives.sklearn_wrap.SKGenericUnivariateSelect',
 'd3m.primitives.sklearn_wrap.SKGradientBoostingClassifier',
 'd3m.primitives.sklearn_wrap.SKGradientBoostingRegressor',
 'd3m.primitives.sklearn_wrap.SKLinearSVC',
 'd3m.primitives.sklearn_wrap.SKMultinomialNB',
 'd3m.primitives.sklearn_wrap.SKPCA',
 'd3m.primitives.sklearn_wrap.SKRandomForestClassifier',
 'd3m.primitives.sklearn_wrap.SKRandomForestRegressor',
 'd3m.primitives.sklearn_wrap.SKSGDClassifier',
 'd3m.primitives.sklearn_wrap.SKSGDRegressor',
 'd3m.primitives.sklearn_wrap.SKSelectFwe']

In [128]:
df_pipes_casted = df_pipes
df_pipes_casted[enc_columns] = df_pipes[enc_columns].astype(str)

In [130]:
oh_enc = OHEncoder()
oh_enc.fit(df_pipes_casted)
df_enc_pipes = oh_enc.transform(df_pipes_casted)


AttributeError: head not found

In [131]:
df_enc_pipes

<216483x2987 sparse matrix of type '<class 'numpy.float64'>'
	with 9092286 stored elements in Compressed Sparse Row format>

In [143]:
!head -n 10 "/nfs1/dsbox-repo/runs/wade-run/pipelines.jl"

{"template_name": "default_classification_template", "problem_taskSubType": "binary", "total_time_used_without_cache": 63.40709710121155, "problem": "LL0_1479_hill_valley", "parent_id": "9262133f-54d4-4b8f-a240-b37e703517c1", "context": "PRETRAINING", "inputs": [{"name": "input dataset"}], "total_time_used_with_cache": 17.110462427139282, "dataset": "ll0", "dataset_id": "LL0_1479_hill_valley_dataset_fd2d0189-dfd9-45d4-84db-a82f201a82c4", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "name": "dsbox_140345879754072", "outputs": [{"name": "predictions of input dataset", "data": "steps.12.produce"}], "created": "2018-08-10T10:21:51.399027Z", "id": "88f83d96-a668-452d-976e-8eb529c41743", "baseline": 0.8666666666666667, "beat_baseline": false, "metric": "f1Macro", "template": {"name": "default_classification_template", "target": "extract_target_step", "taskSubtype": ["BINARY", "MULTICLASS"], "taskType": ["CLASSIFICATION"], "steps": [{"name": "denormalize_step