# Squirro machine learning service walkthrough

## settings

In [202]:
import os

# Connection settings for the walkthrough. Values are read from the
# environment so that the cluster, refresh token and project id do not have to
# be pasted into (and saved with) the notebook. Each setting falls back to the
# empty string when its variable is unset; edit the fallback to set a value by
# hand.
CLUSTER = os.environ.get('SQUIRRO_CLUSTER', '')
TOKEN = os.environ.get('SQUIRRO_TOKEN', '')
PROJECT_ID = os.environ.get('SQUIRRO_PROJECT_ID', '')

## get Squirro client

In [103]:
from squirro_client import SquirroClient

# No client id/secret pair is supplied; the session is authenticated with the
# refresh token from the settings cell instead.
client = SquirroClient(
    client_id=None,
    client_secret=None,
    cluster=CLUSTER,
)
client.authenticate(refresh_token=TOKEN)

## create machine learning workflow

In [187]:
# Configuration of the machine learning workflow that is uploaded later in
# this notebook. It has three parts: the queries that select the train/test
# items, the analyzer settings, and the ordered list of pipeline steps.
ml_workflow = {
  # Queries selecting the items used for training and for testing; both are
  # restricted to the same three newsgroup labels.
  "dataset": {
    "train": {"query_string": "dataset:train (label:sci.space OR label:soc.religion.christian OR label:alt.atheism)"},
    "test": {"query_string": "dataset:test (label:sci.space OR label:soc.religion.christian OR label:alt.atheism)"}
  },
  # The analyzer's tag_field matches the classifier's output_field and its
  # label_field matches the classifier's label_field.
  "analyzer": {
    "type": "classification",
    "tag_field": "keywords.pred_label",
    "label_field": "keywords.label"
  },
  # Steps are listed in the order they appear in the pipeline.
  "pipeline": [{
    "step": "loader",
    "type": "squirro_query",
    "fields": ["body", "title", "keywords.label"]
  },{
    "step": "filter",
    "type": "empty",
    "fields": ["keywords.label", "body", "title"]
  },{
    # Input and output field are the same, so the label is rewritten in place.
    "step": "filter",
    "type": "join",
    "input_field": "keywords.label",
    "output_field": "keywords.label"
  },{
    # Combines body and title into the single "text" field that the
    # following steps operate on.
    "step": "filter",
    "type": "merge",
    "input_fields": ["body", "title"],
    "output_field": "text"
  },{
    "step": "normalizers",
    "types": ["html", "punctuation", "lowercase", "character"],
    "fields": ["text"]
  },{
    "step": "tokenizer",
    "type": "spaces",
    "fields": ["text"]
  },{
    # Writes "indexed_text", which is the classifier's input_field.
    "step": "embedder",
    "type": "dictionary",
    "batch_size": 1024,
    "input_field": "text",
    "output_field": "indexed_text"
  },{
    "step": "checkpoint",
    "type": "disk",
    "do_randomize": True,
    "batch_size": 1
  },{
    "step": "classifier",
    "type": "cnn_seq2one",
    "batch_size": 1024,
    # NOTE(review): presumably refers to the "dictionary" embedder step above
    # -- confirm against the Squirro ML documentation.
    "dict_name": "dictionary",
    "dropout_fraction": 0.5,
    "embedding_dim": 50,
    # Field name was previously misspelled as "explanantion".
    "explanation_field": "explanation",
    "input_field": "indexed_text",
    "label_field": "keywords.label",
    # Same three labels as in the dataset queries (listed in a different order).
    "labels": ["soc.religion.christian", "alt.atheism", "sci.space"],
    "max_sequence_length": 1000,
    "mini_batch_size": 64,
    "n_epochs": 10,
    "output_field": "keywords.pred_label"
  },{
    # Only the predicted label field is listed for saving.
    "step": "saver",
    "type": "squirro_item",
    "batch_size": 1000,
    "fields": ["keywords.pred_label"]
  }]
}

## upload workflow

In [193]:
# List the machine learning workflows that currently exist in the project.
client.get_machinelearning_workflows(PROJECT_ID)

{u'machinelearning_workflows': []}

In [194]:
# Upload the workflow config under the name 'e2e_cnn' and keep the id from the
# response; the later workflow and job calls are addressed with it.
created_workflow = client.new_machinelearning_workflow(
    PROJECT_ID,
    name='e2e_cnn',
    config=ml_workflow,
)
ml_workflow_id = created_workflow.get('id')

In [195]:
# Read the stored workflow back by id to inspect the uploaded config.
client.get_machinelearning_workflow(PROJECT_ID, ml_workflow_id=ml_workflow_id)

{u'machinelearning_workflow': {u'config': {u'analyzer': {u'label_field': u'keywords.label',
    u'tag_field': u'keywords.pred_label',
    u'type': u'classification'},
   u'dataset': {u'test': {u'query_string': u'dataset:test (label:sci.space OR label:soc.religion.christian OR label:alt.atheism)'},
    u'train': {u'query_string': u'dataset:train (label:sci.space OR label:soc.religion.christian OR label:alt.atheism)'}},
   u'pipeline': [{u'fields': [u'body', u'title', u'keywords.label'],
     u'step': u'loader',
     u'type': u'squirro_query'},
    {u'fields': [u'keywords.label', u'body', u'title'],
     u'step': u'filter',
     u'type': u'empty'},
    {u'input_field': u'keywords.label',
     u'output_field': u'keywords.label',
     u'step': u'filter',
     u'type': u'join'},
    {u'input_fields': [u'body', u'title'],
     u'output_field': u'text',
     u'step': u'filter',
     u'type': u'merge'},
    {u'fields': [u'text'],
     u'step': u'normalizers',
     u'types': [u'html', u'punctua

## create training job

In [196]:
# List the jobs attached to the workflow before a training job is created.
client.get_machinelearning_jobs(PROJECT_ID, ml_workflow_id=ml_workflow_id)

{u'machinelearning_jobs': []}

In [197]:
# Create a job of type 'training' for the workflow and keep its id so the job
# can be looked up afterwards.
training_job = client.new_machinelearning_job(
    PROJECT_ID,
    ml_workflow_id=ml_workflow_id,
    type='training',
)
training_job_id = training_job.get('id')

In [185]:
# Fetch the training job. The response carries fields such as `healthy`,
# `error_count`, `last_error` and `total_runs`, which show whether the job's
# runs succeeded.
client.get_machinelearning_job(PROJECT_ID, ml_workflow_id=ml_workflow_id, ml_job_id=training_job_id)

{u'machinelearning_job': {u'created_at': u'2018-05-23T11:59:36',
  u'error_count': 2,
  u'healthy': False,
  u'id': u'CXy954RlQMOia98GUxUZfA',
  u'last_error': u'`save_model` requires h5py.',
  u'last_error_at': u'2018-05-23T12:15:47',
  u'last_success_at': None,
  u'ml_workflow_id': u'hnbPBVRuQ9G3sRIJiyvTMQ',
  u'modified_at': u'2018-05-23T12:15:47',
  u'next_run_time_at': u'2018-05-23T12:29:39',
  u'total_runs': 2,
  u'type': u'training'}}

## test on unlabeled data

In [201]:
# Fetch a single item (count=1) matching the test query and keep the list
# found under the response's 'items' key.
test_query = "dataset:test (label:sci.space OR label:soc.religion.christian OR label:alt.atheism)"
query_response = client.query(PROJECT_ID, query=test_query, count=1)
items = query_response.get('items')

In [None]:
# Run the workflow directly on the items fetched by the query cell. All
# arguments are passed positionally: the previous form placed `items` after a
# keyword argument, which Python rejects with a SyntaxError.
# NOTE(review): confirm against the squirro_client documentation whether the
# third argument should be the plain item list or a wrapper such as
# {'items': items}.
client.run_machinelearning_workflow(PROJECT_ID, ml_workflow_id, items)

## add machine learning workflow to pipelet

## create inference job

In [None]:
# List the jobs attached to the workflow before an inference job is created.
client.get_machinelearning_jobs(PROJECT_ID, ml_workflow_id=ml_workflow_id)

In [None]:
# Create a job of type 'inference' for the workflow and keep its id so the job
# can be looked up afterwards.
inference_job = client.new_machinelearning_job(
    PROJECT_ID,
    ml_workflow_id=ml_workflow_id,
    type='inference',
)
inference_job_id = inference_job.get('id')

In [None]:
# Fetch the inference job by id to inspect its current state.
client.get_machinelearning_job(PROJECT_ID, ml_workflow_id=ml_workflow_id, ml_job_id=inference_job_id)

## reset

In [189]:
# Reset step: delete the workflow that this walkthrough uploaded.
client.delete_machinelearning_workflow(PROJECT_ID, ml_workflow_id=ml_workflow_id)

{}

# Local training