Copyright (c) Microsoft Corporation. All rights reserved.  
Licensed under the MIT License.

# Using AML Pipelines to train models on a text dataset
### Preprocessing the 20 newsgroups text dataset into features and training several models
This example computes numeric features for a text dataset and then trains several models on the resulting features. Each model's accuracy on the test dataset is logged, and the metrics are printed at the end so that the best model can be identified.

In [None]:
import os
from azureml.core import Workspace, Run, Experiment

# Load the workspace from the local config file and print its identifying details.
ws = Workspace.from_config()
print('Workspace name: ' + ws.name,
      'Azure region: ' + ws.location,
      'Subscription id: ' + ws.subscription_id,
      'Resource group: ' + ws.resource_group, sep = '\n')

# Local folder that receives the step scripts written by the %%writefile cells
# below; it is later passed to every pipeline step as its source directory.
project_folder = "scripts"
run_history_name = project_folder

# exist_ok=True replaces the isdir() + mkdir() check-then-create sequence,
# which can race if the folder appears between the check and the create.
os.makedirs(project_folder, exist_ok=True)

In [None]:
from azureml.core.compute import BatchAiCompute, ComputeTarget
from azureml.core.datastore import Datastore
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.compute import DsvmCompute

In [None]:
# Batch AI compute: reuse the cluster named `cluster_name` if the workspace
# already has it, otherwise provision a new one and wait until it is ready.
from azureml.core.compute_target import ComputeTargetException

cluster_name = "cpu-cluster"
try:
    cluster = BatchAiCompute(ws, cluster_name)
except ComputeTargetException:
    # Only a failed compute-target lookup should trigger provisioning. A bare
    # `except:` would also swallow KeyboardInterrupt, authentication errors,
    # etc. and then try to create a cluster for the wrong reason.
    print("creating new cluster")
    provisioning_config = BatchAiCompute.provisioning_configuration(vm_size = "STANDARD_D2_v2",
                                                                    autoscale_enabled = True,
                                                                    cluster_min_nodes = 3,
                                                                    cluster_max_nodes = 3)

    # create the cluster
    cluster = ComputeTarget.create(ws, cluster_name, provisioning_config)
    cluster.wait_for_completion(show_output=True)
else:
    print("found existing cluster.")

We use the default blob datastore that comes with the workspace. 

In [None]:
# Datastore used below to hold the intermediate PipelineData outputs.
default_datastore = ws.get_default_datastore()

In [None]:
%%writefile $project_folder/process_newsgroups.py
"""Turn a subset of the 20 newsgroups text dataset into hashed features.

Writes a single pickle, ``20news.pkl``, into ``--out_dir`` containing a dict
with the keys ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
"""
import argparse
import os  # required for makedirs / path.join below; was missing originally
import pickle
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer

# Pass the text as `description`; the first positional argument of
# ArgumentParser is `prog`, which would put this sentence in the usage line.
parser = argparse.ArgumentParser(
    description="generate feature hashing features from 20 newsgroups")
parser.add_argument("--out_dir", type=str, required=True, help="output train dir")
args = parser.parse_args()

# exist_ok=True matches the training scripts and tolerates a pre-created
# output directory.
os.makedirs(args.out_dir, exist_ok=True)

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Parts of each post that fetch_20newsgroups is asked to strip.
remove = ('headers', 'footers', 'quotes')

data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)

# The same vectorizer instance is applied to both splits.
vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)
X_train = vectorizer.transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

y_train, y_test = data_train.target, data_test.target

obj = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}

out_file = os.path.join(args.out_dir, "20news.pkl")
with open(out_file, "wb") as fp:
    pickle.dump(obj, fp)

In [None]:
%%writefile $project_folder/train_lr.py
"""Train a logistic regression model on the pickled 20 newsgroups features.

Reads ``20news.pkl`` from ``--input_dir``, writes the fitted model to
``model.pkl`` in ``--output_dir`` and logs test-set accuracy to the run.
"""
import argparse
import os  # required for path.join / makedirs below; was missing originally
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from azureml.core.run import Run

# Pass the text as `description`; the first positional argument of
# ArgumentParser is `prog`, which would put this sentence in the usage line.
parser = argparse.ArgumentParser(
    description="train logistic regression on input data")
parser.add_argument("--input_dir", type=str, required=True, help="input train dir")
parser.add_argument("--output_dir", type=str, required=True, help="output dir")

args = parser.parse_args()

with open(os.path.join(args.input_dir, "20news.pkl"), "rb") as fp:
    obj = pickle.load(fp)

X_train = obj["X_train"]
y_train = obj["y_train"]

X_test = obj["X_test"]
y_test = obj["y_test"]

clf = LogisticRegression()
clf.fit(X_train, y_train)

os.makedirs(args.output_dir, exist_ok=True)
output_file = os.path.join(args.output_dir, "model.pkl")
with open(output_file, "wb") as fp:
    pickle.dump(clf, fp)

# Log accuracy on the held-out test split under the "accuracy" metric name.
accuracy = accuracy_score(y_test, clf.predict(X_test))
run = Run.get_context()
run.log("accuracy", accuracy)

In [None]:
%%writefile $project_folder/train_rf.py
"""Train a random forest classifier on the pickled 20 newsgroups features.

Reads ``20news.pkl`` from ``--input_dir``, writes the fitted model to
``model.pkl`` in ``--output_dir`` and logs test-set accuracy to the run.
"""
import argparse
import os  # required for path.join / makedirs below; was missing originally
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from azureml.core.run import Run

# Pass the text as `description`; the first positional argument of
# ArgumentParser is `prog`, which would put this sentence in the usage line.
parser = argparse.ArgumentParser(
    description="train random forest classifier on input data")
parser.add_argument("--input_dir", type=str, required=True, help="input train dir")
parser.add_argument("--output_dir", type=str, required=True, help="output dir")

args = parser.parse_args()

with open(os.path.join(args.input_dir, "20news.pkl"), "rb") as fp:
    obj = pickle.load(fp)

X_train = obj["X_train"]
X_test = obj["X_test"]
y_train = obj["y_train"]
y_test = obj["y_test"]

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

os.makedirs(args.output_dir, exist_ok=True)
output_file = os.path.join(args.output_dir, "model.pkl")
with open(output_file, "wb") as fp:
    pickle.dump(clf, fp)

# Log accuracy on the held-out test split under the "accuracy" metric name.
accuracy = accuracy_score(y_test, clf.predict(X_test))
run = Run.get_context()
run.log("accuracy", accuracy)

In [None]:
%%writefile $project_folder/train_gbt.py
"""Train a gradient boosting classifier on the pickled 20 newsgroups features.

Reads ``20news.pkl`` from ``--input_dir``, writes the fitted model to
``model.pkl`` in ``--output_dir`` and logs test-set accuracy to the run.
"""
import argparse
import os  # required for path.join / makedirs below; was missing originally
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from azureml.core.run import Run

# The original text ("train random classifier") was copied from the random
# forest script and was passed positionally as `prog`; describe this script
# correctly and pass it as `description`.
parser = argparse.ArgumentParser(
    description="train gradient boosting classifier on input data")
parser.add_argument("--input_dir", type=str, required=True, help="input train dir")
parser.add_argument("--output_dir", type=str, required=True, help="output dir")

args = parser.parse_args()

with open(os.path.join(args.input_dir, "20news.pkl"), "rb") as fp:
    obj = pickle.load(fp)

X_train = obj["X_train"]
y_train = obj["y_train"]
X_test = obj["X_test"]
y_test = obj["y_test"]

clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)

os.makedirs(args.output_dir, exist_ok=True)
output_file = os.path.join(args.output_dir, "model.pkl")
with open(output_file, "wb") as fp:
    pickle.dump(clf, fp)

# Log accuracy on the held-out test split under the "accuracy" metric name.
accuracy = accuracy_score(y_test, clf.predict(X_test))
run = Run.get_context()
run.log("accuracy", accuracy)

In [None]:
from azureml.core.runconfig import CondaDependencies, RunConfiguration
# Run configuration shared by all pipeline steps: a conda environment that
# includes scikit-learn (imported by every step script), with Docker enabled.
cd = CondaDependencies.create(conda_packages=['scikit-learn'])
runconfig = RunConfiguration(conda_dependencies=cd)
runconfig.environment.docker.enabled = True

In [None]:
# Intermediate data passed between steps, all placed on the default datastore.
# processed_data is the output of the preprocessing step and the input of each
# training step; lr / rf / gbt are the per-model output locations.
processed_data = PipelineData("processed_data", datastore=default_datastore)
lr_model = PipelineData("lr", datastore=default_datastore)
rf_model = PipelineData("rf", datastore=default_datastore)
gbt_model = PipelineData("gbt", datastore=default_datastore)

In [None]:
# Keyword arguments that are identical for every step: the folder holding the
# scripts, the run configuration, and the compute target.
_shared_step_kwargs = dict(
    source_directory=project_folder,
    runconfig=runconfig,
    target=cluster
)

# Preprocessing: writes the hashed features into processed_data.
data_step = PythonScriptStep(
    name="process 20newsgroups dataset",
    script_name="process_newsgroups.py",
    arguments=["--out_dir", processed_data],
    outputs=[processed_data],
    **_shared_step_kwargs
)

# Each training step reads processed_data and writes its own model output.
lr_step = PythonScriptStep(
    name="train lr",
    script_name="train_lr.py",
    arguments=["--input_dir", processed_data, "--output_dir", lr_model],
    inputs=[processed_data],
    outputs=[lr_model],
    **_shared_step_kwargs
)

rf_step = PythonScriptStep(
    name="train rf model",
    script_name="train_rf.py",
    arguments=["--input_dir", processed_data, "--output_dir", rf_model],
    inputs=[processed_data],
    outputs=[rf_model],
    **_shared_step_kwargs
)

gbt_step = PythonScriptStep(
    name="train gbt",
    script_name="train_gbt.py",
    arguments=["--input_dir", processed_data, "--output_dir", gbt_model],
    inputs=[processed_data],
    outputs=[gbt_model],
    **_shared_step_kwargs
)

In [None]:
# Build the pipeline from the three training steps. data_step is not listed
# explicitly here.
# NOTE(review): presumably data_step is pulled in automatically because the
# training steps consume processed_data, which data_step outputs — confirm
# against the Pipeline documentation.
pipeline = Pipeline(workspace=ws, steps=[lr_step, rf_step, gbt_step])
pipeline.validate()
# Submit the pipeline under the "lr_rf_gbt" experiment; pipeline_run is used
# by the cells below to monitor the run and read its metrics.
exp = Experiment(ws, "lr_rf_gbt")
pipeline_run = exp.submit(pipeline)

# Monitor runs using widget

In [None]:
# Show the run-details widget for the submitted pipeline run.
from azureml.train.widgets import RunDetails
RunDetails(pipeline_run).show()

# Get metrics after completion

In [None]:
# Wait for the pipeline run to complete (show_output=True) before reading metrics.
pipeline_run.wait_for_completion(show_output=True)

In [None]:
# Print "<step name>: <metrics>" for every child run of the pipeline run;
# the training scripts log their test accuracy under the "accuracy" metric.
for step_run in pipeline_run.get_children():
    print(step_run.name, step_run.get_metrics(), sep=": ")

# Clean compute resources

In [None]:
# Uncomment the next line to delete the Batch AI cluster once it is no longer needed.
#cluster.delete()