Copyright (c) Microsoft Corporation. All rights reserved.  
Licensed under the MIT License.

# Use AML Pipelines to train multiple featurizers
### Multiple features on the same dataset, concatenate, and train
Using the 20newsgroups dataset as an example, we first compute features on the same dataset using two different featurizers. For this demo we use the same machine for both; however, when some featurizers are more expensive than others, or when the dataset is large, it can make sense to split the work across different machines.
Finally, the features from each featurizer are concatenated and used to train a `sklearn` `Pipeline`.

In [None]:
import os
from azureml.core import Workspace, Run, Experiment

# Load the workspace via Workspace.from_config() and echo its identifying
# details so it is obvious which workspace the notebook is talking to.
ws = Workspace.from_config()
print('Workspace name: ' + ws.name,
      'Azure region: ' + ws.location,
      'Subscription id: ' + ws.subscription_id,
      'Resource group: ' + ws.resource_group, sep='\n')

# Local folder that the %%writefile cells write the step scripts into; it is
# later handed to every pipeline step as its source directory.
project_folder = "scripts"
run_history_name = project_folder

# exist_ok=True replaces the isdir() + mkdir() pair and avoids its
# check-then-create race.
os.makedirs(project_folder, exist_ok=True)

In [None]:
from azureml.core.compute import BatchAiCompute, ComputeTarget
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.core.datastore import Datastore
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.compute import DsvmCompute

In [None]:
# Batch AI compute: reuse the cluster named `cluster_name` when it can be
# looked up in the workspace, otherwise provision a new one.
cluster_name = "cpu-cluster"
try:
    cluster = BatchAiCompute(ws, cluster_name)
except Exception:
    # NOTE(review): the lookup presumably raises an SDK compute-target error
    # when the cluster does not exist; catching Exception (instead of a bare
    # `except:`) keeps that fallback while letting KeyboardInterrupt and
    # SystemExit propagate. Narrow further once the exact type is confirmed.
    print("creating new cluster")
    provisioning_config = BatchAiCompute.provisioning_configuration(
        vm_size="STANDARD_D2_v2",
        autoscale_enabled=True,
        cluster_min_nodes=3,
        cluster_max_nodes=3)

    # create the cluster and block until provisioning has finished
    cluster = ComputeTarget.create(ws, cluster_name, provisioning_config)
    cluster.wait_for_completion(show_output=True)
else:
    print("found existing cluster.")

We use the default blob datastore that comes with the workspace. 

In [None]:
# Default datastore of the workspace; used further down as the backing store
# of the PipelineData objects exchanged between pipeline steps.
default_datastore = ws.get_default_datastore()

# Python scripts
- `fetch_newsgroups.py`: Fetch 20newsgroups data
- `hashing_features.py`: Use feature hashing to generate features
- `tfidf_features.py`: Compute tfidf features
- `train_model.py`: Concatenate and train logistic regression model

In [None]:
%%writefile $project_folder/fetch_newsgroups.py
import argparse
import pickle
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer

parser = argparse.ArgumentParser("download 20 newsgroups dataset")
parser.add_argument("--out_dir", type=str, help="output data dir")

args = parser.parse_args()

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

remove = ('headers', 'footers', 'quotes')

data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)

obj = {}
obj["data_train"] = data_train
obj["data_test"] = data_test

os.makedirs(args.out_dir)

with open(os.path.join(args.out_dir, "20news.pkl"), "wb") as fp:
    pickle.dump(obj, fp)

In [None]:
%%writefile $project_folder/hashing_features.py
import argparse
import pickle
from sklearn.feature_extraction.text import HashingVectorizer

parser = argparse.ArgumentParser("generate feature hashing features for 20 newsgroups")
parser.add_argument("--input_dir", type=str, help="data directory")
parser.add_argument("--out_dir", type=str, help="output feature hashing features directory")

args = parser.parse_args()

vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)

with open(os.path.join(args.input_dir, "20news.pkl"), "rb") as fp:
    obj = pickle.load(fp)
    
data_train = obj["data_train"]
    
X_train = vectorizer.fit_transform(data_train.data)

obj = {}
obj["X_train"] = X_train
obj["vectorizer"] = vectorizer

os.makedirs(args.out_dir)

with open(os.path.join(args.out_dir, "feats.pkl"), "wb") as fp:
    pickle.dump(obj, fp)

In [None]:
%%writefile $project_folder/tfidf_features.py
import argparse
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

parser = argparse.ArgumentParser("generate feature hashing features for 20 newsgroups")
parser.add_argument("--input_dir", type=str, help="data directory")
parser.add_argument("--out_dir", type=str, help="output tfidf features directory")
parser.add_argument("--ngram", type=int, help="character ngram length")
args = parser.parse_args()

vectorizer = TfidfVectorizer(ngram_range=(args.ngram, args.ngram), analyzer="char")

with open(os.path.join(args.input_dir, "20news.pkl"), "rb") as fp:
    obj = pickle.load(fp)
    
data_train = obj["data_train"]

X_train = vectorizer.fit_transform(data_train.data)

obj = {}
obj["X_train"] = X_train
obj["vectorizer"] = vectorizer

os.makedirs(args.out_dir)
with open(os.path.join(args.out_dir, "feats.pkl"), "wb") as fp:
    pickle.dump(obj, fp)

In [None]:
%%writefile $project_folder/train_model.py
import argparse
import os
import pickle
from scipy import sparse
import sklearn
from sklearn.linear_model import LogisticRegression
import sklearn.pipeline
from sklearn.metrics import roc_auc_score
from azureml.core.run import Run

parser = argparse.ArgumentParser("train model for 20 newsgroups")
parser.add_argument("--hashing_dir", type=str, help="feature hashing directory")
parser.add_argument("--tfidf_dir", type=str, help="tfidf features directory")
parser.add_argument("--input_dir", type=str, help="data directory")
parser.add_argument("--output_dir", type=str, help="output model dir")
args = parser.parse_args()

vectorizers = []
X_train = []

with open(os.path.join(args.hashing_dir, "feats.pkl"), "rb") as fp:
    obj = pickle.load(fp)
    vectorizers.append(("feature_hashing", obj["vectorizer"]))
    X_train.append(obj["X_train"])
    
with open(os.path.join(args.tfidf_dir, "feats.pkl"), "rb") as fp:
    obj = pickle.load(fp)
    vectorizers.append(("tfidf_features", obj["vectorizer"]))
    X_train.append(obj["X_train"])
    
with open(os.path.join(args.input_dir, "20news.pkl"), "rb") as fp:
    obj = pickle.load(fp)
    y_train = obj["data_train"].target
    y_test = obj["data_test"].target
    raw_X_test = obj["data_test"].data
    
X_train = sparse.hstack(X_train)
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

final_model = sklearn.pipeline.Pipeline([("transformer", 
                                          sklearn.pipeline.FeatureUnion(vectorizers)), 
                                         ("model", lr_model)])

# check performance of final model
pred_probs = final_model.predict_proba(raw_X_test)

# binarize labels to compute average auc
binarizer = sklearn.preprocessing.LabelBinarizer()
binarizer.fit(y_train)
y_test_bin = binarizer.transform(y_test)
auc = roc_auc_score(y_test_bin, pred_probs)
print(f"Current AUC: {auc}")

run = Run.get_context()
run.log("auc", auc)

os.makedirs(args.output_dir, exist_ok=True)
out_file = os.path.join(args.output_dir, "model.pkl")
with open(out_file, "wb") as fp:
    pickle.dump(final_model, fp)

# Define the runconfig environment for the compute target

In [None]:
from azureml.core.runconfig import CondaDependencies, RunConfiguration
# Conda dependencies for the step scripts; scikit-learn is the only package
# requested explicitly here.
cd = CondaDependencies.create(conda_packages=['scikit-learn'])
# Run configuration shared by all pipeline steps.
runconfig = RunConfiguration(conda_dependencies=cd)
# Enable Docker for the run environment.
runconfig.environment.docker.enabled = True

# PipelineData that the steps write to and read from

In [None]:
# Intermediate data handed from step to step, all stored on the default
# datastore.
# Output of the fetch step; input of both featurizer steps and the train step.
raw_data = PipelineData("rawdata", datastore=default_datastore)
# Output of the feature hashing step; input of the train step.
hashing_features = PipelineData("hashing", datastore=default_datastore)
# Output of the tfidf step; input of the train step.
tfidf_features = PipelineData("tfidf", datastore=default_datastore)
# Output of the train step (the directory passed to it as --output_dir).
output_dir = PipelineData("model_output", datastore=default_datastore)

# Define steps and run

In [None]:
# Settings that are identical for every step: where the scripts live, the
# run configuration to use and the compute the step is executed on.
_shared_step_settings = dict(
    source_directory=project_folder,
    runconfig=runconfig,
    target=cluster,
)

# Step 1: download the dataset and write it to `raw_data`.
data_step = PythonScriptStep(
    script_name="fetch_newsgroups.py",
    name="fetch 20newsgroups dataset",
    arguments=["--out_dir", raw_data],
    outputs=[raw_data],
    **_shared_step_settings
)

# Step 2a: feature hashing on the raw data.
feature_hashing_step = PythonScriptStep(
    script_name="hashing_features.py",
    name="feature hashing",
    arguments=[
        "--input_dir", raw_data,
        "--out_dir", hashing_features,
    ],
    inputs=[raw_data],
    outputs=[hashing_features],
    **_shared_step_settings
)

# Step 2b: character 3-gram tfidf features on the raw data.
tfidf_step = PythonScriptStep(
    script_name="tfidf_features.py",
    name="tfidf",
    arguments=[
        "--input_dir", raw_data,
        "--out_dir", tfidf_features,
        "--ngram", 3,
    ],
    inputs=[raw_data],
    outputs=[tfidf_features],
    **_shared_step_settings
)

# Step 3: train on both feature sets and evaluate against the raw test data.
model_step = PythonScriptStep(
    script_name="train_model.py",
    name="train the final model",
    arguments=[
        "--input_dir", raw_data,
        "--hashing_dir", hashing_features,
        "--tfidf_dir", tfidf_features,
        "--output_dir", output_dir,
    ],
    inputs=[raw_data, hashing_features, tfidf_features],
    outputs=[output_dir],
    **_shared_step_settings
)

In [None]:
# Only the final training step is listed explicitly.
# NOTE(review): the upstream steps are presumably pulled in through the
# PipelineData objects that link step outputs to step inputs; confirm against
# the azureml pipeline documentation.
pipeline = Pipeline(workspace=ws, steps=[model_step])
pipeline.validate()
# Submit the pipeline under the "train_model_20newsgroups" experiment.
pipeline_run = Experiment(ws, "train_model_20newsgroups").submit(pipeline)

# Monitor runs using widget

In [None]:
from azureml.train.widgets import RunDetails
# Show the notebook widget for the submitted pipeline run.
RunDetails(pipeline_run).show()

# Complete run and print metrics

In [None]:
# Block until the pipeline run has finished, then print the metrics logged by
# each child (step) run, one line per step.
pipeline_run.wait_for_completion()
for step_run in pipeline_run.get_children():
    print("{}: {}".format(step_run.name, step_run.get_metrics()))

# Optionally clean up compute resources

In [None]:
#cluster.delete()