In [None]:

import os
import argparse
import pickle
import pandas as pd
from azureml.core import Dataset, Run
import numpy as np
from sklearn.metrics import accuracy_score #metrics
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier

# sklearn.externals.joblib is removed in 0.23
from sklearn import __version__ as sklearnver
from packaging.version import Version
if Version(sklearnver) < Version("0.23.0"):
    from sklearn.externals import joblib
else:
    import joblib
    
run = Run.get_context()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.2,
                        help='learning_rate parameter to be used in the algorithm')
    parser.add_argument('--n_estimators', type=int, default=100,
                        help='n_estimators to be used in the algorithm')
    parser.add_argument('--max_depth', type=int, default=3,
                        help='max_depth parameter to be used in the algorithm')
    parser.add_argument('--min_samples_split', type=int, default=100,
                        help='min_samples_split to be used in the algorithm')
    parser.add_argument('--min_samples_leaf', type=int, default=100,
                        help='min_samples_leaf to be used in the algorithm')
    parser.add_argument('--subsample', type=float, default=3,
                        help='subsample parameter to be used in the algorithm')
    parser.add_argument('--random_state', type=int, default=0.7,
                        help='random_state to be used in the algorithm')
    parser.add_argument('--max_features', type=int, default=0.0,
                        help='max_features parameter to be used in the algorithm')
    

    args = parser.parse_args()  
    run.log('learning_rate', np.float(args.learning_rate))
    run.log('n_estimators', np.int(args.n_estimators))
    run.log('max_depth', np.int(args.max_depth))
    run.log('min_samples_split', np.int(args.min_samples_split))
    run.log('min_samples_leaf', np.int(args.min_samples_leaf))
    run.log('subsample', np.float(args.subsample))
    run.log('random_state', np.int(args.subsample))
    run.log('max_features', np.int(args.subsample))


    # get input dataset by name
    bank_dataset = run.input_datasets['bank_dataset']
    data = bank_dataset.to_pandas_dataframe()

    
    # Data Cleaning
    cat_col = ['default', 'housing', 'loan', 'deposit', 'job', 
                'marital', 'education', 'contact', 'month', 'poutcome']
    for column in cat_col:
        label_encoder = LabelEncoder()
        label_encoder = label_encoder.fit(data[column])
        label_encoded_y = label_encoder.transform(data[column])
        data[column + '_cat'] = label_encoded_y
    #     data[column + '_bool'] = data[column].eq('yes').mul(1)
    data = data.drop(columns = cat_col)
    
    #drop irrelevant columns
    data = data.drop(columns = ['pdays'])
    #impute incorrect values and drop original columns
    def get_correct_values(row, column_name, threshold, df):
        ''' Returns mean value if value in column_name is above threshold'''
        if row[column_name] <= threshold:
            return row[column_name]
        else:
            mean = df[df[column_name] <= threshold][column_name].mean()
            return mean
    data['campaign_cleaned'] = data.apply(lambda row: get_correct_values(row, 'campaign', 50, data),axis=1)
    data['previous_cleaned'] = data.apply(lambda row: get_correct_values(row, 'previous', 50, data),axis=1)
    data = data.drop(columns = ['campaign', 'previous'])


    # Model Training
    X = data.drop(columns = 'deposit_cat')
    y = data[['deposit_cat']]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)
    Params = {'learning_rate': np.float(args.learning_rate),
              'n_estimators': np.int(args.n_estimators),
              'max_depth': np.int(args.max_depth),
              'min_samples_split': np.int(args.min_samples_split),
              'min_samples_leaf': np.int(args.min_samples_leaf),
              'subsample': np.float(args.subsample),
              'random_state': np.int(args.random_state),
              'max_features': np.int(args.max_features)}
        
    # GradientBoostingClassifier
    clf = GradientBoostingClassifier(**Params)
    clf.fit(X_train,y_train.squeeze().values)
    
    #calculate and print scores for the model 
    y_train_preds = clf.predict(X_train)
    y_test_preds = clf.predict(X_test)


    model_file_name = 'joblibGB_bankmarketing.sav'

    accuracy_score_train = accuracy_score(y_train, y_train_preds)
    accuracy_score_test = accuracy_score(y_test, y_test_preds)
    run.log('Gradient Boosting Accuracy Score for training', accuracy_score_train)
    run.log('Graident Boosting Accuracy Score for testing', accuracy_score_test)

    # Save the trained model
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=clf, filename='outputs/' + model_file_name)    

if __name__ == '__main__':
    main()


In [1]:
import os
import azureml.core
from azureml.core import Workspace, Experiment, Datastore
from azureml.widgets import RunDetails
 
from azureml.core import Dataset
 
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.core import PipelineRun, StepRun, PortDataReference
from azureml.pipeline.steps import PythonScriptStep
 
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
 
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
 
from azureml.core.model import Model
 
# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.12.0


In [2]:
ws = Workspace.from_config()

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
aml_compute = "ninjacpucluster"

# Verify that cluster does not exist already
try:
    aml_compute = ComputeTarget(workspace=ws, name=aml_compute)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    aml_compute = ComputeTarget.create(ws, aml_compute, compute_config)

aml_compute.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [5]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Create a new runconfig object
aml_run_config = RunConfiguration()

# Use the aml_compute you created above. 
aml_run_config.target = aml_compute

# Enable Docker
aml_run_config.environment.docker.enabled = True

# Set Docker base image to the default CPU-based image
aml_run_config.environment.docker.base_image = "mcr.microsoft.com/azureml/base:0.2.1"

# Use conda_dependencies.yml to create a conda environment in the Docker image for execution
aml_run_config.environment.python.user_managed_dependencies = False

# Specify CondaDependencies obj, add necessary packages
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn'], 
    pip_packages=['azureml-dataset-runtime[fuse]', 'packaging', 'numpy==1.16.2'])

print ("Run configuration created.")

Run configuration created.


In [6]:
blob_datastore = Datastore.get(ws, "kaggledatabook")

In [None]:
from azureml.core.dataset import Dataset
bank_dataset = Dataset.Tabular.from_delimited_files(path=blob_datastore.path('Bank.csv'))

In [8]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# python scripts folder
prepare_data_folder = './scripts/prepdata'

# Define output after cleansing step
cleansed_data = PipelineData("cleansed_data", datastore=blob_datastore).as_dataset()

print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))

# cleansing step creation
# See the cleanse.py for details about input and output
cleansingStep = PythonScriptStep(
    name="Cleanse Bank Marketing Data",
    script_name="prep.py", 
    arguments=["--output_cleanse", cleansed_data],
    inputs=[bank_dataset.as_named_input('bank_dataset')],
    outputs=[cleansed_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    source_directory=prepare_data_folder,
    allow_reuse=True
)

print("cleansingStep created.")


Cleanse script is in /mnt/batch/tasks/shared/LS_root/mounts/clusters/adscompute/code/Users/prsing/BankMarketingAnalysis/ML_Pipelines/scripts/prepdata.
cleansingStep created.


In [None]:
train_model_folder = './scripts/trainmodel'
trainmodel = PythonScriptStep(name="train_step",
                         script_name="./train.py", 
                         arguments=["--train", train_data,"--test", test_data,"--model",model_file],
                         inputs= [cleansed_data.parse_parquet_files(file_extension=None)],
                         outputs=[model_file],                         
                         compute_target=aml_compute, 
                         runconfig=aml_run_config,
                         source_directory=train_model_folder,
                         allow_reuse=True)

In [61]:
import pandas as pd

cars = {'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
        'Price': [22000,25000,27000,35000],
        'check': [22, 33,44, 66]
        }

df = pd.DataFrame(cars, columns = ['Brand', 'Price', 'check'])

df.head()

Unnamed: 0,Brand,Price,check
0,Honda Civic,22000,22
1,Toyota Corolla,25000,33
2,Ford Focus,27000,44
3,Audi A4,35000,66


In [58]:
array = df.values
array

array([['Honda Civic', 22000, 22],
       ['Toyota Corolla', 25000, 33],
       ['Ford Focus', 27000, 44],
       ['Audi A4', 35000, 66]], dtype=object)

In [59]:
 pd.DataFrame(array)

Unnamed: 0,0,1,2
0,Honda Civic,22000,22
1,Toyota Corolla,25000,33
2,Ford Focus,27000,44
3,Audi A4,35000,66


In [60]:
import numpy as np
import argparse
import os
from azureml.core import Run
 
from pandas import read_csv
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# import joblib 

In [62]:

# Model Training
# X = data.drop(columns = 'deposit_cat')
X = df.drop(columns = 'Price').values
# y = df['Price'].values
y = df['Price'].values
X

array([['Honda Civic', 22],
       ['Toyota Corolla', 33],
       ['Ford Focus', 44],
       ['Audi A4', 66]], dtype=object)

In [63]:
 
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.15,
random_state=7)

In [67]:
train=np.column_stack((X_train,Y_train))
test=np.column_stack((X_test,Y_test))

In [68]:
train

array([['Toyota Corolla', 33, 25000],
       ['Honda Civic', 22, 22000],
       ['Audi A4', 66, 35000]], dtype=object)

In [73]:
np.savetxt("train.txt",train)
# np.savetxt(args.test+"/test.txt",test,fmt="%f")

TypeError: Mismatch between array dtype ('object') and format specifier ('%.18e %.18e %.18e')