In [None]:
import pandas as pd
import os
import joblib
from azureml.core import Workspace, Experiment, Dataset, Datastore, Model, Run

In [None]:
### Train ML model -- code carried over from before
### --------------------

# Read the German credit CSV and discard the 'Sno' column so it is not
# picked up as a model feature further down.
df = pd.read_csv("./data/german_credit_dataset.csv").drop(columns='Sno')

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Separate the predictors from the 'Risk' target column.
X_raw = df.drop(columns='Risk')
y_raw = df['Risk']

# Partition the predictor columns by dtype so that each group can be routed
# to its own preprocessing branch.
numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns
categorical_features = X_raw.select_dtypes(include=['object']).columns

# Categorical branch: fill missing values with the literal category "missing",
# then one-hot encode into a dense array.
# handle_unknown='ignore' makes a category that was not present during fit
# encode as all zeros for that feature instead of raising at predict time,
# which matters once the fitted pipeline is used on new data.
# NOTE(review): `sparse` was renamed to `sparse_output` in newer scikit-learn
# releases; kept as-is to match the version this notebook was written for.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('onehotencoder', OneHotEncoder(categories='auto', sparse=False,
                                    handle_unknown='ignore'))])

# Numeric branch: standardise each column.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Route each column group through its branch; columns belonging to neither
# group are dropped (remainder="drop").
feature_engineering_pipeline = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ], remainder="drop")

# Convert the raw risk labels into integer class codes.
# NOTE(review): later cells pass pos_label=0 for the 'Bad' class; confirm the
# mapping with le.classes_ (LabelEncoder orders classes by sorted value).
le = LabelEncoder()
encoded_y = le.fit_transform(y_raw)

# Hold out 25% of the rows for testing, stratified on the encoded label,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw,
    encoded_y,
    test_size=0.25,
    stratify=encoded_y,
    random_state=42,
)

# End-to-end model: the preprocessing ColumnTransformer followed by an
# L2-penalised logistic regression with a fixed random_state.
lr_clf = Pipeline(steps=[
    ('preprocessor', feature_engineering_pipeline),
    ('classifier', LogisticRegression(penalty='l2', solver="lbfgs", random_state=23)),
])

In [None]:
### Connect to the Azure ML workspace
### --------------------

# Workspace.from_config() is called without arguments, so the workspace is
# resolved from the SDK's default config-file lookup.
ws = Workspace.from_config()

In [None]:
### Log experiment
### --------------------

experiment_name = 'german_credit_hsg'
experiment = Experiment(ws, experiment_name)

# Start an interactive run; everything logged below is attached to it.
run = experiment.start_logging()

try:
    # Train the model
    lr_clf.fit(X_train, y_train)

    # Capture metrics. Predictions on the test set are computed once and
    # reused, so the printed and the logged recall are guaranteed to match.
    train_acc = lr_clf.score(X_train, y_train)
    test_acc = lr_clf.score(X_test, y_test)
    y_test_pred = lr_clf.predict(X_test)
    # pos_label=0 selects the class reported as 'Bad' below.
    test_recall = recall_score(y_test, y_test_pred, pos_label=0)

    print("Training accuracy: %.3f" % train_acc)
    print("Test data accuracy: %.3f" % test_acc)
    print("Recall for class 'Bad': ", test_recall)

    # Log to Azure ML
    run.log('Train accuracy', train_acc)
    run.log('Test accuracy', test_acc)
    run.log('Recall', test_recall)
except Exception as exc:
    # Mark the run as failed instead of leaving it stuck in "Running",
    # then surface the original error to the notebook.
    run.fail(error_details=exc)
    raise
else:
    run.complete()

In [None]:
### Store & upload model
### --------------------

# Serialise the fitted pipeline to 'model.pkl' in the working directory.
joblib.dump(lr_clf, 'model.pkl')

# Attach the pickled file to the run under the artifact name 'model.pkl'.
run.upload_file(
    name='model.pkl',
    path_or_stream='./model.pkl',
)

In [None]:
### Register model
### --------------------

# Dataset registered in the workspace that is recorded alongside the model.
ds = Dataset.get_by_name(ws, 'german_credit_hsg')

# Derive the recall tag from the fitted pipeline instead of hard-coding it,
# so the tag cannot drift from the model that is actually registered.
# pos_label=0 matches the class reported as 'Bad' in the logging cell.
model_recall = recall_score(y_test, lr_clf.predict(X_test), pos_label=0)

# Tag values are passed as strings; the recall is formatted to three decimals.
model = run.register_model(model_name='german-credit-hsg',
                           model_path='model.pkl',
                           datasets=[['training-dataset', ds]],
                           tags={"use": "demo", 'recall': "%.3f" % model_recall})