#### AutoMl Pipeline Implementation

In [1]:
import mlflow
import mlflow.sklearn
import numpy as np
import os
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import StratifiedShuffleSplit
from urllib.parse import urlparse
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

import tarfile
import pandas as pd
from scipy.stats import randint
from six.moves import urllib
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedShuffleSplit,
    train_test_split,
)
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
%matplotlib inline  
from zlib import crc32
from pandas.plotting import scatter_matrix
from sklearn.model_selection import cross_val_score
from scipy import stats

In [2]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    # Two Important method that should be there are fit and transform
    def fit(self, X, y=None):
        return self  # nothing else to do

    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[
                X, rooms_per_household, population_per_household, bedrooms_per_room
            ]

        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [3]:
def load_data():
    housing = pd.read_csv("housing.csv")
    return housing

In [4]:
def eval_metrics(actual, pred):
    mse = mean_squared_error(actual, pred)
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2, mse

In [5]:
def run(model,experiment_name):
    
    # Create nested runs
    warnings.filterwarnings("ignore")
    np.random.seed(40)
    experiment_id = mlflow.create_experiment(experiment_name)
    with mlflow.start_run(run_name='Housing MV Prediction',
                          description="parent",experiment_id=experiment_id,tags={"version": "v1", "priority": "P1"}) as parent_run:
        mlflow.log_param("parent", "yes")
        housing = load_data()
        housing["income_cat"] = pd.cut(
            housing["median_income"],
            bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
            labels=[1, 2, 3, 4, 5],
        )

        split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        for train_index, test_index in split.split(housing, housing["income_cat"]):
            strat_train_set = housing.loc[train_index]
            strat_test_set = housing.loc[test_index]
        
        housing = strat_train_set.copy()
        housing = strat_train_set.drop("median_house_value", axis=1)
        housing_labels = strat_train_set["median_house_value"].copy()

        housing_test = strat_test_set.copy()
        housing_test = strat_test_set.drop("median_house_value", axis=1)
        housing_test_label = strat_test_set["median_house_value"].copy()

        # Data Preparation
        with mlflow.start_run(run_name='Data Preparation',experiment_id=experiment_id, nested=True) as child_run:
            mlflow.log_param("child", "yes")

            housing_num = housing.drop("ocean_proximity", axis=1)
            
            num_attribs = list(housing_num)
            cat_attribs = ["ocean_proximity"]

            num_pipeline = Pipeline(
                [
                    (
                        "imputer",
                        SimpleImputer(strategy="median"),
                    ),  # 1st replace all null data with median
                    (
                        "attribs_adder",
                        CombinedAttributesAdder(),
                    ),  # 2nd add household per room column
                    (
                        "std_scaler",
                        StandardScaler(),
                    ),  # Standardise data for feature scaling. This must be a transformer.
                ]
            )

            full_pipeline = ColumnTransformer(
                [
                    ("num", num_pipeline, num_attribs),
                    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
                ]
            )

            housing_prepared = full_pipeline.fit_transform(housing)


        # Model Training
        with mlflow.start_run(run_name='Model Training',experiment_id=experiment_id,nested=True) as child_run:
            mlflow.log_param("child", "yes")
            print()
            
            model.fit(housing_prepared, housing_labels)

            housing_test_prepared = full_pipeline.fit_transform(housing_test)
            housing_predictions = model.predict(housing_test_prepared)


        # Model Scoring
        with mlflow.start_run(run_name='Model Scoring',experiment_id=experiment_id, nested=True) as child_run:
            mlflow.log_param("child", "yes")

            (rmse, mae, r2, mse) = eval_metrics(housing_test_label, housing_predictions)
            print(type(model).__name__)
            print("  RMSE: %s" % rmse)
            print("  MAE: %s" % mae)
            print("  R2: %s" % r2)
            print(" MSE: %s" %mse)

            mlflow.log_metric(key="rmse", value=rmse)
            mlflow.log_metrics({"mae": mae, "r2": r2, "mse":mse})
            
            print("Save to: {}".format(mlflow.get_artifact_uri()))
            
            mlflow.sklearn.log_model(model, "model")
            
            #Printing the Parent and Child Information
            print("parent run:")
            print("run_id: {}".format(parent_run.info.run_id))
            print("--")
            # Search all child runs with a parent id
            query = "tags.mlflow.parentRunId = '{}'".format(parent_run.info.run_id)
            results = mlflow.search_runs(experiment_ids=[experiment_id], filter_string=query)
            print("child runs:")
            print(results[["run_id", "params.child", "tags.mlflow.runName"]])


In [6]:
run(LinearRegression(),'Lin_Reg')


LinearRegression
  RMSE: 66795.11756711759
  MAE: 49467.926453818494
  R2: 0.6576738793109655
 MSE: 4461587730.80506
Save to: file:///home/tiger03083/my_project/mle-training-1/Assignment_3.2/mlruns/985764118046855869/531c31e977e2479690130d50e0772f08/artifacts
parent run:
run_id: 7f1dc13562c34e35a784c3b811e107f7
--
child runs:
                             run_id params.child tags.mlflow.runName
0  531c31e977e2479690130d50e0772f08          yes       Model Scoring
1  9bb6d7457a7b45768b5eca61dd6834db          yes      Model Training
2  ddaea31232ad4841abdf3eca29cc40a0          yes    Data Preparation


In [7]:
run(DecisionTreeRegressor(),'Dec_Tree')


DecisionTreeRegressor
  RMSE: 110436.17442892361
  MAE: 76098.31928294574
  R2: 0.06422096858947435
 MSE: 12196148622.49564
Save to: file:///home/tiger03083/my_project/mle-training-1/Assignment_3.2/mlruns/272432203101498148/a86dbfcbb45a4670ae9c1916c154358a/artifacts
parent run:
run_id: 27250e3d7a1a429da2f5422434be4912
--
child runs:
                             run_id params.child tags.mlflow.runName
0  a86dbfcbb45a4670ae9c1916c154358a          yes       Model Scoring
1  0db63448128d49c18961a3bcb4735143          yes      Model Training
2  a7d7582d08fd47f3842c66bcbea5d1f3          yes    Data Preparation


In [8]:
import mlflow

# Create nested runs
experiment_id = mlflow.create_experiment("experiment1")
with mlflow.start_run(
    run_name="PARENT_RUN",
    experiment_id=experiment_id,
    tags={"version": "v1", "priority": "P1"},
    description="parent",
) as parent_run:
    mlflow.log_param("parent", "yes")
    with mlflow.start_run(
        run_name="CHILD_RUN",
        experiment_id=experiment_id,
        description="child",
        nested=True,
    ) as child_run:
        mlflow.log_param("child", "yes")

print("parent run:")

print("run_id: {}".format(parent_run.info.run_id))
print("description: {}".format(parent_run.data.tags.get("mlflow.note.content")))
print("version tag value: {}".format(parent_run.data.tags.get("version")))
print("priority tag value: {}".format(parent_run.data.tags.get("priority")))
print("--")

# Search all child runs with a parent id
query = "tags.mlflow.parentRunId = '{}'".format(parent_run.info.run_id)
results = mlflow.search_runs(experiment_ids=[experiment_id], filter_string=query)
print("child runs:")
print(results[["run_id", "params.child", "tags.mlflow.runName"]])

parent run:
run_id: 16a78a1014f148ffad0d2a0f71be7317
description: parent
version tag value: v1
priority tag value: P1
--
child runs:
                             run_id params.child tags.mlflow.runName
0  19fac90c6e774228ba2bd401d156dcf3          yes           CHILD_RUN
