In [42]:
import os
import tarfile
import urllib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from zlib import crc32
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit,GridSearchCV,RandomizedSearchCV
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from scipy import stats

import mlflow
import mlflow.sklearn

In [2]:
# Base URL of the raw handson-ml2 repository content; the dataset archive URL
# below is built from it.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Relative directory "datasets/housing", assembled with the OS path separator.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the compressed housing dataset (housing.tgz).
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

In [24]:

def load_housing_data(csv_path='housing.csv'):
    """Read the housing dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    csv_path : str or path-like, default 'housing.csv'
        Location of the CSV file. The default is resolved relative to the
        current working directory, which is what this function always read
        before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``csv_path``.
    """
    return pd.read_csv(csv_path)

def income_cat_proportions(data):
    """Return the share of rows of ``data`` that fall in each income category.

    The occurrences of every distinct value in the ``"income_cat"`` column are
    counted and divided by the total number of rows in ``data``.
    """
    total_rows = len(data)
    category_counts = data["income_cat"].value_counts()
    return category_counts / total_rows

def eval_metrics(actual, pred):
    """Score predictions against the true values.

    Returns a ``(rmse, mae, r2)`` tuple: the square root of the mean squared
    error, the mean absolute error, and the R^2 score, each computed from
    ``actual`` and ``pred``.
    """
    squared_error = mean_squared_error(actual, pred)
    return (
        np.sqrt(squared_error),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

def _prepare_features(features, imputer, ocean_categories):
    """Impute, add ratio features and one-hot encode a single feature frame.

    ``features`` must still contain ``ocean_proximity``; ``imputer`` must
    already be fitted; ``ocean_categories`` fixes the set and order of dummy
    columns so every frame passed through here gets identical columns.
    """
    numeric = features.drop("ocean_proximity", axis=1)
    prepared = pd.DataFrame(
        imputer.transform(numeric), columns=numeric.columns, index=features.index
    )
    prepared["rooms_per_household"] = (
        prepared["total_rooms"] / prepared["households"]
    )
    prepared["bedrooms_per_room"] = (
        prepared["total_bedrooms"] / prepared["total_rooms"]
    )
    prepared["population_per_household"] = (
        prepared["population"] / prepared["households"]
    )

    # Encode against a fixed category list instead of whatever values happen
    # to be present in this frame, so train and test cannot end up with
    # different (or differently dropped) dummy columns.
    ocean = pd.DataFrame(
        {
            "ocean_proximity": pd.Categorical(
                features["ocean_proximity"], categories=ocean_categories
            )
        },
        index=features.index,
    )
    return prepared.join(pd.get_dummies(ocean, drop_first=True))


def data_prep_function(housing):
    """Split the housing data and build model-ready train/test matrices.

    Steps:
      1. Bin ``median_income`` into five ``income_cat`` classes and use them
         for a single stratified 80/20 shuffle split (``random_state=42``).
      2. Fit a median imputer on the training features only.
      3. For both splits: impute, add ``rooms_per_household``,
         ``bedrooms_per_room`` and ``population_per_household``, and one-hot
         encode ``ocean_proximity`` (first category dropped).

    Parameters
    ----------
    housing : pandas.DataFrame
        Must contain ``median_income``, ``median_house_value``,
        ``ocean_proximity``, ``total_rooms``, ``total_bedrooms``,
        ``population`` and ``households``. The caller's frame is not modified.

    Returns
    -------
    tuple
        ``(housing_prepared, X_test_prepared, housing_labels, test_labels)``:
        prepared training features, prepared test features, training labels
        and test labels (``median_house_value``).
    """
    # assign() returns a new frame, so the stratification column is not
    # written into the DataFrame owned by the caller.
    housing = housing.assign(
        income_cat=pd.cut(
            housing["median_income"],
            bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
            labels=[1, 2, 3, 4, 5],
        )
    )

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(split.split(housing, housing["income_cat"]))
    # split() yields positional indices, so select with iloc; loc would only
    # be correct when the frame happens to carry a default RangeIndex.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

    # NOTE(review): income_cat is left in the feature sets, as before; confirm
    # whether it is meant to be a model input or only a stratification key.
    train_features = strat_train_set.drop("median_house_value", axis=1)
    test_features = strat_test_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()
    test_labels = strat_test_set["median_house_value"].copy()

    # Fit preprocessing state on the training split only, then reuse it for
    # the test split.
    imputer = SimpleImputer(strategy="median")
    imputer.fit(train_features.drop("ocean_proximity", axis=1))
    ocean_categories = pd.Categorical(train_features["ocean_proximity"]).categories

    housing_prepared = _prepare_features(train_features, imputer, ocean_categories)
    X_test_prepared = _prepare_features(test_features, imputer, ocean_categories)
    return housing_prepared, X_test_prepared, housing_labels, test_labels


In [43]:
def train(alpha=0.5, l1_ratio=0.5, experiment_name="housing_prices_v4"):
    """Train an ElasticNet on the housing data and track the run in MLflow.

    A parent run is opened with two nested child runs: one that loads and
    prepares the data, and one that fits the model, prints and logs the
    evaluation metrics, and logs the fitted model as an artifact.

    Parameters
    ----------
    alpha : float, default 0.5
        ``alpha`` passed to ``ElasticNet``.
    l1_ratio : float, default 0.5
        ``l1_ratio`` passed to ``ElasticNet``.
    experiment_name : str, default "housing_prices_v4"
        MLflow experiment the runs are recorded under. It is created on first
        use and reused afterwards.

    Returns
    -------
    None
    """
    # create_experiment() fails when the name is already taken, which made the
    # function impossible to call twice; look the experiment up first and only
    # create it when it does not exist yet.
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        experiment_id = mlflow.create_experiment(experiment_name)
    else:
        experiment_id = experiment.experiment_id

    with mlflow.start_run(
        run_name="PARENT_RUN",
        experiment_id=experiment_id,
        tags={"version": "v1", "priority": "P1"},
        description="housing_prices_prediction",
    ):
        mlflow.log_param("parent", "yes")
        # NOTE(review): the run name below is misspelled; it is kept unchanged
        # so lookups of existing runs by name keep matching.
        with mlflow.start_run(
            run_name="data_preparattion",
            experiment_id=experiment_id,
            description="data preparation step",
            nested=True,
        ):
            housing = load_housing_data()
            housing_prepared, X_test_prepared, housing_labels, test_labels = data_prep_function(housing)
        with mlflow.start_run(
            run_name="modeling",
            experiment_id=experiment_id,
            description="modeling phase",
            nested=True,
        ):
            lin_reg = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
            lin_reg.fit(housing_prepared, housing_labels)
            predicted_labels = lin_reg.predict(X_test_prepared)
            rmse, mae, r2 = eval_metrics(test_labels, predicted_labels)
            # Print out metrics
            print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
            print("  RMSE: %s" % rmse)
            print("  MAE: %s" % mae)
            print("  R2: %s" % r2)

            # Log parameters, metrics, and model to MLflow
            mlflow.log_param(key="alpha", value=alpha)
            mlflow.log_param(key="l1_ratio", value=l1_ratio)
            mlflow.log_metric(key="rmse", value=rmse)
            mlflow.log_metrics({"mae": mae, "r2": r2})

            print("Save to: {}".format(mlflow.get_artifact_uri()))

            mlflow.sklearn.log_model(lin_reg, "model")


In [45]:
# Run one tracked training pass with the regularisation settings spelled out.
train(alpha=0.5, l1_ratio=0.5)

Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 68703.68065600509
  MAE: 51273.115770932345
  R2: 0.6378315540155282
Save to: file:///home/rishi/mlflow_temp/mlruns/483745647156644020/c2a8ad7d901a4d9aacf599fbf1866771/artifacts
