In [1]:
import os
import socket
import subprocess
import tarfile
import time
import urllib
import urllib.request

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

import mlflow
import mlflow.sklearn

In [2]:
# Show the kernel's working directory; the relative paths used in this
# notebook ("my_env", "datasets/housing") are resolved against it.
os.getcwd()

'C:\\Users\\sameer.malhotra\\Training_TAMLEP_A1_2\\MLflow'

In [3]:
# Create a virtual environment named "my_env" in the current directory.
# NOTE(review): this does not change the interpreter the kernel is already
# running on -- confirm the environment is actually used somewhere.
!virtualenv my_env

created virtual environment CPython3.10.9.final.0-64 in 6646ms
  creator CPython3Windows(dest=C:\Users\sameer.malhotra\Training_TAMLEP_A1_2\MLflow\my_env, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=C:\Users\sameer.malhotra\AppData\Local\pypa\virtualenv)
    added seed packages: pip==23.1.2, setuptools==67.8.0, wheel==0.40.0
  activators BashActivator,BatchActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator


In [4]:
# Run the Windows activation script of the "my_env" environment.
# NOTE(review): `!` executes the command in a separate shell, so this
# presumably has no effect on the running kernel -- confirm whether this
# cell is needed at all.
!.\my_env\Scripts\activate

In [14]:
# Start the MLflow UI / tracking server as a background process.
#
# Running `!mlflow ui` in the foreground blocks the kernel until the cell is
# interrupted, and interrupting it stops the server that the following cells
# talk to. Launching it with Popen keeps the server alive while the rest of
# the notebook runs, so the notebook survives "Restart & Run All".
MLFLOW_UI_HOST = "localhost"
MLFLOW_UI_PORT = 5000  # must match the port used in the tracking URI below


def _mlflow_ui_is_up(timeout=0.5):
    """Return True if something accepts TCP connections on the MLflow UI port."""
    try:
        with socket.create_connection((MLFLOW_UI_HOST, MLFLOW_UI_PORT),
                                      timeout=timeout):
            return True
    except OSError:
        return False


# Reuse a server that is already running (e.g. started from a terminal)
# instead of failing to bind the same port a second time.
if not _mlflow_ui_is_up():
    mlflow_ui_process = subprocess.Popen(
        ["mlflow", "ui", "--port", str(MLFLOW_UI_PORT)]
    )
    # Poll for up to ~30 s so that later cells do not race the server start-up.
    for _ in range(60):
        if _mlflow_ui_is_up() or mlflow_ui_process.poll() is not None:
            break
        time.sleep(0.5)
    if not _mlflow_ui_is_up():
        raise RuntimeError(
            "MLflow UI did not start on port {}".format(MLFLOW_UI_PORT)
        )

^C


In [15]:
# Point the MLflow client at the tracking server running on this machine.
server_url = "http://localhost:5000/"
mlflow.set_tracking_uri(uri=server_url)
# Autologging is intentionally left off (mlflow.autolog() is not called);
# parameters, metrics and the model are logged explicitly in the training cell.

In [20]:
# Sanity check: display the tracking URI the MLflow client is currently using.
mlflow.get_tracking_uri()

'http://localhost:5000'

In [17]:
# Select the experiment that subsequent runs are recorded under.
# The returned Experiment object is the cell's displayed value.
experiment_name = "housing_value_exp1"
mlflow.set_experiment(experiment_name=experiment_name)

2023/06/29 16:02:12 INFO mlflow.tracking.fluent: Experiment with name 'housing_value_exp1' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/378577081214599708', creation_time=1688034734226, experiment_id='378577081214599708', last_update_time=1688034734226, lifecycle_stage='active', name='housing_value_exp1', tags={}>

In [18]:
# Location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and its extracted contents;
        it is created if it does not exist.

    Returns
    -------
    None
        The function is used for its side effects on disk only.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # Relies on `urllib.request` being imported explicitly; a bare
    # `import urllib` does not guarantee the submodule is loaded.
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths inside the
        # downloaded archive -- only use with a trusted source.
        housing_tgz.extractall(path=housing_path)
    
    
# Column positions read by CombinedAttributesAdder.transform.
# NOTE(review): presumably these match the column order of the numeric
# housing features passed through the pipeline -- confirm against the CSV.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D feature array.

    Always adds rooms-per-household and population-per-household; when
    ``add_bedrooms_per_room`` is true it also adds bedrooms-per-room.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]

        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)

        # Same call as np.c_[X, col1, col2, ...]: the index is a tuple of arrays.
        return np.c_[(X, *derived)]
    

In [19]:

# Download the archive and load the raw housing table.
fetch_housing_data()
housing = pd.read_csv(os.path.join(HOUSING_PATH, "housing.csv"))

# Temporary income bucket, used only to stratify the train/test split.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) pair directly.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields positional row indices, hence .iloc rather than .loc.
# drop() without inplace returns new frames, so no slice of `housing` is
# mutated and re-running the cell stays warning-free.
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")

# Training features / labels, plus the numeric-only view of the features.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1)


with mlflow.start_run() as Run:
    # Numeric branch: median imputation -> derived ratio columns -> scaling.
    num_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("attribs_adder", CombinedAttributesAdder()),
            ("std_scaler", StandardScaler()),
        ]
    )

    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    # Numeric columns go through num_pipeline, the categorical one is one-hot encoded.
    full_pipeline = ColumnTransformer(
        transformers=[
            ("num", num_pipeline, num_attribs),
            ("cat", OneHotEncoder(), cat_attribs),
        ]
    )
    housing_prepared = full_pipeline.fit_transform(housing)

    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    # RMSE is computed on the same data the model was fitted on.
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)

    # Record the run: parameter, dataset directory, metric and fitted model.
    mlflow.log_params({"imputer": "median"})
    mlflow.log_artifacts(HOUSING_PATH)
    mlflow.log_metrics({"rmse": lin_rmse})
    mlflow.sklearn.log_model(lin_reg, "model")

    print(f"Run ID: {Run.info.run_id}")


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Run ID: 36dad8ef3a844041b0e0db00004c78a0
