### Use MLflow to track the parameters for your housing library code. You already have scripts for data preparation, model training, and model scoring. Use MLflow to track the parameters and any useful metrics in these scripts. Also, create a main script that runs everything together under a single parent MLflow run ID. Each of the child tasks (e.g. data preparation, model training) should get its own MLflow run ID but run as a child run of the main run. See the documentation of the `start_run` function for how to create nested runs. Create a PR with your changes and submit it.

In [1]:
# import library
import os
import tarfile
import urllib
import pandas as pd
import numpy as np
%matplotlib inline   
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn

In [2]:
# Point the MLflow client at a tracking server so runs are recorded there.
# Shell command (run outside this notebook) that can be used to start one:
#   mlflow server --backend-store-uri mlruns/ --default-artifact-root mlruns/ --host 0.0.0.0 --port 5000
remote_server_uri = "http://127.0.0.1:5000" # set to your server URI
mlflow.set_tracking_uri(remote_server_uri)  # or set the MLFLOW_TRACKING_URI in the env

In [3]:
# Select the experiment that the runs started below are logged under.
# Per the recorded output of this cell, MLflow creates the experiment when no
# experiment with this name exists yet.
exp_name = "House_price_prediction"
mlflow.set_experiment(exp_name)

2023/04/05 12:55:09 INFO mlflow.tracking.fluent: Experiment with name 'House_price_prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/2', creation_time=1680679509641, experiment_id='2', last_update_time=1680679509641, lifecycle_stage='active', name='House_price_prediction', tags={}>

### Download data

In [4]:
# Location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into *housing_path*.

    The archive is saved as ``housing.tgz`` inside *housing_path* (created if
    missing) and then extracted into that same directory.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives the archive and its contents.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter
            # supports extraction filters, refuse members that would escape
            # the target directory or create special files.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)


In [5]:
#fetch_housing_data()

In [6]:
# Load the housing CSV into a pandas DataFrame.
def load_housing_data(housing_path=HOUSING_PATH):
    """Return the contents of ``housing.csv`` under *housing_path* as a DataFrame."""
    housing_csv = os.path.join(housing_path, "housing.csv")
    housing_df = pd.read_csv(housing_csv)
    return housing_df

In [12]:
# scikit-learn pieces used by this cell, gathered in one place.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor


def display_scores(scores):
    """Print the per-fold scores together with their mean and standard deviation."""
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())


# Parent run for the whole workflow. Every stage below opens its own run with
# ``nested=True`` so that it is recorded as a child of this run.
with mlflow.start_run(run_name="House_price_prediction"):

    housing = load_housing_data()
    mlflow.log_param("read_data", 1)
    # NOTE: the split is computed here, but the stages below fit and
    # cross-validate on the full dataset, not on ``train_set``.
    train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

    # Stage 1: derive extra features and separate the target column.
    with mlflow.start_run(run_name="data_prepare", nested=True):
        housing["income_cat"] = pd.cut(
            housing["median_income"],
            bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
            labels=[1, 2, 3, 4, 5],
        )
        housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
        housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
        housing["population_per_household"] = housing["population"] / housing["households"]

        # ``housing1`` keeps the full frame (target and income_cat included);
        # ``housing`` becomes the feature frame without those two columns.
        housing1 = housing
        housing = housing1.drop(["median_house_value", "income_cat"], axis=1)
        housing_labels = housing1["median_house_value"].copy()
        mlflow.log_param("data_shape", housing.shape)

    # Stage 2: impute, encode and scale the features.
    with mlflow.start_run(run_name="data_cleaning", nested=True):
        median = housing["total_bedrooms"].median()  # option 3
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on ``housing["total_bedrooms"]``: the chained in-place form does not
        # update the frame under pandas Copy-on-Write.
        housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

        imputer = SimpleImputer(strategy="median")
        housing_num = housing.drop("ocean_proximity", axis=1)
        imputer.fit(housing_num)
        X = imputer.transform(housing_num)
        housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                                  index=housing_num.index)

        housing_cat = housing[["ocean_proximity"]]
        ordinal_encoder = OrdinalEncoder()
        housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
        cat_encoder = OneHotEncoder()
        housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

        num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ])
        housing_num_tr = num_pipeline.fit_transform(housing_num)

        num_attribs = list(housing_num)
        cat_attribs = ["ocean_proximity"]

        # Numeric columns go through the numeric pipeline; the categorical
        # column is one-hot encoded.
        full_pipeline = ColumnTransformer([
            ("num", num_pipeline, num_attribs),
            ("cat", OneHotEncoder(), cat_attribs),
        ])

        housing_prepared = full_pipeline.fit_transform(housing)
        mlflow.log_param("data_shape", housing_prepared.shape)

    # Stage 3: fit both models and cross-validate them.
    with mlflow.start_run(run_name="model_training", nested=True):
        lin_reg = LinearRegression()
        lin_reg.fit(housing_prepared, housing_labels)

        tree_reg = DecisionTreeRegressor()
        tree_reg.fit(housing_prepared, housing_labels)
        housing_predictions = tree_reg.predict(housing_prepared)

        # One fold count, used for the logged parameter and for both
        # cross-validations so they cannot drift apart.
        cv = 10
        mlflow.log_param("number_of_fold", cv)

        scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                                 scoring="neg_mean_squared_error", cv=cv)
        # The scorer returns negated MSE; negate and take the root to get RMSE.
        tree_rmse_scores = np.sqrt(-scores)
        display_scores(tree_rmse_scores)
        mlflow.log_metric("tree_rmse_mean", tree_rmse_scores.mean())
        mlflow.log_metric("tree_rmse_std", tree_rmse_scores.std())

        lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                                     scoring="neg_mean_squared_error", cv=cv)
        lin_rmse_scores = np.sqrt(-lin_scores)
        # Metric keys are kept as-is so earlier runs remain comparable; the
        # values are the mean/std of the linear model's per-fold RMSE.
        mlflow.log_metric("mean_of_mean_squar_error", lin_rmse_scores.mean())
        mlflow.log_metric("std_of_mean_squar_error", lin_rmse_scores.std())

Scores: [118185.0373804   69988.32636693  82703.45099603  74262.4992966
  89608.19120531  81062.90690825  66958.32707689 101452.44849817
  93194.25525259  72014.8909069 ]
Mean: 84943.03338880667
Standard deviation: 15217.794475188668
