In [2]:
import mlflow
import os


# see the named configure in aws configure
os.environ["AWS_PROFILE"] = "profilename" 

TRACKING_SERVER_HOST = "Public IPv4 DNS" # fill in with the public DNS of the EC2 instance
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

First, run command `mlflow server -h 0.0.0.0 -p 5000 --backend-store-uri sqlite:///mlflow_uk_house.db --default-artifact-root s3://mlopszoomcamp-bucket`

In [None]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

In [4]:
mlflow.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='prefect-forecasting', tags={}>,
 <Experiment: artifact_location='s3://mlopszoomcamp-bucket/', experiment_id='4', lifecycle_stage='active', name='forecasting', tags={}>]

In [5]:
!aws s3 ls

2022-05-28 13:26:21 mlopszoomcamp-bucket


In [6]:
mlflow.get_artifact_uri()

'./mlruns/0/cc43081781244f61a677a007fcbe3971/artifacts'

List of places in UK

In [52]:
import pandas as pd
from pathlib import Path

col1 = ["Average_Price", "Average_Price_SA"]
col2 = ["Monthly_Change", "Annual_Change"]
NEW_PATH = os.path.join(Path.cwd().parents[3],"data","uk_house_price")

print(NEW_PATH)

/home/ubuntu/data/uk_house_price


In [53]:
DATA_PATH = os.path.join(NEW_PATH, 'Average_price-2022-02_from2000.csv')
df = pd.read_csv(DATA_PATH)
df2 = df.drop("Unnamed: 0", axis=1)
df2["Date"] = pd.to_datetime(df2.Date)
df2[col1] = df2[col1].astype("float32")
df2[col2] = df2[col2].astype("float16")

In [55]:
sorted(df2['Region_Name'].unique())

['Aberdeenshire',
 'Adur',
 'Allerdale',
 'Amber Valley',
 'Angus',
 'Antrim and Newtownabbey',
 'Ards and North Down',
 'Argyll and Bute',
 'Armagh City Banbridge and Craigavon',
 'Arun',
 'Ashfield',
 'Ashford',
 'Babergh',
 'Barking and Dagenham',
 'Barnet',
 'Barnsley',
 'Barrow-in-Furness',
 'Basildon',
 'Basingstoke and Deane',
 'Bassetlaw',
 'Bath and North East Somerset',
 'Bedford',
 'Belfast',
 'Bexley',
 'Birmingham',
 'Blaby',
 'Blackburn with Darwen',
 'Blackpool',
 'Blaenau Gwent',
 'Bolsover',
 'Bolton',
 'Boston',
 'Bournemouth Christchurch and Poole',
 'Bracknell Forest',
 'Bradford',
 'Braintree',
 'Breckland',
 'Brent',
 'Brentwood',
 'Bridgend',
 'Brighton and Hove',
 'Broadland',
 'Bromley',
 'Bromsgrove',
 'Broxbourne',
 'Broxtowe',
 'Buckinghamshire',
 'Burnley',
 'Bury',
 'Caerphilly',
 'Calderdale',
 'Cambridge',
 'Cambridgeshire',
 'Camden',
 'Cannock Chase',
 'Canterbury',
 'Cardiff',
 'Carlisle',
 'Carmarthenshire',
 'Castle Point',
 'Causeway Coast and Glen

In [7]:
import os, datetime
import pandas as pd
import numpy as np
from prophet import Prophet
import statsmodels.api as sm

from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

import mlflow
import prefect
from prefect import task, flow, get_run_logger
from prefect.task_runners import SequentialTaskRunner

In [8]:
def get_paths() -> str:
    PATH_CURRENT = Path.cwd()
    NEW_PATH = os.path.join(PATH_CURRENT.parents[3], "data", "uk_house_price")
    DATA_PATH = os.path.join(NEW_PATH, "Average_price-2022-02_from2000.csv")
    return DATA_PATH

def read_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    df2 = df.drop("Unnamed: 0", axis=1)
    df2["Date"] = pd.to_datetime(df2.Date)
    col1 = ["Average_Price", "Average_Price_SA"]
    col2 = ["Monthly_Change", "Annual_Change"]
    df2[col1] = df2[col1].astype("float32")
    df2[col2] = df2[col2].astype("float16")
    return df2

def data_split(df: pd.DataFrame, region_input: str):
    df = df[df["Region_Name"] == region_input]
    split_date = "2018-01-01"
    df_train = df.loc[df["Date"] <= split_date].copy()
    df_test = df.loc[df["Date"] > split_date].copy()
    df_train = df_train[["Date", "Average_Price"]].rename(
        columns={"Date": "ds", "Average_Price": "y"}
    )
    df_test = df_test[["Date", "Average_Price"]].rename(
        columns={"Date": "ds", "Average_Price": "y"}
    )

    return df_train, df_test

def train_data(df_train: pd.DataFrame, region_input: str):

    model = Prophet()
    model.fit(df_train)

    y_forecast = model.predict(df_train)

    y_pred_low, y_pred_up = max(y_forecast["yhat_lower"]), max(y_forecast["yhat_upper"])

    y_pred_additive, y_pred_low_additive, y_pred_up_additive = (
        max(y_forecast["additive_terms"]),
        max(y_forecast["additive_terms_lower"]),
        max(y_forecast["additive_terms_upper"]),
    )
    
    return model

def evaluation(df_test: pd.DataFrame, model):

    y_predict = model.predict(df_test)
    y_pred_low, y_pred_up = max(y_predict["yhat_lower"]), max(y_predict["yhat_upper"])

    y_pred_additive, y_pred_low_additive, y_pred_up_additive = (
        max(y_predict["additive_terms"]),
       max(y_predict["additive_terms_lower"]),
        max(y_predict["additive_terms_upper"]),
    )

    return y_predict

In [15]:
s3_bucket = "s3://mlopszoomcamp-bucket/UK_house_price/"

In [16]:
mlflow.set_tracking_uri("sqlite:///mlflow_uk_house.db")
mlflow.create_experiment("UK_forecasting", s3_bucket)
mlflow.set_experiment("UK_forecasting")

data_path = get_paths()
df = read_data(data_path)

In [48]:
with mlflow.start_run(run_name="Prophet_forecasting"):

    mlflow.set_tag("Region", "Leeds")
    newcastle_train, newcastle_test = data_split(df, "Leeds")
    model = train_data(newcastle_train, "Leeds")
    y_predict = evaluation(newcastle_test, model)
    mlflow.log_metric("yhat_lower", max(y_predict["yhat_lower"]))
    mlflow.log_metric("yhat_upper", max(y_predict["yhat_upper"]))

    mlflow.log_metric("additive_terms", max(y_predict["additive_terms"]))
    mlflow.log_metric("additive_terms_lower", max(y_predict["additive_terms_lower"]))
    mlflow.log_metric("additive_terms_upper", max(y_predict["additive_terms_upper"]))


    mlflow.prophet.log_model(model, artifact_path="models_prophet")
    print(f"Default artifacts URI: '{mlflow.get_artifact_uri()}'")

mlflow.end_run()

13:12:52 - cmdstanpy - INFO - Chain [1] start processing
13:12:52 - cmdstanpy - INFO - Chain [1] done processing


Default artifacts URI: 's3://mlopszoomcamp-bucket/UK_house_price/63e642bf3ad44eb090c2d59f896a3165/artifacts'


## Download artifact with client

In [24]:
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URi = "http://ec2-35-177-90-140.eu-west-2.compute.amazonaws.com:5000"
RUN_ID = "67d71c7dec044fa8a64bf8e4a0b536dc"

client = MlflowClient(tracking_uri=f"http://{TRACKING_SERVER_HOST}:5000")

In [26]:
client.download_artifacts(run_id=RUN_ID, path='')

'/tmp/tmpluv4u7z7/'

In [34]:
ARTIFACT_URI = "s3://mlopszoomcamp-bucket/UK_house_price/67d71c7dec044fa8a64bf8e4a0b536dc/artifacts/models_prophet/model.pr"

# path = client.download_artifacts(artifact_uri=ARTIFACT_URI, run_id=RUN_ID)

In [37]:
load_model = mlflow.artifacts.download_artifacts(artifact_uri=ARTIFACT_URI)

load_model

'/tmp/tmpjr9x2w62/model.pr'

In [38]:
import pickle

with open(load_model, 'rb') as f_out:
    loaded_artifact = pickle.load(f_out)

UnpicklingError: invalid load key, '"'.

In [21]:
client.list_registered_models()

[]

### Load trained model from MLflow artifact

In [57]:
RUN_ID = "836ad6fb72de4878a6cafb093bbeb2e3"

logged_model = f'runs:/{RUN_ID}/models_prophet'
model_load = mlflow.pyfunc.load_model(logged_model)

  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]))


In [58]:
barnsley_train, barnsley_test = data_split(df, "Barnsley")
y_hat = evaluation(barnsley_test, model_load)

In [59]:
y_hat_additive, y_hat_low_additive, y_hat_up_additive = (
        max(y_hat["additive_terms"]),
       max(y_hat["additive_terms_lower"]),
        max(y_hat["additive_terms_upper"]),
    )

In [60]:
print(y_hat_additive)
print(y_hat_low_additive)
print(y_hat_up_additive)

1108.7033434177679
1108.7033434177679
1108.7033434177679


### Testing Flask on MLFlow Tracking Server (cont. from above)

In [73]:
import requests
from datetime import datetime, date

today = date.today()
input_date = datetime.strptime("2019-01-19", "%Y-%m-%d")

df_test = {
    "ds": [today.strftime("%Y-%m-%d")],
    "y": [250000]
}

features = {}
features['ds'] = df_test['ds']
features['y'] = df_test['y']

In [74]:
features

{'ds': ['2022-08-16'], 'y': [250000]}

In [75]:
feature_df = pd.DataFrame.from_dict(features)

In [76]:
y_hat_feature = evaluation(feature_df, model_load)

In [77]:
y_hat_feature['yhat_lower']
y_hat_feature['yhat_upper']
y_hat_feature['y_hat']

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2022-08-16,133448.435609,108242.751517,166478.603978,106027.198984,164232.798517,2385.043976,2385.043976,2385.043976,2385.043976,2385.043976,2385.043976,0.0,0.0,0.0,135833.479586


In [87]:
url = 'http://localhost:9696/predict'
response = requests.post(url, json=df_test)
print(response.json())

{'model_version': '836ad6fb72de4878a6cafb093bbeb2e3', 'y_hat': 135833.47958562724}


In [82]:
from pandas import Timestamp

result_list = [
                {'ds': Timestamp('2022-08-16 00:00:00'), 'trend': 133448.43560938345, 
                'yhat_lower': 106377.57638399057, 'yhat_upper': 167368.69821939082, 
                'trend_lower': 103667.75665957846, 'trend_upper': 165157.83761481557, 
                'additive_terms': 2385.0439762437973, 'additive_terms_lower': 2385.0439762437973,
                'additive_terms_upper': 2385.0439762437973, 'yearly': 2385.0439762437973,
                'yearly_lower': 2385.0439762437973, 'yearly_upper': 2385.0439762437973, 
                'multiplicative_terms': 0.0, 'multiplicative_terms_lower': 0.0, 
                'multiplicative_terms_upper': 0.0, 'yhat': 135833.47958562724
            }
        ]

In [93]:
datetime.strftime(result_list[0]['ds'], "%Y-%m-%d")

'2022-08-16'

### Alternative : Testing Flask on Loaded Model from S3

In [96]:
RUN_ID = "836ad6fb72de4878a6cafb093bbeb2e3"

logged_model = f's3://mlopszoomcamp-bucket/UK_house_price/{RUN_ID}/artifacts/models_prophet'
model_load_s3 = mlflow.pyfunc.load_model(logged_model)

input_date = datetime.strptime("2019-01-25", "%Y-%m-%d")

df_test = {
    "ds": [input_date.strftime("%Y-%m-%d")],
    "y": [270000]
}

features = {}
features['ds'] = df_test['ds']
features['y'] = df_test['y']

feature_df = pd.DataFrame.from_dict(features)

y_hat_feature = evaluation(feature_df, model_load_s3)

  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]))


In [97]:
url = 'http://localhost:9696/predict'
response = requests.post(url, json=df_test)
print(response.json())

{'date': '2019-01-25', 'model_version': '836ad6fb72de4878a6cafb093bbeb2e3', 'y_hat': 122701.50271087937, 'y_hat_lower': 119736.21396832094, 'y_hat_upper': 125906.64659605008}
