# House Price Prediction

In [9]:
import pandas as pd
import numpy as np

import google.cloud.aiplatform as aip
from google.cloud import storage
import gcsfs

from typing import NamedTuple

from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        Markdown)

from kfp.v2 import compiler


In [4]:
BUCKET_URI = 'gs://mle-gcp-tbk-1'
PIPELINE_ROOT = "{}/pipeline_root/house_price".format(BUCKET_URI)

In [5]:
PROJECT_ID = "tiger-mle"
REGION = "us-east1"
ZONE = ""

In [6]:
! gcloud config set project $PROJECT_ID
! gcloud config set compute/region $REGION

Updated property [core/project].
Updated property [compute/region].


In [7]:
!gcloud auth list

                  Credentialed Accounts
ACTIVE  ACCOUNT
*       378786916136-compute@developer.gserviceaccount.com

To set the active account, run:
    $ gcloud config set account `ACCOUNT`



In [8]:
project_id = 'tiger-mle'
file_uri = 'gs://vertex-ai-bucket-house-price-pred/data_base/housing.csv'

### Reading data from GCP Bucket

In [12]:
@component(packages_to_install=['gcsfs==2022.02.0','pandas==1.1.4','scikit-learn==1.0.1'])
def get_data(project_id: str,file_uri: str,house_dataset: Output[Dataset]):
    import gcsfs
    import pandas as pd
    from sklearn.model_selection import train_test_split as tts
    
    fs = gcsfs.GCSFileSystem()
    f = fs.open(file_uri)
    df = pd.read_csv(f)
    
    """train, test = tts(df, test_size=0.3)
    
    train.to_csv(dataset_train.path)
    test.to_csv(dataset_test.path)"""
    
    df.to_csv(house_dataset.path)
    #return df

### Preprocessing the data

In [17]:
@component(  packages_to_install = [
        "pandas==1.3.4",
        "xgboost==1.5.1",
        "scikit-learn==1.0.1",
    ],
)
def preprocess(
        dataset: Input[Dataset],
        dataset_train: Output[Dataset],
        dataset_test: Output[Dataset]):
    import pandas as pd
    from sklearn.model_selection import train_test_split as tts
    
    housing = pd.read_csv(dataset.path)
    print('ok')
    
    #train_set, test_set = tts(data, test_size=0.2, random_state=42)
    
    
    imputer = SimpleImputer(strategy="median")

    housing_num = housing.drop("ocean_proximity", axis=1)

    imputer.fit(housing_num)
    X = imputer.transform(housing_num)
    
    housing_num["income_cat"] = pd.cut(
        housing_num["median_income"],
        bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
        labels=[1, 2, 3, 4, 5]
    )

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing_num, housing_num["income_cat"]):
        strat_train_set = housing_num.loc[train_index]
        strat_test_set = housing_num.loc[test_index]
        
    strat_train_set.to_csv(dataset_train.path)
    strat_test_set.to_csv(dataset_test.path)

### Building a Pipeline

In [100]:
@dsl.pipeline(
    name="house-price-pred",
    description="House prediction",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(project_id: str = 'tiger-mle',
                 file_uri: str = 'gs://vertex-ai-bucket-house-price-pred/data_base/housing.csv'):
    get_data_op = get_data(project_id,file_uri)
    pipe_preprocess = preprocess(get_data_op.outputs['house_dataset'])

### Complie

In [101]:
from kfp.v2 import compiler  # noqa: F811

compiler.Compiler().compile(pipeline_func=pipeline, package_path="house_pipeline.json")

In [44]:
from datetime import datetime

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### Run the job

In [102]:
DISPLAY_NAME = "intro_" + TIMESTAMP

job = aip.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path="house_pipeline.json",
    pipeline_root=PIPELINE_ROOT,
)

job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/378786916136/locations/us-central1/pipelineJobs/house-price-pred-20220511165419
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/378786916136/locations/us-central1/pipelineJobs/house-price-pred-20220511165419')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/house-price-pred-20220511165419?project=378786916136
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/378786916136/locations/us-central1/pipelineJobs/house-price-pred-20220511165419 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/378786916136/locations/us-central1/pipelineJobs/h