# NOTE:

I couldn't get the **aiplatform** library to work locally: no matter how hard I tried, there were always package dependency issues.

It does, however, work successfully in a Jupyter notebook on Vertex AI Workbench.

In [1]:
# import all the libraries that you need at the top of the notebook
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from google.cloud import aiplatform

In [2]:
# CSV file handed to load_data(); the path is relative to the notebook's
# working directory.
DATASET_LOCATION = './kc_house_data.csv'

In [3]:
def load_data(data_dir):
    """Read the CSV file located at ``data_dir`` into a DataFrame.

    Parsing relies entirely on the ``pd.read_csv`` defaults, so column
    dtypes are whatever pandas infers from the file contents.

    Args:
        data_dir: Path (or any other source accepted by ``pd.read_csv``).

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    return pd.read_csv(data_dir)

In [4]:
def transform_data(df):
    """Prepare the raw housing frame for modelling, modifying ``df`` in place.

    Steps, in order:
      1. drop the ``id`` column;
      2. parse ``date`` into datetimes;
      3. cast ``zipcode`` to string and one-hot encode it, appending one
         indicator column per distinct zipcode (named after the zipcode);
      4. drop the original ``zipcode`` column and the ``lat``/``long`` columns.

    Args:
        df: DataFrame containing at least ``id``, ``date``, ``zipcode``,
            ``lat`` and ``long``. It is mutated by this call.

    Returns:
        The same DataFrame object that was passed in.
    """
    df.drop('id', axis=1, inplace=True)
    df['date'] = pd.to_datetime(df['date'])
    df['zipcode'] = df['zipcode'].astype(str)

    # The encoder is fitted on this frame only, so the set of indicator
    # columns depends on which zipcodes occur in the data passed in.
    encoder = OneHotEncoder()
    zipcode_matrix = encoder.fit_transform(df[['zipcode']]).toarray()
    zipcode_labels = encoder.categories_[0]
    df[zipcode_labels] = zipcode_matrix

    df.drop(columns='zipcode', inplace=True)
    df.drop(columns=['lat', 'long'], inplace=True)

    return df

In [5]:
def remove_outliers(df, max_bedrooms=13, max_price=6_000_000, max_sqft_living=10_000):
    """Return a copy of ``df`` without rows that exceed any outlier limit.

    A row is removed when its ``bedrooms``, ``price`` or ``sqft_living`` value
    is strictly greater than the corresponding limit. The input frame is left
    untouched; callers must use the returned frame.

    Args:
        df: DataFrame with ``bedrooms``, ``price`` and ``sqft_living`` columns.
        max_bedrooms: Largest bedroom count that is kept (default 13).
        max_price: Largest price that is kept (default 6,000,000).
        max_sqft_living: Largest living area that is kept (default 10,000).

    Returns:
        A new DataFrame containing only the rows within all three limits.
    """
    is_outlier = (
        (df['bedrooms'] > max_bedrooms)
        | (df['price'] > max_price)
        | (df['sqft_living'] > max_sqft_living)
    )
    # Drop by index label (as opposed to boolean filtering) so rows sharing a
    # label with an outlier are treated exactly as before.
    outlier_labels = df[is_outlier].index
    return df.drop(index=outlier_labels)

In [6]:
def split_data(df):
    """Split ``df`` into train/test features and targets.

    The target is the ``price`` column; the features are every column except
    ``price`` and ``date``. 30% of the rows go to the test set and the split
    uses a fixed ``random_state`` of 100.

    Args:
        df: DataFrame containing a ``price`` column.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``.
    """
    # Build the column mask from the frame that was passed in, not from a
    # notebook-level global, so the function works for any input frame.
    feature_columns = ~df.columns.isin(['price', 'date'])
    x = df.loc[:, feature_columns]
    y = df['price']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=100)

    return x_train, x_test, y_train, y_test

In [7]:
def _nonzero_scale(std):
    """Return ``std`` with exact zeros replaced by 1.0 (scalar or per-column)."""
    if np.isscalar(std):
        return 1.0 if std == 0 else std
    return std.where(std != 0, 1.0)


def normalize_data(x_train, x_test, y_train, y_test):
    """Standardize features and targets using training-set statistics.

    Means and standard deviations are computed on ``x_train`` / ``y_train``
    only and then applied to both the training and the test data.

    A column whose training standard deviation is exactly zero (for example an
    indicator column that is constant in the training split) is centred but
    not rescaled, instead of being divided by zero and turned into NaN/inf.

    Args:
        x_train: Training features.
        x_test: Test features with the same columns as ``x_train``.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        Tuple ``(x_train_norm, x_test_norm, y_train_norm, y_test_norm)``.
    """
    mean_x = x_train.mean()
    std_x = _nonzero_scale(x_train.std())

    mean_y = y_train.mean()
    std_y = _nonzero_scale(y_train.std())

    x_train_norm = (x_train - mean_x) / std_x
    x_test_norm = (x_test - mean_x) / std_x

    y_train_norm = (y_train - mean_y) / std_y
    y_test_norm = (y_test - mean_y) / std_y

    return x_train_norm, x_test_norm, y_train_norm, y_test_norm

In [8]:
# Load the CSV and run the preprocessing helpers.
dataframe = load_data(DATASET_LOCATION)
# transform_data mutates `dataframe` in place, so its return value is not needed.
transform_data(dataframe)
# NOTE(review): remove_outliers filters a copy and returns it; the result is
# not assigned here, so `dataframe` still contains every row. Assigning it
# would change which rows land in x_test (and possibly whether the hardcoded
# index label used for the sample prediction is still present) - confirm
# before changing.
remove_outliers(dataframe)

# Split into train/test sets; x_test is the source of sample rows for the
# endpoint check in the following cells.
x_train, x_test, y_train, y_test = split_data(dataframe)

# Handle to the already-deployed Vertex AI endpoint, addressed by its full
# resource name (project / location / endpoint id); used to test that the
# endpoint works.
ensemble_model_endpoint = aiplatform.Endpoint(
    endpoint_name="projects/367901149883/locations/europe-west1/endpoints/8529870470797852672")

In [9]:
# Print the held-out features. The left-most column holds the index labels,
# which can be passed to x_test.loc[...] to pick a row for a test prediction.
print(x_test)

       bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  view  \
19836         3       2.50         2437      5136     2.0           0     0   
10442         3       2.50         1560      4800     2.0           0     0   
20548         3       2.50         2390     47480     2.0           0     0   
11014         3       1.00         1480      5100     1.5           0     0   
4138          4       3.50         1880      1341     3.0           0     0   
...         ...        ...          ...       ...     ...         ...   ...   
5828          4       2.50         2811      7251     2.0           0     0   
9478          3       3.00         1910      4800     1.5           0     0   
13086         4       2.25         2450     11960     1.0           0     0   
19162         3       2.25         1453      2225     2.0           0     0   
11535         3       3.25         4560     13363     1.0           0     4   

       condition  grade  sqft_above  ...  98146  98

In [10]:
# Pick one held-out row by its index label and shape it as a single-instance
# batch (a list holding one feature list) for the endpoint request.
sample_features = x_test.loc[4138].to_numpy()
values_for_prediction = sample_features.reshape(1, -1).tolist()
print(values_for_prediction)

[[4.0, 3.5, 1880.0, 1341.0, 3.0, 0.0, 0.0, 3.0, 8.0, 1650.0, 230.0, 2007.0, 0.0, 1740.0, 1883.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]


In [11]:
# Send the single-row batch to the deployed endpoint and keep only the
# `predictions` field of the response.
endpoint_prediction = ensemble_model_endpoint.predict(instances=values_for_prediction).predictions

In [12]:
# Show the predictions returned by the endpoint for the submitted row.
print(f'Endpoint prediction: {endpoint_prediction}')

Endpoint prediction: [554899.7622373143]
