## Vectice Configuration

In [None]:
#Install Vectice Python library 
# In this notebook we will do code versioning using github, we also support gitlab
# and bitbucket: !pip install -q "vectice[github, gitlab, bitbucket]"
!pip install --q vectice[github]==22.3.5.1

In [None]:
#Verify if Vectice python library was installed
!pip3 show vectice

In [3]:
# Load the JSON file that holds the Vectice API token (read below via DSP_JDN_key['key']).
import json

# The context manager closes the file handle as soon as the JSON has been parsed.
with open('DSP_JDN.json') as f:
    DSP_JDN_key = json.load(f)

In [4]:
# Standard-library modules used for configuration and logging
import logging
import os

# Vectice client classes and the enums used throughout this notebook
from vectice import Experiment, Vectice
from vectice.api.json import (
    JobArtifactType,
    JobType,
    ModelType,
    ModelVersionStatus,
    VersionStrategy,
)

# Show INFO-level messages so the Vectice client's progress is visible in the cell outputs
logging.basicConfig(level=logging.INFO)

# API endpoint for Vectice.
# It can be set here in the notebook, but keeping it in a .env file is recommended.
os.environ['VECTICE_API_ENDPOINT'] = "app.vectice.com"

# The Vectice Python library authenticates with an API token.
# Generate one in the Vectice UI under "My Profile" > "API Tokens" (reached from your
# profile picture). The token is taken from the key file loaded earlier; keeping it in
# a .env file is recommended over writing it in the notebook.
os.environ['VECTICE_API_TOKEN'] = DSP_JDN_key['key']

# Project and workspace identifiers.
# The project id can be found on the project settings page in the Vectice UI.
project_id = 4734
workspace_id = 1300

In [5]:
# Authenticate to Vectice and connect to the workspace and project identified by
# workspace_id and project_id.
# NOTE(review): no token is passed here; presumably the client picks up the
# VECTICE_API_ENDPOINT and VECTICE_API_TOKEN environment variables — confirm.
vectice = Vectice(workspace_id, project_id)

INFO:Client:Successfully authenticated. You'll be working on Project: Predicting house prices in King County, Washington part of Workspace: .jnorman


## Setup environment for ML

In [6]:
import os
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


In [7]:
import string
import pickle
from math import sqrt

# Load scikit-learn packages
from sklearn.model_selection import train_test_split  # Model Selection
from sklearn.metrics import mean_absolute_error, mean_squared_error  # Model Evaluation
from sklearn.linear_model import LinearRegression  # Linear Regression
from sklearn.tree import DecisionTreeRegressor, plot_tree  # Decision Tree Regression
from sklearn.ensemble import RandomForestRegressor  # Random Forest Regression

## Load Data and Take a Peek

In [8]:
# Load the dataset from S3 into a pandas DataFrame so it can be inspected and split.
# NOTE(review): the object key ends with a trailing '/', and a later cell writes the
# training split to the same key without the slash (train_cleaned_kc_house_data.csv).
# Confirm that this is the intended source file and that re-running the notebook does
# not read back a previously written split.
df = pd.read_csv('s3://sagemaker-us-east-1-062143896379/data/train_cleaned_kc_house_data.csv/')

In [9]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,510000.0,4,3.0,3610,18948,2.0,0,0,3,10,3610,0,1993,0,98023,47.2911,-122.342,3568,18948
1,555000.0,3,2.0,2080,7020,1.0,0,0,4,7,1040,1040,1951,0,98115,47.6768,-122.285,1920,7000
2,469500.0,4,2.5,2090,7241,1.0,0,0,4,7,1140,950,2001,0,98034,47.726,-122.221,1510,7402
3,530000.0,3,2.5,3150,21893,2.0,0,0,3,9,3150,0,2006,0,98014,47.6455,-121.901,2280,21886
4,525000.0,3,2.0,1540,7800,1.0,0,0,3,8,1540,0,2004,0,98125,47.7041,-122.288,1510,7800


In [13]:
# Register a manually named version of the full dataset in Vectice.
# The call is the cell's last expression, so the returned object is displayed below.
vectice.create_dataset_version(
    dataset="cleaned_kc_house_data",
    name="Version 4",
    version_strategy=VersionStrategy.MANUAL,
)

INFO:Dataset:DatasetVersion with id: 13127 successfully created in Dataset cleaned_kc_house_data.


DatasetVersion(dataset=Dataset(name=cleaned_kc_house_data, id=7768, description=, connection=None, resources=None), id=13127, description=None, is_starred=False, auto_version=True, name=Version 4, properties=None, version=None)

## Do a train/test split and capture it all with Vectice!

### 1. Create a Vectice job for the data prep stage

In [14]:
# Create the first experiment, for the data-preparation stage, in the project we are working on.
# An experiment contains exactly one job; each invocation of that job is called a run.
# auto_code=True tracks the git changes to your code automatically every time a run is executed (see below).
experiment = Experiment(
    job="Data Preparation",
    project=project_id,
    job_type=JobType.PREPARATION,
    auto_code=True,
)

INFO:Client:Successfully authenticated. You'll be working on Project: Predicting house prices in King County, Washington part of Workspace: .jnorman
INFO:Project:Job with id: 10766 successfully retrieved.


### 2. Wrap the current dataset, the changes, and the code (this notebook) into an experiment run

In [15]:
# The Vectice library automatically detects whether the dataset in use has changed and,
# if so, generates a new dataset version. For this tutorial the data was changed for
# demonstration purposes, so a new dataset version is created the first time this runs.
experiment.add_dataset_version(dataset="cleaned_kc_house_data", version_strategy=VersionStrategy.AUTOMATIC)

# Create a code checkpoint for this version of the notebook
input_code = experiment.add_code_version_uri(git_uri="https://github.com/stbiadmin/vectice-examples",
                                             entrypoint="demo/demo_dataprep.ipynb")

# The created dataset version and code version will be automatically attached as inputs of the run
experiment.start(run_properties={"Property1": "Test 1", "property2": "Test 2"})

# Split ratios: 80/20 train/test, with a validation set carved out of the training portion
train_ratio = 0.80
test_ratio = 0.20
valid_ratio = 0.20

# Fixed seed, passed to every split below so the same partitions are generated on each run
random_state = 42

# Row-level split of the full frame (features and target together)
train, test = train_test_split(df, test_size=test_ratio, random_state=random_state)

# Generate X_train, X_test, y_train, y_test, which we will need for modeling.
# Same test_size and seed as the row-level split above.
X = df.drop("price", axis=1)
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=random_state)

# Hold out a validation set from the training portion. The seed is passed here as well;
# without it this split would differ on every run.
# NOTE(review): valid_ratio / (train_ratio + test_ratio) evaluates to 0.20, i.e. 20% of the
# training portion (16% of all rows) — confirm whether 20% of all rows was intended.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=valid_ratio / (train_ratio + test_ratio),
    random_state=random_state,
)

# Serialize all splits and save them locally for later use.
# The target directory is created first so this also works on a fresh checkout.
serial_dir = 'data/serial'
os.makedirs(serial_dir, exist_ok=True)

splits_to_save = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
    'X_valid': X_valid,
    'y_valid': y_valid,
}
for split_name, split_data in splits_to_save.items():
    with open(f'{serial_dir}/{split_name}.pkl', 'wb') as file:
        pickle.dump(split_data, file)

# We create new dataset versions
train_ds_version = experiment.add_dataset_version(dataset="train_cleaned_kc_house_data", version_strategy=VersionStrategy.MANUAL)
test_ds_version = experiment.add_dataset_version(dataset="test_cleaned_kc_house_data", version_strategy=VersionStrategy.MANUAL)

# We complete the current experiment's run
## The created dataset versions will be automatically attached as outputs of the run
experiment.complete()

INFO:Project:CodeVersion with id: 2862 successfully created in Code vectice-examples.
INFO:RunApi:Properties with names: ['Property1', 'property2'] successfully added to Run Run 2022-08-23T17:40:19.072279.
INFO:Job:Run with id: 19576 successfully created.
INFO:Dataset:DatasetVersion with id: 13128 successfully created in Dataset train_cleaned_kc_house_data.
INFO:Dataset:DatasetVersion with id: 13129 successfully created in Dataset test_cleaned_kc_house_data.


In [None]:
# TODO: log the train/validation/test split variables created above to Vectice as new ML dataset split assets.

In [None]:
# Preview one of the generated splits to check that the preparation step ran as expected.
X_train.head()

### 3. Now, let's save the new training and testing datasets to CSV and upload to S3 for posterity

In [16]:
# Save the training split to CSV on S3; index=False leaves the DataFrame index out of the file.
# NOTE(review): apart from a trailing '/', this key matches the file loaded at the top of the
# notebook — confirm the source data is not being overwritten by this write.
train.to_csv('s3://sagemaker-us-east-1-062143896379/data/train_cleaned_kc_house_data.csv',index = False)

In [17]:
# Save the testing split to CSV on S3; index=False leaves the DataFrame index out of the file.
test.to_csv('s3://sagemaker-us-east-1-062143896379/data/test_cleaned_kc_house_data.csv',index = False)

In [18]:
# Check that an uploaded dataset can be read back from S3 later.
# NOTE(review): this reads the *training* CSV even though the variable is named df_test —
# confirm which file was meant to be checked.
df_test = pd.read_csv('s3://sagemaker-us-east-1-062143896379/data/train_cleaned_kc_house_data.csv')
df_test.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,510000.0,4,3.0,3610,18948,2.0,0,0,3,10,3610,0,1993,0,98023,47.2911,-122.342,3568,18948
1,555000.0,3,2.0,2080,7020,1.0,0,0,4,7,1040,1040,1951,0,98115,47.6768,-122.285,1920,7000
2,469500.0,4,2.5,2090,7241,1.0,0,0,4,7,1140,950,2001,0,98034,47.726,-122.221,1510,7402
3,530000.0,3,2.5,3150,21893,2.0,0,0,3,9,3150,0,2006,0,98014,47.6455,-121.901,2280,21886
4,525000.0,3,2.0,1540,7800,1.0,0,0,3,8,1540,0,2004,0,98125,47.7041,-122.288,1510,7800


### 4. That's it! Keep on working, or come back to the project later.