## Vectice Configuration

In [7]:
#Install Vectice Python library 
# In this notebook we will do code versioning using github, we also support gitlab
# and bitbucket: !pip install -q "vectice[github, gitlab, bitbucket]"
!pip install --q vectice[github]==22.3.5.1

[0m

In [8]:
#Verify if Vectice python library was installed
!pip3 show vectice

Name: vectice
Version: 22.3.5.1
Summary: Vectice Python library
Home-page: https://www.vectice.com
Author: Vectice Inc.
Author-email: sdk@vectice.com
License: Apache License 2.0
Location: /opt/conda/lib/python3.7/site-packages
Requires: python-dotenv, requests, urllib3
Required-by: 


In [9]:
import json

# Load the JSON document that holds the Vectice API token (read later through its 'key' entry).
# The context manager closes the file handle as soon as the content has been parsed.
with open('DSP_JDN.json') as f:
    DSP_JDN_key = json.load(f)

In [10]:
# Import the required packages
import logging
import os

from vectice import Experiment
from vectice.api.json import (
    JobArtifactType,
    JobType,
    ModelType,
    ModelVersionStatus,
    VersionStrategy,
)

logging.basicConfig(level=logging.INFO)

# Vectice API endpoint.
# It can be set here in the notebook, but keeping it in a .env file is recommended.
os.environ["VECTICE_API_ENDPOINT"] = "app.vectice.com"

# The Vectice Python library authenticates your account with an API token.
# Generate one in the Vectice UI: open "My Profile" (under your profile picture)
# and go to the "API Tokens" section.
# The token can be set here in the notebook, but keeping it in a .env file is recommended.
os.environ["VECTICE_API_TOKEN"] = DSP_JDN_key["key"]

# Your project id, found on the project settings page in the Vectice UI.
project_id = 4734

## Set up the environment for ML

In [None]:
import os
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [12]:
import string
from math import sqrt

# Load scikit-learn packages
from sklearn.model_selection import train_test_split  # Model Selection
from sklearn.metrics import mean_absolute_error, mean_squared_error  # Model Evaluation
from sklearn.linear_model import LinearRegression  # Linear Regression
from sklearn.tree import DecisionTreeRegressor, plot_tree  # Decision Tree Regression
from sklearn.ensemble import RandomForestRegressor  # Random Forest Regression

## Load Data and Take a Peek

In [13]:
# Read the house-price CSV into a pandas DataFrame so the dataset can be inspected.
df = pd.read_csv(filepath_or_buffer="data/kc_house_data.csv")

In [14]:
# Peek at the first five rows of the loaded DataFrame.
df.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


## Do a train/test split and capture it all with Vectice!

### 1. Create a Vectice job for the data prep stage

In [15]:
# Create the first experiment, for data preparation, and bind it to the project we are working on.
# Each experiment contains exactly one job; each invocation of that job is called a run.
# auto_code=True tracks the git changes to your code automatically every time a run is executed (see below).
experiment = Experiment(
    job="Data Preparation",
    project=project_id,
    job_type=JobType.PREPARATION,
    auto_code=True,
)

INFO:Client:Successfully authenticated. You'll be working on Project: Predicting house prices in King County, Washington part of Workspace: Data Science
INFO:Project:Job with id: 10766 successfully retrieved.


### 2. Wrap the current dataset, the changes, and the code (this notebook) into an experiment run

In [16]:
# The Vectice library detects changes to the dataset in use and, when it finds any,
# generates a new dataset version automatically. The data was changed for this tutorial,
# so the first execution of this cell creates a new dataset version.
experiment.add_dataset_version(
    dataset="cleaned_kc_house_data",
    version_strategy=VersionStrategy.AUTOMATIC,
)

# Checkpoint the code for this version of the notebook.
input_code = experiment.add_code_version_uri(
    git_uri="https://github.com/stbiadmin/vectice-examples",
    entrypoint="demo/demo_dataprep.ipynb",
)

# Start the run. The dataset version and code version created above are
# automatically attached as its inputs.
experiment.start(run_properties={"Property1": "Value 1", "property2": "Value 2"})

# Hold out 20% of the rows for testing (an 80/20 split).
test_size = 0.2

# Fixed random seed so the same split is generated on every execution.
random_state = 42

train, test = train_test_split(df, test_size=test_size, random_state=random_state)

# Build X_train, X_test, y_train and y_test for modeling:
# "price" is the target and every other column is a feature.
X = df.drop(columns="price").values
y = df["price"].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

# Create new dataset versions for the train and test splits.
train_ds_version = experiment.add_dataset_version(
    dataset="train_cleaned_kc_house_data",
    version_strategy=VersionStrategy.AUTOMATIC,
)
test_ds_version = experiment.add_dataset_version(
    dataset="test_cleaned_kc_house_data",
    version_strategy=VersionStrategy.AUTOMATIC,
)

# Complete the current run. The dataset versions created after the run started
# are automatically attached as its outputs.
experiment.complete()

INFO:Project:CodeVersion with id: 2786 successfully created in Code vectice-examples.
INFO:RunApi:Properties with names: ['Property1', 'property2'] successfully added to Run Run 2022-08-09T17:42:43.866956.
INFO:Job:Run with id: 18872 successfully created.


In [28]:
# Sanity check: preview one of the generated outputs (the training feature array
# produced by the train/test split) to confirm that everything executed properly.
X_train

array([[5467910190, '20140527T000000', 3, ..., -122.152, 2750, 13095],
       [9331800580, '20150310T000000', 2, ..., -122.29, 1270, 5000],
       [2407000405, '20150226T000000', 3, ..., -122.335, 1170, 7800],
       ...,
       [7202350480, '20140930T000000', 3, ..., -122.032, 1690, 2650],
       [1723049033, '20140620T000000', 1, ..., -122.323, 1170, 15000],
       [6147650280, '20150325T000000', 4, ..., -122.099, 3020, 5997]],
      dtype=object)

### 3. That's it! Keep on working.