# TruEra Python SDK - Retrieve predictions & feature influences from existing project

NOTE:
1. Use this notebook to generate and persist predictions and feature influences from an existing project.
2. This is a prerequisite for full 'virtual' model ingestion.


In [76]:
import os
import pickle
import random

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from truera.client.ingestion import ColumnSpec, ModelOutputContext
from truera.client.truera_authentication import BasicAuthentication
from truera.client.truera_authentication import TokenAuthentication
from truera.client.truera_workspace import TrueraWorkspace

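# TruEra SDK
## Connect to an Existing Project
A project is a collection of models and datasets solving a single problem statement.
Users can be provided access to collaborate on a project.
This notebook does not create a project; it connects to the workspace and selects one that already exists.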

In [77]:
# Connection details.
# The API token is a credential and must never be saved inside the notebook:
# export TRUERA_AUTH_TOKEN in the environment before starting the kernel.
TRUERA_URL = "https://app.truera.net"
AUTH_TOKEN = os.environ.get("TRUERA_AUTH_TOKEN", "")
if not AUTH_TOKEN:
    # Surface the problem here rather than as an opaque authentication error later on.
    print("TRUERA_AUTH_TOKEN is not set; authentication in the next cell will fail.")

In [78]:
# Authenticate with AUTH_TOKEN and open a workspace against TRUERA_URL.
# The recorded output below shows this cell failing with "The provided token has expired",
# so a fresh token is needed before the notebook can be re-run.
auth = TokenAuthentication(AUTH_TOKEN)
# NOTE(review): ignore_version_mismatch=True presumably skips the client/server version check — confirm this is intended.
tru = TrueraWorkspace(TRUERA_URL, auth, ignore_version_mismatch=True)

INFO:truera.client.remote_truera_workspace:Connecting to 'https://app.truera.net'


RuntimeError: The provided token has expired.

In [6]:
# List the project names available in this workspace (see the recorded output below).
tru.get_projects()

['Credit-Risk-Monitoring-Demo',
 'House-Price-Monitoring-Demo',
 'Sales Forecasting',
 'Sales Forecasting v1-1',
 'Sales Forecasting v1-2',
 'Sales Forecasting v1-4',
 'Sales Forecasting - Virtual']

In [8]:
# Project to pull predictions and feature influences from.
# NOTE(review): the workspace repr displayed later in this notebook reports project
# "Sales Forecasting v1-4", so some saved outputs may come from a different run — confirm which project is intended.
project_name = "Sales Forecasting"

In [9]:
# Make project_name the active project for all subsequent workspace calls.
tru.set_project(project_name)

## Retrieve Feature Influences

In [70]:
# Select the Ridge Regression model and the "training data" split as the workspace context.
tru.set_model("Ridge Regression")
tru.set_data_split("training data")

In [67]:
# Fetch the ys (presumably the ground-truth labels — confirm) for the currently selected split.
lr_ys = tru.get_ys()

In [68]:
# Number of rows returned for the training split (9649 in the recorded output).
len(lr_ys)

9649

In [71]:
# NOTE(review): stop=9650 is a magic number just above the 9649 rows reported by len(lr_ys);
# presumably it is meant to cover the whole training split — confirm how `stop` is bounded.
lr_train_feat_infs = tru.compute_feature_influences(stop=9650)
# Note: predictions are needed in order to generate feature influences.
# In other words, (some) predictions are generated as part of this call.

INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmpyl0irmr0
INFO:truera.client.truera_workspace:Syncing data collection "OJ Sales Data" to local.
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.


|          | 0.000% [00:00<?]

Uploading tmpfxy8otz4.parquet (1.5MiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: 01a76346-72f5-4c04-8411-aea12c701d3f finished with status: SUCCEEDED.


In [72]:
# Persist the Ridge training-split feature influences (the DataFrame index is written too, the to_csv default).
lr_train_feat_infs.to_csv('lr_train_FIs.csv')

In [16]:
# Switch the workspace context to the Ridge Regression model on the "validation data" split.
tru.set_model("Ridge Regression")
tru.set_data_split("validation data")

In [17]:
# NOTE(review): the training split uses compute_feature_influences(stop=9650) whereas this cell calls
# get_feature_influences() with no bounds — confirm that both return influences for the full split.
lr_val_feat_infs = tru.get_feature_influences()

In [None]:
# Persist the Ridge validation-split feature influences to CSV.
# NOTE(review): this cell has no execution count in the saved notebook, so the file may not have been written.
lr_val_feat_infs.to_csv('lr_val_FIs.csv')

In [73]:
# Switch the workspace context to the Random Forest Regressor model on the "training data" split.
tru.set_model("Random Forest Regressor")
tru.set_data_split("training data")

INFO:truera.client.remote_truera_workspace:Setting model context to "Random Forest Regressor".


In [74]:
# Compute feature influences for the Random Forest on the training split.
# NOTE(review): stop=9650 is the same magic number used for the Ridge model — confirm it covers this split.
rf_train_feat_infs = tru.compute_feature_influences(stop=9650)

INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmpyl0irmr0
INFO:truera.client.truera_workspace:Syncing data collection "OJ Sales Data" to local.
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.


|          | 0.000% [00:00<?]

Uploading tmpfit3qfaw.parquet (1.5MiB) -- ### -- file upload complete.
Put resource done.


INFO:truera.client.remote_truera_workspace:Waiting for data split to materialize...
INFO:truera.client.remote_truera_workspace:Materialize operation id: 298ba5b8-ab20-477d-8d6f-da0b40b3bb45 finished with status: SUCCEEDED.


In [75]:
# Persist the Random Forest training-split feature influences to CSV.
rf_train_feat_infs.to_csv('rf_train_FIs.csv')

In [20]:
# Switch the workspace context to the Random Forest Regressor model on the "validation data" split.
tru.set_model("Random Forest Regressor")
tru.set_data_split("validation data")

In [21]:
# Retrieve the feature influences for the Random Forest on the validation split.
rf_val_feat_infs = tru.get_feature_influences()

In [22]:
# Persist the Random Forest validation-split feature influences to CSV.
rf_val_feat_infs.to_csv('rf_val_FIs.csv')

## Retrieve predictions

In [23]:
# Back to the Ridge Regression model on the "training data" split, this time to fetch predictions.
tru.set_model("Ridge Regression")
tru.set_data_split("training data")

INFO:truera.client.remote_truera_workspace:Setting model context to "Ridge Regression".


In [24]:
# Retrieve the Ridge Regression predictions for the training split.
lr_train_preds = tru.get_ys_pred()
# The note previously here was copied from the feature-influence cell and did not describe this call.
# The log output below shows the model being downloaded while this call runs.

INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmpyl0irmr0
INFO:truera.client.truera_workspace:Syncing data collection "OJ Sales Data" to local.
INFO:truera.client.local.local_truera_workspace:Data collection in local environment is now set to "OJ Sales Data". 
INFO:truera.client.truera_workspace:Syncing data split "training data" to local.
INFO:truera.client.local.local_truera_workspace:Data split "training data" is added to local data collection "OJ Sales Data", and set as the data split for the workspace context.
INFO:truera.client.truera_workspace:Downloading model Ridge Regression...
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.
INFO:truera.client.local.local_truera_workspace:The previous data collection ("OJ Sales Data") and its associated data splits and/or models have been cleared from the local environment workspace context.
INFO:truera.client.local.local_truera_workspace:Data collecti

In [26]:
# Ridge Regression on the "validation data" split.
tru.set_model("Ridge Regression")
tru.set_data_split("validation data")

In [27]:
# Retrieve the Ridge Regression predictions for the validation split.
lr_val_preds = tru.get_ys_pred()

INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmpyl0irmr0
INFO:truera.client.truera_workspace:Syncing data collection "OJ Sales Data" to local.
INFO:truera.client.truera_workspace:Syncing data split "validation data" to local.
INFO:truera.client.local.local_truera_workspace:Data split "validation data" is added to local data collection "OJ Sales Data", and set as the data split for the workspace context.
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.


In [28]:
# Random Forest Regressor on the "training data" split.
tru.set_model("Random Forest Regressor")
tru.set_data_split("training data")

INFO:truera.client.remote_truera_workspace:Setting model context to "Random Forest Regressor".


In [29]:
# Retrieve the Random Forest predictions for the training split.
rf_train_preds = tru.get_ys_pred()

INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmpyl0irmr0
INFO:truera.client.truera_workspace:Syncing data collection "OJ Sales Data" to local.
INFO:truera.client.truera_workspace:Downloading model Random Forest Regressor...
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [31]:
# Random Forest Regressor on the "validation data" split.
tru.set_model("Random Forest Regressor")
tru.set_data_split("validation data")

In [32]:
# Retrieve the Random Forest predictions for the validation split.
rf_val_preds = tru.get_ys_pred()

INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmpyl0irmr0
INFO:truera.client.truera_workspace:Syncing data collection "OJ Sales Data" to local.
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [37]:
# Persist the four prediction frames (two models x two splits); each file name mirrors its variable name.
# The DataFrame index is written as the first CSV column (the to_csv default).
lr_train_preds.to_csv('lr_train_preds.csv')
lr_val_preds.to_csv('lr_val_preds.csv')
rf_train_preds.to_csv('rf_train_preds.csv')
rf_val_preds.to_csv('rf_val_preds.csv')

# Monitoring: Production Data
1. Generate predictions for the production data (done below with the locally pickled models `linreg.pkl` and `rf.pkl`).
 - TODO: it is currently not clear how to retrieve predictions (or feature influences) from a prod split that already exists in the project.
2. Generate feature influences from the existing project using the explainer's `.compute_feature_influences_for_data`.
3. Persist 1 & 2 for use elsewhere.

In [38]:
import glob
import os

In [39]:
from datetime import datetime

In [42]:
from truera.client.ingestion.util import merge_dataframes_and_create_column_spec

In [43]:
def _concat_csvs(pattern):
    """Read every CSV matching ``pattern`` (sorted by file name) into one frame.

    Each file is read with its first column as the index, which ``reset_index``
    then turns back into a regular column before the frames are stacked.

    Raises:
        FileNotFoundError: if no file matches ``pattern``.
    """
    paths = sorted(glob.glob(pattern))
    if not paths:
        # pd.concat on an empty sequence only reports "No objects to concatenate".
        raise FileNotFoundError(f"No files match {pattern!r}")
    return pd.concat(
        (pd.read_csv(path, index_col=0).reset_index() for path in paths),
        ignore_index=True,
    )


def load_prod_data(start, end, data_dir='./split_sim'):
    """Load the simulated production data and keep the rows dated within [start, end].

    Args:
        start: first day to keep, as a 'YYYY-MM-DD' string (inclusive).
        end: last day to keep, as a 'YYYY-MM-DD' string (inclusive).
        data_dir: directory holding the ``pre_split_*.csv``, ``post_split_*.csv``
            and ``label_*.csv`` files. Defaults to the original './split_sim'.

    Returns:
        Tuple ``(prod_data_df, column_spec)``: the merged frame filtered to the
        date window (its 'datetime' column holds ``datetime.date`` values), and
        the column spec returned by ``merge_dataframes_and_create_column_spec``.

    Raises:
        ValueError: if a date string is malformed or ``start`` is after ``end``.
        FileNotFoundError: if one of the three file groups is missing from ``data_dir``.
    """
    start = datetime.strptime(start, '%Y-%m-%d').date()
    end = datetime.strptime(end, '%Y-%m-%d').date()
    if start > end:
        raise ValueError(f"start ({start}) must not be after end ({end})")

    X_prod = _concat_csvs(os.path.join(data_dir, 'pre_split_*.csv'))
    X_prod_post = _concat_csvs(os.path.join(data_dir, 'post_split_*.csv'))
    y_prod = _concat_csvs(os.path.join(data_dir, 'label_*.csv'))

    prod_data_df, column_spec = merge_dataframes_and_create_column_spec(id_col_name='index',
                                                                        timestamp_col_name='datetime',
                                                                        pre_data=X_prod,
                                                                        post_data=X_prod_post,
                                                                        labels=y_prod)

    # Compare on calendar dates so that both window bounds are inclusive.
    prod_data_df['datetime'] = pd.to_datetime(prod_data_df['datetime']).dt.date
    in_window = (prod_data_df['datetime'] >= start) & (prod_data_df['datetime'] <= end)
    prod_data_df = prod_data_df[in_window]

    # Summary of what was loaded, kept from the original cell.
    print(prod_data_df.datetime.min())
    print(prod_data_df.datetime.max())
    print(prod_data_df.shape)
    print(column_spec)
    return prod_data_df, column_spec

In [58]:
# Load production rows dated 2023-08-24 through 2023-09-20, both inclusive (6666 rows in the recorded output).
prod_data_df, column_spec = load_prod_data('2023-08-24', '2023-09-20')

<class 'datetime.date'>
2023-08-24
2023-09-20
(6666, 22)
ColumnSpec(id_col_name='index', ranking_item_id_column_name=None, ranking_group_id_column_name=None, timestamp_col_name='datetime', tags_col_name=None, extra_data_col_names=[], pre_data_col_names=['store', 'brand', 'week', 'feat', 'price', 'AGE60', 'EDUC', 'ETHNIC', 'INCOME', 'HHLARGE', 'WORKWOM', 'HVAL150', 'SSTRDIST', 'SSTRVOL', 'CPDIST5', 'CPWVOL5'], post_data_col_names=['store', 'week', 'feat', 'price', 'AGE60', 'EDUC', 'ETHNIC', 'INCOME', 'HHLARGE', 'WORKWOM', 'HVAL150', 'SSTRDIST', 'SSTRVOL', 'CPDIST5', 'CPWVOL5', 'brand_dominicks', 'brand_minute.maid', 'brand_tropicana'], prediction_col_names=[], label_col_names=['logmove'], feature_influence_col_names=[])


In [59]:
def generate_prod_preds(model, data, feature_cols=None):
    """Score ``data`` with ``model`` and return the predictions keyed by id and timestamp.

    Args:
        model: fitted estimator exposing ``predict``.
        data: DataFrame with an 'index' column, a 'datetime' column and the
            model's input features.
        feature_cols: names of the columns passed to ``model.predict``. When
            omitted, falls back to ``column_spec.post_data_col_names`` from the
            notebook-level ``column_spec`` (the original behaviour).

    Returns:
        DataFrame with columns 'index', 'datetime' and 'preds', one row per row of ``data``.
    """
    if feature_cols is None:
        # Backward-compatible fallback on the notebook's global column spec.
        feature_cols = column_spec.post_data_col_names

    # Drop every non-feature column; the surviving columns keep the order they have in ``data``.
    model_inputs = data.drop(columns=data.columns.difference(feature_cols))
    preds = model.predict(model_inputs)

    # Index by (id, timestamp), then flatten so both become ordinary columns.
    preds_df = pd.DataFrame(preds, columns=['preds'], index=[data['index'], data['datetime']])
    preds_df = preds_df.reset_index()
    print(preds_df.shape)

    return preds_df

In [60]:
# Load the pickled linear model. NOTE: pickle.load can execute arbitrary code,
# so only load model files that come from a trusted source.
with open("linreg.pkl", 'rb') as _model_file:  # the context manager closes the handle the bare open() leaked
    lin_reg = pickle.load(_model_file)

In [61]:
# Load the pickled random-forest model. NOTE: pickle.load can execute arbitrary code,
# so only load model files that come from a trusted source.
with open("rf.pkl", 'rb') as _model_file:  # the context manager closes the handle the bare open() leaked
    random_forest = pickle.load(_model_file)

In [62]:
# Score the filtered production rows with the pickled linear model.
lr_prod_preds = generate_prod_preds(lin_reg, prod_data_df)

(6666, 3)


In [63]:
# Score the filtered production rows with the pickled random-forest model.
rf_prod_preds = generate_prod_preds(random_forest, prod_data_df)

(6666, 3)


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [64]:
# Persist the production predictions. index=False because generate_prod_preds already
# returns the record id and timestamp as ordinary columns.
lr_prod_preds.to_csv('lr_prod_preds.csv',index=False)
rf_prod_preds.to_csv('rf_prod_preds.csv',index=False)

## Production Feature Influences

In [None]:
# List the data collections in the current project (two in the recorded output).
tru.get_data_collections()

['OJ Sales Data', 'OJ Sales Data RF']

In [None]:
# Select the 'OJ Sales Data' collection and the Ridge Regression model.
# Per the log output below, changing the data collection clears the previously selected splits/models.
tru.set_data_collection('OJ Sales Data')
tru.set_model('Ridge Regression')

INFO:truera.client.remote_truera_workspace:Data collection in remote environment is now set to "OJ Sales Data". The previous data collection ("OJ Sales Data RF") and its associated data splits and/or models have been cleared from the remote environment workspace context.
INFO:truera.client.remote_truera_workspace:Setting model context to "Ridge Regression".


In [None]:
# Obtain an explainer for the current workspace context (Ridge Regression on 'OJ Sales Data').
LR_explainer = tru.get_explainer()

INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmpq7sccvi1
INFO:truera.client.local.local_truera_workspace:Data collection in local environment is now set to "OJ Sales Data". The previous data collection ("OJ Sales Data RF") and its associated data splits and/or models have been cleared from the local environment workspace context.
INFO:truera.client.local.local_truera_workspace:Data collection in local environment is now set to "OJ Sales Data RF". The previous data collection ("OJ Sales Data") and its associated data splits and/or models have been cleared from the local environment workspace context.
INFO:truera.client.truera_workspace:Syncing data collection "OJ Sales Data" to local.
INFO:truera.client.local.local_truera_workspace:Data collection in local environment is now set to "OJ Sales Data". The previous data collection ("OJ Sales Data RF") and its associated data splits and/or models have been cleared from the local envi

In [723]:
# Compute Ridge feature influences for the filtered production rows, passing the
# pre-data, post-data and label columns named by column_spec.
LR_prod_FIs = LR_explainer.compute_feature_influences_for_data(pre_data = prod_data_df[column_spec.pre_data_col_names], 
                                                         post_data = prod_data_df[column_spec.post_data_col_names], 
                                                         ys = prod_data_df[column_spec.label_col_names])

|          | 0.000% [00:00<?]

In [724]:
# Attach the record id to every influence row. Assigning a Series aligns on the row index,
# so this relies on LR_prod_FIs sharing prod_data_df's index.
LR_prod_FIs['index'] = prod_data_df[column_spec.id_col_name]

In [725]:
# Display the Ridge production feature influences with the id column attached.
LR_prod_FIs

Unnamed: 0,store,brand,week,feat,price,AGE60,EDUC,ETHNIC,INCOME,HHLARGE,WORKWOM,HVAL150,SSTRDIST,SSTRVOL,CPDIST5,CPWVOL5,index
731,0.055982,0.817927,0.011931,-0.261526,-0.696450,-0.195724,0.008100,-0.028564,-0.037631,-0.021229,0.009966,-0.005971,-0.033714,-0.029532,0.069651,-0.106460,10380
732,0.023285,0.757573,0.011852,-0.260281,-0.682205,-0.083507,0.012644,-0.070437,-0.033998,-0.000199,0.016401,0.047723,-0.155344,0.007606,-0.047751,0.123172,10381
733,0.003053,-0.790970,0.012053,0.955194,1.623671,-0.113433,-0.014814,-0.075228,-0.054493,-0.023903,-0.004917,0.091338,0.092841,-0.017299,-0.033029,-0.091317,10382
734,-0.071289,-0.806746,0.011785,0.995045,1.654367,0.010930,0.011585,0.126703,0.212821,0.029623,0.011177,0.025799,-0.097722,0.005159,-0.080015,0.179821,10383
735,0.058566,0.026001,0.011533,-0.297642,0.437421,0.086718,0.123247,-0.028178,-0.124800,0.011412,-0.011077,0.315248,-0.021113,0.020103,-0.082994,0.055816,10384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7392,-0.074614,0.778274,0.025401,-0.265262,-0.425223,0.222538,-0.002890,-0.069130,-0.055780,0.009981,-0.000259,0.092745,0.088248,0.019475,-0.018000,-0.001411,17041
7393,0.012253,-0.798891,0.025764,-0.290170,1.020321,-0.082758,0.019182,0.126206,-0.015714,-0.018513,-0.012287,-0.123608,0.029745,-0.040419,-0.031319,0.047082,17042
7394,-0.024870,-0.808236,0.025652,-0.257790,1.027813,0.051018,0.002960,-0.060369,-0.072532,0.004550,-0.000481,0.138305,0.002161,0.027962,0.056774,-0.023536,17043
7395,-0.003358,-0.776650,0.025771,-0.250318,0.991968,-0.169090,0.063139,-0.046665,-0.130255,-0.001640,0.019803,0.174844,0.070370,0.001435,0.023943,-0.059990,17044


In [726]:
# Display the workspace context (project, data collection, data split, model) to check what is selected.
tru

{
    "project": "Sales Forecasting v1-4",
    "data-collection": "OJ Sales Data",
    "data-split": "",
    "model": "Ridge Regression",
    "connection-string": "https://app.truera.net",
    "model_execution": "local"
}

In [730]:
# Select the "OJ Sales Data RF" collection and the Random Forest Regressor model.
tru.set_data_collection("OJ Sales Data RF")
tru.set_model("Random Forest Regressor")

INFO:truera.client.remote_truera_workspace:Data collection in remote environment is now set to "OJ Sales Data RF". The previous data collection ("OJ Sales Data") and its associated data splits and/or models have been cleared from the remote environment workspace context.
INFO:truera.client.remote_truera_workspace:Setting model context to "Random Forest Regressor".


In [731]:
# Obtain an explainer for the current workspace context (Random Forest Regressor on "OJ Sales Data RF").
RF_explainer = tru.get_explainer()

INFO:truera.client.truera_workspace:Download temp_dir: /var/folders/xy/j480xtkx56dd7r7r1h8q8tl40000gn/T/tmpq7sccvi1
INFO:truera.client.local.local_truera_workspace:Data collection in local environment is now set to "OJ Sales Data RF". The previous data collection ("OJ Sales Data") and its associated data splits and/or models have been cleared from the local environment workspace context.
INFO:truera.client.truera_workspace:Syncing data collection "OJ Sales Data RF" to local.
INFO:truera.client.truera_workspace:Syncing segments groups from remote to local.
INFO:truera.client.local.local_truera_workspace:Data collection in local environment is now set to "OJ Sales Data". The previous data collection ("OJ Sales Data RF") and its associated data splits and/or models have been cleared from the local environment workspace context.
INFO:truera.client.local.local_truera_workspace:Data collection in local environment is now set to "OJ Sales Data RF". The previous data collection ("OJ Sales Data

In [732]:
# Compute Random Forest feature influences for the same filtered production rows,
# using the pre-data, post-data and label columns named by column_spec.
RF_prod_FIs = RF_explainer.compute_feature_influences_for_data(pre_data = prod_data_df[column_spec.pre_data_col_names],
                                                                post_data = prod_data_df[column_spec.post_data_col_names],
                                                                ys = prod_data_df[column_spec.label_col_names])

|          | 0.000% [00:00<?]

In [735]:
# Attach the record id to every influence row. Assigning a Series aligns on the row index,
# so this relies on RF_prod_FIs sharing prod_data_df's index.
RF_prod_FIs['index'] = prod_data_df[column_spec.id_col_name]

In [154]:
# Persist the Ridge production feature influences; index=True also writes the DataFrame's row index.
LR_prod_FIs.to_csv('lr_prod_FIs.csv',index=True)

In [155]:
# Persist the Random Forest production feature influences; index=True also writes the DataFrame's row index.
RF_prod_FIs.to_csv('rf_prod_FIs.csv',index=True)