# TruEra Python SDK
## Virtual Model Ingestion
## Sales Forecasting demo

## Pre-requisites: Download and Install Truera Python Client
1. Download the Python wheel from the [Downloads](/downloads) page.
2. Install the wheel in your Python environment using `pip install truera-*.whl`.


In [253]:
import pandas as pd
import numpy as np
import pickle
import random

from truera.client.truera_workspace import TrueraWorkspace
from truera.client.truera_authentication import TokenAuthentication
from truera.client.truera_authentication import BasicAuthentication
from truera.client.ingestion import ColumnSpec, ModelOutputContext

from truera.client.ingestion.util import merge_dataframes_and_create_column_spec

import glob
import os

from datetime import datetime

------

## Virtual Model Ingestion

## Load all data

In [254]:
#for generating feature map
# Pre-transform features, post-transform features, and labels for the training set.
# Each CSV's first column is read as the index and then moved back into a regular
# column by reset_index().
X_train_pre = pd.read_csv('./pre_train.csv', index_col=0).reset_index()
X_train_post = pd.read_csv('./post_train.csv', index_col=0).reset_index()
y = pd.read_csv('./labels_train.csv', index_col=0).reset_index()

##### Ridge Regression training, validation, and production data with:
- unique ids
- predictions
- feature influences
- timestamps (for production splits)

In [255]:
# Ridge Regression frames: training, validation, and production.
# The first CSV column is used as the DataFrame index.
lr_train_data_df = pd.read_csv("lr_train_data_df.csv", index_col=[0])
lr_val_data_df = pd.read_csv("lr_val_data_df.csv", index_col=[0])
lr_prod_data_df = pd.read_csv("lr_prod_data_df.csv", index_col=[0])

##### Random Forest Regressor training, validation, and production data with:
- unique ids
- predictions
- feature influences
- timestamps (for production splits)

In [256]:
# Random Forest Regressor frames: training, validation, and production.
# The first CSV column is used as the DataFrame index.
rf_train_data_df = pd.read_csv("rf_train_data_df.csv", index_col=0)
rf_val_data_df = pd.read_csv("rf_val_data_df.csv", index_col=0)
rf_prod_data_df = pd.read_csv("rf_prod_data_df.csv", index_col=0)

In [257]:
# Load the pickled column spec used below for the training and validation splits.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files from a trusted source.
with open('column_spec.pkl', 'rb') as f:
    column_spec = pickle.load(f)

In [258]:
# Load the pickled column spec used below for the production data.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files from a trusted source.
with open('prod_column_spec.pkl', 'rb') as f:
    prod_column_spec = pickle.load(f)

In [259]:
# Load the pickled column spec used below for the background split.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files from a trusted source.
with open('background_column_spec.pkl', 'rb') as f:
    background_column_spec = pickle.load(f)

In [260]:
# Display the loaded spec so its id, timestamp, pre- and post-data columns can be inspected.
background_column_spec

ColumnSpec(id_col_name='index', ranking_item_id_column_name=None, ranking_group_id_column_name=None, timestamp_col_name='datetime', tags_col_name=None, extra_data_col_names=[], pre_data_col_names=['store', 'brand', 'week', 'feat', 'price', 'AGE60', 'EDUC', 'ETHNIC', 'INCOME', 'HHLARGE', 'WORKWOM', 'HVAL150', 'SSTRDIST', 'SSTRVOL', 'CPDIST5', 'CPWVOL5'], post_data_col_names=['store', 'week', 'feat', 'price', 'AGE60', 'EDUC', 'ETHNIC', 'INCOME', 'HHLARGE', 'WORKWOM', 'HVAL150', 'SSTRDIST', 'SSTRVOL', 'CPDIST5', 'CPWVOL5', 'brand_dominicks', 'brand_minute.maid', 'brand_tropicana'], prediction_col_names=[], label_col_names=[], feature_influence_col_names=[])

Note that `background_column_spec` has no predictions or labels. These can be included, but they are not required.

## Create Project
A project is a collection of models and datasets solving a single problem statement.
Users can be provided access to collaborate on a project.

In [None]:
# Connection details: the deployment URL and the token used to authenticate.
# Replace the placeholder with your own token before running the next cell.
TRUERA_URL = "https://app.truera.net"
AUTH_TOKEN = "<insert auth token>"

In [None]:
# Build token-based credentials and open the workspace used by every cell below.
auth = TokenAuthentication(AUTH_TOKEN)
tru = TrueraWorkspace(
    TRUERA_URL,
    auth,
    ignore_version_mismatch=True,
)

In [None]:
# Name of the project that will hold both models and their data collections.
project_name = "Sales Forecasting - Virtual Models"

In [None]:
# Create the project as a regression project.
tru.add_project(project_name, score_type='regression')

In [None]:
# NOTE(review): presumably makes model tests get created automatically whenever a
# split is ingested -- confirm against the SDK documentation.
tru.activate_client_setting('create_model_tests_on_split_ingestion')

## Data Collection
1. Use data schema, pre- & post-feature engineering, to create feature map
2. Add new data collection to project with feature map

In [None]:
# Load the pickled pre-to-post feature map passed to add_data_collection below.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files from a trusted source.
with open('feature_map.pkl', 'rb') as f:
    FEATURE_MAP = pickle.load(f)

In [None]:
# Display the loaded feature map for inspection.
FEATURE_MAP

In [None]:
# Data collection for the Ridge Regression model; no transform is supplied with
# the model, so the pre-to-post feature map is provided here.
tru.add_data_collection(
    "OJ Sales Data LR",
    pre_to_post_feature_map=FEATURE_MAP,
    provide_transform_with_model=False,
)

## Model 1: Ridge Regression
1. add a 'virtual' model -- placeholder for associated I/O data that will be ingested
2. add data
* background split - creates basis for interpretation of feature influences associated with subsequent dev & prod data
* training data
* validation data
* production data

Note that there are three separate column specs used:
1. background column spec -- index, pre, post, and optionally labels and predictions. No feature influences.
2. dev column spec -- index, pre, post, labels, predictions, and feature influences.
3. prod column spec -- index, pre, post, labels, predictions, feature influences, and timestamps. 

In [None]:
# Register the model by name only; it acts as a 'virtual' placeholder for the
# input/output data ingested in the cells below.
model_name = 'Ridge Regression'
tru.add_python_model(model_name)

### Background data

In [None]:
# Ingest the background split for the current model. No background_split_name is
# passed here, unlike the training/validation/production cells that reference this split.
# NOTE(review): `background_data_df` is not assigned in any cell visible in this
# notebook -- load or build it before running this cell, otherwise this raises NameError.
tru.add_data(
        data=background_data_df,
        data_split_name='background data',
        column_spec=background_column_spec,
        model_output_context=ModelOutputContext(
            model_name=model_name,
            influence_type='truera-qii',
            score_type='regression'))

### Training Data

In [None]:
# Ingest the Ridge Regression training split; its model outputs are tied to the
# 'background data' split ingested above.
tru.add_data(
    data=lr_train_data_df,
    data_split_name='training data',
    column_spec=column_spec,
    model_output_context=ModelOutputContext(
        model_name=model_name,
        background_split_name='background data',
        influence_type='truera-qii',
        score_type='regression',
    ),
)

### Validation Data

In [None]:
# Ingest the Ridge Regression validation split with the same column spec and
# background split as the training data.
tru.add_data(
    data=lr_val_data_df,
    data_split_name='validation data',
    column_spec=column_spec,
    model_output_context=ModelOutputContext(
        model_name=model_name,
        background_split_name='background data',
        influence_type='truera-qii',
        score_type='regression',
    ),
)

### Production Data

In [None]:
# Ingest the Ridge Regression production data using the production column spec.
# model_name is re-assigned so this cell can be run on its own.
model_name = 'Ridge Regression'
tru.add_production_data(
    data=lr_prod_data_df,
    column_spec=prod_column_spec,
    model_output_context=ModelOutputContext(
        model_name=model_name,
        background_split_name='background data',
        influence_type='truera-qii',
        score_type='regression',
    ),
)

## Model 2: Random Forest Regressor
1. add a 'virtual' model -- placeholder for associated I/O data that will be ingested
2. add data
* background split - creates basis for interpretation of feature influences associated with subsequent dev & prod data
* training data
* validation data
* production data

In [None]:
# Separate data collection for the Random Forest model, built with the same feature map.
tru.add_data_collection(
    "OJ Sales Data RF",
    pre_to_post_feature_map=FEATURE_MAP,
    provide_transform_with_model=False,
)

In [None]:
# Register the second model by name only; from here on model_name refers to the
# Random Forest Regressor.
model_name = 'Random Forest Regressor'
tru.add_python_model(model_name)

### Background data

In [None]:
# Ingest the background split for the Random Forest model. No background_split_name
# is passed here, unlike the training/validation/production cells that reference this split.
# NOTE(review): `background_data_df` is not assigned in any cell visible in this
# notebook -- load or build it before running this cell, otherwise this raises NameError.
tru.add_data(
        data=background_data_df,
        data_split_name='background data',
        column_spec=background_column_spec,
        model_output_context=ModelOutputContext(
            model_name=model_name,
            influence_type='truera-qii',
            score_type='regression'))

### Training Data

In [None]:
# Ingest the Random Forest training split; its model outputs are tied to the
# 'background data' split ingested above.
tru.add_data(
    data=rf_train_data_df,
    data_split_name='training data',
    column_spec=column_spec,
    model_output_context=ModelOutputContext(
        model_name=model_name,
        background_split_name='background data',
        influence_type='truera-qii',
        score_type='regression',
    ),
)

### Validation Data

In [None]:
# Ingest the Random Forest validation split with the same column spec and
# background split as the training data.
tru.add_data(
    data=rf_val_data_df,
    data_split_name='validation data',
    column_spec=column_spec,
    model_output_context=ModelOutputContext(
        model_name=model_name,
        background_split_name='background data',
        influence_type='truera-qii',
        score_type='regression',
    ),
)

### Production Data

In [None]:
# Ingest the Random Forest production data using the production column spec.
tru.add_production_data(
    data=rf_prod_data_df,
    column_spec=prod_column_spec,
    model_output_context=ModelOutputContext(
        model_name=model_name,
        background_split_name='background data',
        influence_type='truera-qii',
        score_type='regression',
    ),
)