# Purpose


This notebook demonstrates the data pipeline from raw tables to analytical datasets. At the end of this activity, train & test data sets are created from raw data.



## Imports

In [87]:
from pprint import pprint
import os
import os.path as op
import shutil
import matplotlib.pyplot as plt
import seaborn as sns

# standard third party imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.options.mode.use_inf_as_na = True


In [88]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
# standard code-template imports
from ta_lib.core.api import (
    create_context, get_dataframe, get_feature_names_from_column_transformer, get_package_path,
    display_as_tabs, string_cleaning, merge_info, initialize_environment,
    list_datasets, load_dataset, save_dataset
)
import ta_lib.eda.api as eda

In [90]:
import warnings

warnings.filterwarnings('ignore', message="The default value of regex will change from True to False in a future version.", 
                        category=FutureWarning)

In [91]:
initialize_environment(debug=False, hide_warnings=True)

## Utility functions

# 1. Initialization


In [92]:
config_path = op.join('conf', 'config.yml')
context = create_context(config_path)
pprint(list_datasets(context))


['/raw/google',
 '/raw/product',
 '/raw/sales',
 '/raw/social_media',
 '/raw/Theme_list',
 '/raw/Theme_Product_list',
 '/cleaned/google',
 '/cleaned/product',
 '/cleaned/sales',
 '/cleaned/social_media',
 '/cleaned/Theme_list',
 '/cleaned/Theme_Product_list',
 '/processed/sales_data_processed',
 '/processed/social_media_data_processed',
 '/processed/google_search_data_processed',
 '/processed/sales_social_google_data_processed',
 '/train/features',
 '/train/target',
 '/test/features',
 '/test/target',
 '/score/output']


In [93]:
# load datasets
google_search_data = load_dataset(context, 'raw/google')
product_manufacturer_list = load_dataset(context, 'raw/product')
sales_data = load_dataset(context, 'raw/sales')
social_media_data = load_dataset(context, 'raw/social_media')
Theme_list = load_dataset(context, 'raw/Theme_list')
Theme_product_list = load_dataset(context, 'raw/Theme_Product_list')


# 2. Data cleaning and consolidation

**<u>NOTES</u>**

The focus here is to create a cleaned dataset that is appropriate for solving the DS problem at hand from the raw data.

**1. Do**
* clean dataframe column names
* Converting date columns to date time format
* ensure dtypes are set properly
* join with other tables etc to create features
* transform, if appropriate, datetime like columns to generate additional features (weekday etc)
* transform, if appropriate, string columns to generate additional features
* discard cols that are not useful for training the model (IDs, constant cols, duplicate cols etc)
* additional features generated from existing columns


**2. Don't**
* handle missing values or outliers here. mark them and leave them for processing downstream.


## 2.1 Clean Tables 

### Data Sources

From data discovery, we know the following

* all columns are strings : nothing to fix. Apply generic cleaning (strip extra whitespace etc)
* ensure all `invalid` string entries are mapped to np.NaN
* some column are duplicates. Better to `coalesce` them instead of an outright discard of one of the columns.
* Claim_id is key column : ensure no duplicate values
* This will go into production code


#### SOCIAL MEDIA DATA

In [94]:
social_media_data_clean = (
    social_media_data
    # while iterating on testing, it's good to copy the dataset(or a subset)
    # as the following steps will mutate the input dataframe. The copy should be
    # removed in the production code to avoid introducing perf. bottlenecks.
    .copy()
    # set dtypes : nothing to do here
    .passthrough()
    # Dropping null values from the dataset
    .dropna()
    # .rename(columns = {'test':'TEST'}, inplace = True)
    .rename(columns = {'published_date': 'date', 'Theme Id': 'Claim_Id'})
    # Convert argument to datetime.
    .to_datetime('date')
)
social_media_data_clean.head()

Unnamed: 0,Claim_Id,date,total_post
0,148.0,2015-10-01,76
1,148.0,2015-10-10,31
2,148.0,2015-10-11,65
3,148.0,2015-10-12,88
4,148.0,2015-10-13,85


#### SALES DATA TABLE

In [95]:
sales_data_clean = (
    sales_data
    # while iterating on testing, it's good to copy the dataset(or a subset)
    # as the following steps will mutate the input dataframe. The copy should be
    # removed in the production code to avoid introducing perf. bottlenecks.
    .copy()
    # set dtypes : nothing to do here
    .passthrough()
    # .rename(columns = {'test':'TEST'}, inplace = True)
    .rename(columns = {'system_calendar_key_N': 'date'})

)
# Convert argument to datetime.
sales_data_clean['date']=pd.to_datetime(sales_data_clean['date'].astype(str), format='%Y-%m-%d')
sales_data_clean.head()


Unnamed: 0,date,product_id,sales_dollars_value,sales_units_value,sales_lbs_value
0,2016-01-09,1,13927.0,934,18680
1,2016-01-09,3,10289.0,1592,28646
2,2016-01-09,4,357.0,22,440
3,2016-01-09,6,23113.0,2027,81088
4,2016-01-09,7,23177.0,3231,58164


#### GOOGLE SEARCH TABLE

In [96]:
google_search_data_clean = (
    google_search_data
    # while iterating on testing, it's good to copy the dataset(or a subset)
    # as the following steps will mutate the input dataframe. The copy should be
    # removed in the production code to avoid introducing perf. bottlenecks.
    .copy()
    # set dtypes : nothing to do here
    .passthrough()

    .drop(['year_new', 'week_number'], axis=1)
    # .rename(columns = {'test':'TEST'}, inplace = True)
    .rename(columns = {'Claim_ID': 'Claim_Id'})
    # Convert argument to datetime.
    .to_datetime('date', format='%d-%m-%Y')
)
google_search_data_clean.head()

Unnamed: 0,date,platform,searchVolume,Claim_Id
0,2014-01-05,google,349,916
1,2014-01-06,google,349,916
2,2014-01-07,google,697,916
3,2014-01-10,google,349,916
4,2014-01-20,google,697,916


#### PRODUCT MANUFACTURE TABLE

In [97]:
product_manufacturer_list_clean = (
    product_manufacturer_list[['PRODUCT_ID','Vendor']]
    # while iterating on testing, it's good to copy the dataset(or a subset)
    # as the following steps will mutate the input dataframe. The copy should be
    # removed in the production code to avoid introducing perf. bottlenecks.
    .copy()
    # set dtypes : nothing to do here
    .passthrough()
    # .rename(columns = {'test':'TEST'}, inplace = True)
    .rename(columns = {'PRODUCT_ID': 'product_id'})
)
product_manufacturer_list_clean.head()

Unnamed: 0,product_id,Vendor
0,1,Others
1,2,Others
2,3,Others
3,4,Others
4,5,Others


#### THEME PRODUCT LIST TABLE

In [98]:
theme_product_list_clean = (
    Theme_product_list
    # while iterating on testing, it's good to copy the dataset(or a subset)
    # as the following steps will mutate the input dataframe. The copy should be
    # removed in the production code to avoid introducing perf. bottlenecks.
    .copy()
    # set dtypes : nothing to do here
    .passthrough()
    # .rename(columns = {'test':'TEST'}, inplace = True)
    .rename(columns = {'PRODUCT_ID': 'product_id', 'CLAIM_ID': 'Claim_Id'})
)
theme_product_list_clean.head()

Unnamed: 0,product_id,Claim_Id
0,26,8
1,29,8
2,48,81
3,50,81
4,74,227


#### THEME LIST TABLE

In [99]:
theme_list_clean = (
    Theme_list
    # while iterating on testing, it's good to copy the dataset(or a subset)
    # as the following steps will mutate the input dataframe. The copy should be
    # removed in the production code to avoid introducing perf. bottlenecks.
    .copy()
    # set dtypes : nothing to do here
    .passthrough()
    # .rename(columns = {'test':'TEST'}, inplace = True)
    .rename(columns = {'Claim Name': 'Claim_name', 'CLAIM_ID': 'Claim_Id'})
)
theme_list_clean.head()

Unnamed: 0,Claim_Id,Claim_name
0,0,No Claim
1,8,low carb
2,15,beans
3,16,cocoa
4,26,vanilla


### NOTE

It's always a good idea to save cleaned tabular data using a storage format that supports the following 

1. preserves the type information
2. language agnostic storage format
3. Supports compression
4. Supports customizing storage to optimize different data access patterns

For larger datasets, the last two points become crucial.

`Parquet` is one such file format that is very popular for storing tabular data. It has some nice properties:
- Similar to pickles & RDS datasets, but compatible with all languages
- Preserves the datatypes
- Compresses the data and reduces the filesize
- Good library support in Python and other languages
- As a columnar storage we can efficiently read fewer columns
- It also supports chunking data by groups of columns (for instance, by dates or a particular value of a key column) that makes loading subsets of the data fast.

In [100]:
save_dataset(context, google_search_data_clean, '/cleaned/google')
save_dataset(context, product_manufacturer_list_clean, '/cleaned/product')
save_dataset(context, sales_data_clean, '/cleaned/sales')
save_dataset(context, social_media_data_clean, '/cleaned/social_media')
save_dataset(context, theme_list_clean, '/cleaned/Theme_list')
save_dataset(context, theme_product_list_clean, '/cleaned/Theme_Product_list')


### Sales Table

From data discovery, we know the following

* key columns: None
* integer columns: product_id, sales_dollars_value, sales_units_value, sales_lbs_value
* datetime columns:  date
* This will go into production code

## 2.2 Create consolidated features table


Here we take the cleaned dataframes and merge them to form the consolidated table.

We know from data discovery that `sales_data` is a dimension table and `product_manufacturer_list` is a fact table, so we want to do a inner join here. * This will go into sales_product_data

In [101]:
sales_product_data = pd.merge(sales_data_clean, product_manufacturer_list_clean, how='inner', on='product_id')
merge_info(sales_data_clean, product_manufacturer_list_clean, sales_product_data)

Unnamed: 0,n_cols,n_rows
left_df,5,4526182
right_df,2,67175
merged_df,6,4526182


In [102]:
sales_product_data = pd.merge(sales_product_data, theme_product_list_clean, how='inner', on='product_id', validate='m:m')
merge_info(sales_product_data, theme_product_list_clean, sales_product_data)

Unnamed: 0,n_cols,n_rows
left_df,7,7767420
right_df,2,91485
merged_df,7,7767420


In [103]:
sales_product_data = pd.merge(sales_product_data, theme_list_clean, how='inner', on='Claim_Id', validate='m:1')
merge_info(sales_product_data, theme_list_clean, sales_product_data)

Unnamed: 0,n_cols,n_rows
left_df,8,7767420
right_df,2,208
merged_df,8,7767420


## 2.3 Business intuition features

This section can go into production code if these features are used in final model

#### Product id.
- Product id is the description of the product

In [104]:
# Dropping the Product ID 
sales_product_data2 = (
    sales_product_data
    .copy()
    .drop('product_id', axis=1)
)

In [105]:
# Calculating the unit price per unit for all of the vendors
sales_product_data2['per_unit_price'] = sales_product_data2['sales_dollars_value']/sales_product_data2['sales_units_value']

In [106]:
# Finding the unit price for every client
client_A_sales = sales_product_data2[sales_product_data2['Vendor'] == 'A'].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'sales_dollars_value':'sum', 'sales_units_value':'sum', 'sales_lbs_value':'sum', 'per_unit_price':'mean'}).reset_index().rename(columns={'sales_dollars_value': 'client_A_sales_dollars_value', 'sales_units_value': 'Client_A_sales_units_value', 'sales_lbs_value': 'client_A_sales_lbs_value', 'per_unit_price': 'Client_A_sales_unit_price'})
client_B_sales = sales_product_data2[sales_product_data2['Vendor'] == 'B'].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'per_unit_price':'mean'}).reset_index().rename(columns={'per_unit_price': 'B_sales_unit_price'})
client_E_sales = sales_product_data2[sales_product_data2['Vendor'] == 'E'].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'per_unit_price':'mean'}).reset_index().rename(columns={'per_unit_price': 'E_sales_unit_price'})
client_D_sales = sales_product_data2[sales_product_data2['Vendor'] == 'D'].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'per_unit_price':'mean'}).reset_index().rename(columns={'per_unit_price': 'D_sales_unit_price'})
client_F_sales = sales_product_data2[sales_product_data2['Vendor'] == 'F'].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'per_unit_price':'mean'}).reset_index().rename(columns={'per_unit_price': 'F_sales_unit_price'})
client_G_sales = sales_product_data2[sales_product_data2['Vendor'] == 'G'].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'per_unit_price':'mean'}).reset_index().rename(columns={'per_unit_price': 'G_sales_unit_price'})
client_H_sales = sales_product_data2[sales_product_data2['Vendor'] == 'H'].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'per_unit_price':'mean'}).reset_index().rename(columns={'per_unit_price': 'H_sales_unit_price'})
client_Private_sales = sales_product_data2[sales_product_data2['Vendor'] == 'Private Label'].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'per_unit_price':'mean'}).reset_index().rename(columns={'per_unit_price': 'Private_sales_unit_price'})
client_other_sales = sales_product_data2[sales_product_data2['Vendor'] == 'Others'].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'per_unit_price':'mean'}).reset_index().rename(columns={'per_unit_price': 'others_sales_unit_price'})

####  Merging the client unit price in our dataframe for comparison

In [107]:
A_B_sales = client_A_sales.merge(client_B_sales, on=['date', 'Claim_Id', 'Claim_name'], how='left')

In [108]:
A_B_E_sales = A_B_sales.merge(client_E_sales, on=['date', 'Claim_Id', 'Claim_name'], how='left')
print(A_B_E_sales.shape)

(5774, 9)


In [109]:
A_B_E_D_sales = A_B_E_sales.merge(client_D_sales, on=['date', 'Claim_Id', 'Claim_name'], how='left')
print(A_B_E_D_sales.shape)

(5774, 10)


In [110]:
A_B_E_D_F_sales = A_B_E_D_sales.merge(client_F_sales, on=['date', 'Claim_Id', 'Claim_name'], how='left')
print(A_B_E_D_F_sales.shape)

(5774, 11)


In [111]:
A_B_E_D_F_G_sales = A_B_E_D_F_sales.merge(client_G_sales, on=['date', 'Claim_Id', 'Claim_name'], how='left')
print(A_B_E_D_F_G_sales.shape)

(5774, 12)


In [112]:
A_B_E_D_F_G_H_sales = A_B_E_D_F_G_sales.merge(client_H_sales, on=['date', 'Claim_Id', 'Claim_name'], how='left')
print(A_B_E_D_F_G_H_sales.shape)

(5774, 13)


In [113]:
A_B_E_D_F_G_H__Private_sales = A_B_E_D_F_G_H_sales.merge(client_Private_sales, on=['date', 'Claim_Id', 'Claim_name'], how='left')
print(A_B_E_D_F_G_H__Private_sales.shape)

(5774, 14)


In [114]:
sales_data_processed = A_B_E_D_F_G_H__Private_sales.merge(client_other_sales, on=['date', 'Claim_Id', 'Claim_name'], how='left')
print(sales_data_processed.shape)

(5774, 15)


In [115]:
sales_data_processed.fillna(0, inplace=True)
sales_data_processed.head()

Unnamed: 0,date,Claim_Id,Claim_name,client_A_sales_dollars_value,Client_A_sales_units_value,client_A_sales_lbs_value,Client_A_sales_unit_price,B_sales_unit_price,E_sales_unit_price,D_sales_unit_price,F_sales_unit_price,G_sales_unit_price,H_sales_unit_price,Private_sales_unit_price,others_sales_unit_price
0,2016-01-09,0,No Claim,132979903.0,57169562,136475132,7.048409,9.858631,11.727692,7.057042,16.829154,11.986452,13.335306,5.557718,10.752635
1,2016-01-09,8,low carb,8853853.0,1952575,2947044,11.837333,16.159491,0.0,14.633429,18.921083,0.0,13.990691,11.899578,15.345391
2,2016-01-09,15,beans,6015.0,326,1793,18.45092,0.0,0.0,51.829144,0.0,0.0,0.0,11.006135,21.444175
3,2016-01-09,32,stroganoff,7482235.0,1168180,3642177,17.630763,21.507496,0.0,15.876835,19.190919,0.0,20.359431,0.0,16.407875
4,2016-01-09,39,high/source of protein,69542.0,19753,21040,11.576619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.46238


In [116]:
social_media_data_processed = social_media_data_clean.merge(theme_list_clean, on='Claim_Id')
social_media_data_processed

Unnamed: 0,Claim_Id,date,total_post,Claim_name
0,148.0,2015-10-01,76,tuna
1,148.0,2015-10-10,31,tuna
2,148.0,2015-10-11,65,tuna
3,148.0,2015-10-12,88,tuna
4,148.0,2015-10-13,85,tuna
...,...,...,...,...
314874,876.0,2019-09-05,4658,caramel
314875,876.0,2019-09-06,3731,caramel
314876,876.0,2019-09-07,2336,caramel
314877,876.0,2019-09-08,1374,caramel


In [117]:
social_media_data_processed.set_index('date', inplace=True)

#### Making the claim Id, Name as unique features
- features represent the unique identification of the product.

In [118]:
#### Unique Claim_name
social_media_data_processed = social_media_data_processed.groupby(['Claim_Id','Claim_name',
                                                pd.Grouper(freq='W-SAT', closed='right', label='right')]).sum()
social_media_data_processed = social_media_data_processed.reset_index()
social_media_data_processed

Unnamed: 0,Claim_Id,Claim_name,date,total_post
0,8.0,low carb,2015-05-23,307
1,8.0,low carb,2015-05-30,923
2,8.0,low carb,2015-06-06,1705
3,8.0,low carb,2015-06-13,1089
4,8.0,low carb,2015-06-20,913
...,...,...,...,...
44738,999.0,oral health,2019-10-05,146
44739,999.0,oral health,2019-10-12,166
44740,999.0,oral health,2019-10-19,139
44741,999.0,oral health,2019-10-26,131


In [119]:
google_search_data2 = google_search_data_clean.merge(theme_list_clean, on='Claim_Id')

In [120]:
google_search_data2.set_index('date', inplace=True)
df_weekly_g = google_search_data2.groupby(['platform', 'Claim_Id', 'Claim_name']).resample('W-SAT').sum()
df_weekly_g.drop('Claim_Id', axis=1, inplace=True)
df_weekly_g = df_weekly_g.reset_index()
df_weekly_g

Unnamed: 0,platform,Claim_Id,Claim_name,date,searchVolume
0,amazon,8,low carb,2018-01-06,12042
1,amazon,8,low carb,2018-01-13,12631
2,amazon,8,low carb,2018-01-20,12544
3,amazon,8,low carb,2018-01-27,12337
4,amazon,8,low carb,2018-02-03,12667
...,...,...,...,...,...
57843,walmart,980,honey,2019-08-31,126
57844,walmart,980,honey,2019-09-07,209
57845,walmart,980,honey,2019-09-14,84
57846,walmart,980,honey,2019-09-21,209


##### Adding all the search platform in the dataset

In [121]:
amazon_searches = df_weekly_g[df_weekly_g['platform'] == "amazon"].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'searchVolume':'sum'}).reset_index().rename(columns={'searchVolume': 'amazon_searchVolume'})
chewy_searches = df_weekly_g[df_weekly_g['platform'] == "chewy"].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'searchVolume':'sum'}).reset_index().rename(columns={'searchVolume': 'chewy_searchVolume'})
google_searches = df_weekly_g[df_weekly_g['platform'] == "google"].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'searchVolume':'sum'}).reset_index().rename(columns={'searchVolume': 'google_searchVolume'})
walmart_searches = df_weekly_g[df_weekly_g['platform'] == "walmart"].groupby(['date', 'Claim_Id', 'Claim_name']).agg({'searchVolume':'sum'}).reset_index().rename(columns={'searchVolume': 'walmart_searchVolume'})

In [122]:
am_ch = amazon_searches.merge(chewy_searches, on=['date', 'Claim_Id', 'Claim_name'], how='outer')
am_ch_go = am_ch.merge(google_searches, on=['date', 'Claim_Id', 'Claim_name'], how='outer')
google_search_data_processed = am_ch_go.merge(walmart_searches, on=['date', 'Claim_Id', 'Claim_name'], how='outer')
google_search_data_processed.fillna(0, inplace=True)
google_search_data_processed

Unnamed: 0,date,Claim_Id,Claim_name,amazon_searchVolume,chewy_searchVolume,google_searchVolume,walmart_searchVolume
0,2018-01-06,8,low carb,12042.0,4931.0,54624.0,978.0
1,2018-01-06,39,high/source of protein,89.0,60.0,0.0,0.0
2,2018-01-06,65,ethical - not specific,60.0,0.0,2119.0,0.0
3,2018-01-06,75,french bisque,712.0,178.0,16962.0,297.0
4,2018-01-06,152,low sugar,418.0,0.0,18804.0,0.0
...,...,...,...,...,...,...,...
39979,2019-10-05,903,omega-3,0.0,0.0,149.0,0.0
39980,2019-10-05,907,dha,0.0,0.0,21403.0,0.0
39981,2019-10-05,948,sole,0.0,0.0,73.0,0.0
39982,2019-10-05,979,added iron,0.0,0.0,373.0,0.0


#### We have final 3 dataset with the name of sales_data_processed, Social_media_processed and google_theme_processed, we are merging to together as our final dataset for training the model.

In [123]:
sales_social_data_processed = sales_data_processed.merge(social_media_data_processed,
                          on = ['date','Claim_Id', 'Claim_name'],
                          how='inner')
sales_social_data_processed.shape

(3789, 16)

In [124]:
sales_social_google_data_processed = sales_social_data_processed.merge(google_search_data_processed,
                          on = ['date','Claim_Id', 'Claim_name'],
                          how='inner')
sales_social_google_data_processed.drop('date',axis=1,inplace= True)
sales_social_google_data_processed.shape

(2966, 19)

In [125]:
top_sales = (
    sales_social_google_data_processed
    .groupby(['Claim_name']).agg({'client_A_sales_dollars_value':'sum'}).reset_index()
    .sort_values(by=['client_A_sales_dollars_value'], ascending=False)
    .head(3)
)
sales_social_google_data_processed=sales_social_google_data_processed[sales_social_google_data_processed['Claim_name'].isin(top_sales['Claim_name'])]
sales_social_google_data_processed.shape

(582, 19)

In [126]:
# Any verifications on the data
from ta_lib.eda.api import get_variable_summary
display_as_tabs([
    ("Summary", f"Length: {len(sales_social_google_data_processed)}, Columns: {len(sales_social_google_data_processed.columns)}"),
    ("Variable summary", get_variable_summary(sales_social_google_data_processed)),
    ("head", sales_social_google_data_processed.head(5).T),
    ("tail", sales_social_google_data_processed.tail(5).T),
])

In [127]:
save_dataset(context, sales_social_google_data_processed, 'processed/sales_social_google_data_processed')

# 3. Generate Train, Validation and Test datasets



- We split the data into train, test (optionally, also a validation dataset)
- In this example, we are binning the target into 10 quantiles and then use a Stratified Shuffle to split the data.
- See sklearn documentation on the various available splitters
- https://scikit-learn.org/stable/modules/classes.html#splitter-classes
- This will go into production code (training only)

In [128]:
from sklearn.model_selection import StratifiedShuffleSplit
from ta_lib.core.api import custom_train_test_split # helper function to customize splitting
from scripts import *

sales_df_train, sales_df_test = train_test_split(sales_social_google_data_processed, test_size=0.1, random_state=42)
target_col = "Client_A_sales_units_value"

In [129]:
train_X, train_y = (
    sales_df_train
    # split the dataset to train and test
    .get_features_targets(target_column_names=target_col)
)
save_dataset(context, train_X, 'train/features')
save_dataset(context, train_y, 'train/target')

test_X, test_y = (
    sales_df_test
    # split the dataset to train and test
    .get_features_targets(target_column_names=target_col)
)
save_dataset(context, test_X, 'test/features')
save_dataset(context, test_y, 'test/target')
