## Read cleaned (denoised) data, post EDA

In [1]:
import pandas as pd
# Location of the cleaned Q1 retail data written at the end of the EDA step
# (relative to this notebook's working directory).
fp = "../../data/retail_q1_post_EDA.parquet"

# Load the parquet file into the working DataFrame used by the cells below.
df = pd.read_parquet(path=fp)

## Develop daily sales data for store inventory
Daily sales for the store inventory in Q1 are generated by summing the sales value (quantity × price) of each inventory item for each business day of Q1.

In [2]:
# Value of each invoice line: units sold times unit price.
df["item_total"] = df["Quantity"] * df["Price"]

# Total sales per (day of year, stock code) in long form. The group keys are
# Series named "InvoiceDate" and "StockCode", so those become the column names
# once the grouped index is reset.
# NOTE(review): day-of-year only identifies a date uniquely if the data covers
# a single calendar year -- confirm for the Q1 extract.
dsbysc = (
    df.groupby([df["InvoiceDate"].dt.day_of_year, df["StockCode"]])["item_total"]
    .sum()
    .reset_index()
)

# Reshape to wide form: one row per day, one column per stock code. Days on
# which an item had no sales come out of the pivot as NaN and are set to 0.
dfQ1_PA = dsbysc.pivot(
    index="InvoiceDate",
    columns="StockCode",
    values="item_total",
).fillna(0)

## Write the transformed data representation for use in modelling

In [3]:
# Destination for the day-by-item sales matrix consumed by the modelling step.
fp = "../../data/retail_q1_post_data_rep_prep.parquet"

# NOTE(review): index=False leaves the frame's index out of the written file,
# so only the per-item columns are persisted -- confirm downstream code does
# not need the day-of-year index.
dfQ1_PA.to_parquet(path=fp, index=False)

## Log the data transformation information to KMDS

In [4]:
from ontology.kmds_ontology import *
from tagging.tag_types import DataRepresentationTags
from owlready2 import *
from utils.load_utils import *
# Name of the knowledge base this notebook updates; it is reused below to
# resolve the file path when the ontology is saved.
KNOWLEDGE_BASE = "example_ml_kb_exp_workflow"
# Load the existing knowledge base so new observations can be added to it.
# NOTE(review): load_kb comes from one of the star imports above (presumably
# utils.load_utils) -- confirm what it returns.
onto = load_kb(KNOWLEDGE_BASE)

In [5]:
# Record the data-transformation step as an observation in the knowledge base.
dr_obs_list = []
observation_count = 1
# Working inside `with onto` means we are updating the ontology from the EDA
# phase that was loaded earlier.
with onto:
    dr1 = DataRepresentationObservation()
    # Adjacent string literals are joined by the parser. A backslash line
    # continuation inside a single literal would pull the next line's leading
    # indentation into the stored text ("sales    for each ...").
    dr1.finding = (
        "The daily sales data for the store inventory for Q1 was generated by computing sales "
        "for each inventory item for each business day of Q1"
    )
    dr1.finding_seq = observation_count
    dr1.data_representation_observation_type = DataRepresentationTags.DATA_TRANSFORMATION_OBSERVATION.value
    dr_obs_list.append(dr1)

    # Attach the observation list to the workflow and write the updated
    # knowledge base back to its file in RDF/XML format.
    the_workflow = get_workflow(onto)
    the_workflow.has_data_representation_observations = dr_obs_list
    onto.save(file=get_kb_file_path(file_name=KNOWLEDGE_BASE), format="rdfxml")