## Read Data

In [1]:
import pandas as pd
# Input file, given relative to the notebook's working directory.
fp = "../../data/online_retail_Q1_2010.parquet"
df = pd.read_parquet(fp)

In [2]:
# Show five randomly chosen rows; no random_state is passed, so the rows differ between runs.
df.sample(n=5)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
8275,494280,20725,LUNCH BAG RED SPOTTY,10,2010-01-13 10:43:00,1.65,12682.0,France
31970,496429,21042,RED SPOTTY APRON,2,2010-02-01 12:18:00,5.95,14527.0,United Kingdom
8342,494336,85032A,ROMANTIC IMAGES GIFT WRAP SET,144,2010-01-13 12:30:00,0.65,14137.0,United Kingdom
66623,500098,85206A,CREAM FELT EASTER EGG BASKET,3,2010-03-04 12:32:00,1.65,16771.0,United Kingdom
61836,499639,37441,ASSORTED WHITE EMBOSSED CHINA MUGS,6,2010-03-01 13:51:00,2.1,16519.0,United Kingdom


In [3]:
def valid_date_time(x):
    """Report whether ``x`` can be parsed into an actual timestamp.

    Parameters
    ----------
    x : object
        Raw ``InvoiceDate`` cell.

    Returns
    -------
    bool
        ``True`` when ``pd.to_datetime(x)`` succeeds and yields a real
        timestamp. ``False`` when parsing raises, or when the result is
        missing (``None``/``NaN`` inputs parse without error but carry no date).
    """
    try:
        parsed = pd.to_datetime(x)
    except Exception:
        return False
    # Missing inputs come back as None or NaT instead of raising.
    return parsed is not None and parsed is not pd.NaT
# Element-wise validity flag for every InvoiceDate value.
valid_datetime = df["InvoiceDate"].map(valid_date_time)

In [4]:
# Keep only the rows whose InvoiceDate passed the parse check.
df = df.loc[valid_datetime]

In [5]:
# assign() builds a new frame, so the parsed column is never written into the
# filtered slice produced by the previous cell (avoids SettingWithCopyWarning).
df = df.assign(InvoiceDate=pd.to_datetime(df["InvoiceDate"]))

## Identify the columns with mixed types

In [6]:
cols = list(df.columns)
# Pick the column names at positions 3, 5 and 6.
[cols[pos] for pos in (3, 5, 6)]

['Quantity', 'Price', 'Customer ID']

## Noise Filter #1 Definition

In [7]:
# One boolean mask per attribute: True where the value is present.
valid_cust = df["Customer ID"].notna()
valid_desc = df["Description"].notna()
# Must inspect StockCode itself; testing Description here would only duplicate valid_desc.
valid_stock_code = df["StockCode"].notna()

## Apply Noise Filter #1

In [8]:
# Keep a row only when customer, description AND stock code are all present.
# StockCode is tested directly on the column so that null stock codes are
# really removed before later steps treat the code as a string.
noise_filter_1 = valid_cust & valid_desc & df["StockCode"].notna()
df = df[noise_filter_1].reset_index(drop=True)

## Noise Filter #2 Definition

In [9]:
def good_quantity_record_check(x):
    """Return True when ``x`` can be read as a strictly positive number.

    Parameters
    ----------
    x : object
        Raw ``Quantity`` cell; may be a number, a numeric string, or junk.

    Returns
    -------
    bool
        ``True`` if ``float(x)`` succeeds and is greater than zero. ``False``
        for zero, negatives (returned purchases), NaN, and anything that
        cannot be converted to a float.
    """
    try:
        quantity = float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "bad record"; interrupts and other
        # unrelated errors propagate instead of being silently swallowed.
        return False
    # NaN compares False here, so missing quantities are rejected as well.
    return quantity > 0


## Apply Noise Filter #2

In [10]:
# Flag rows whose Quantity is a positive number, then keep only those rows.
good_quantity_records = df["Quantity"].apply(good_quantity_record_check)
df = df.loc[good_quantity_records].reset_index(drop=True)

## Noise Filter #3 Definition

In [11]:
# Rows whose Price cell holds one of the two non-numeric marker strings.
return_or_bank_charges = df["Price"].isin(["BANK CHARGES", "ADJUST"])
valid_purchases = ~return_or_bank_charges

## Apply Noise Filter #3

In [12]:
# Drop the marker rows and renumber the index from zero.
df = df.loc[valid_purchases].reset_index(drop=True)

## Noise Filter #4 Definition

In [13]:
def not_test_product(x):
    """Tell whether stock code ``x`` refers to a genuine product.

    Returns ``False`` when ``x`` contains ``"TEST"`` or equals one of the
    non-product codes (``"ADJUST"``, ``"BANK CHARGES"``, ``"C2"``, ``"M"``),
    and ``True`` otherwise.
    """
    non_product_codes = ("ADJUST", "BANK CHARGES", "C2", "M")
    is_test_entry = "TEST" in x
    return not (is_test_entry or x in non_product_codes)

## Apply Noise Filter #4

In [14]:
# Flag genuine products by stock code, then keep only those rows.
valid_products = df["StockCode"].apply(not_test_product)
df = df.loc[valid_products].reset_index(drop=True)

In [15]:
# Display the column names captured in `cols` earlier in the notebook.
cols

['Invoice',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'Price',
 'Customer ID',
 'Country']

## Noise Filter #5 Definition

In [16]:

# Target dtype for every column.
attr_types = {
    "Invoice": str,
    "StockCode": str,
    "Description": str,
    "Quantity": float,
    "InvoiceDate": 'datetime64[ns]',
    "Price": float,
    "Customer ID": str,
    "Country": str,
}
df = df.astype(attr_types)
# Mask of rows invoiced in the first quarter of 2010.
Q1_2010 = df["InvoiceDate"].dt.year.eq(2010) & df["InvoiceDate"].dt.quarter.eq(1)

## Apply Noise Filter #5

In [17]:
# Restrict to the Q1 2010 rows and renumber the index from zero.
df = df.loc[Q1_2010].reset_index(drop=True)

## Save Prepared Data to Disk

In [18]:
# Reuses the name `fp` (previously the input path) for the output location.
fp = "../../data/retail_q1_post_EDA.parquet"
# index=False: the row index is not written to the file.
df.to_parquet(fp, index=False)

## KMDS Logging of EDA Observations

In [19]:
from ontology.kmds_ontology import *
from tagging.tag_types import ExploratoryTags

In [20]:
# Workflow object that the exploratory observations are attached to further down.
kaw = KnowledgeExtractionExperimentationWorkflow("retail_customer_modelling")

In [21]:
# Collects every observation object created below, in creation order.
exp_obs_list = []
# Running counter; each observation copies its current value into finding_seq.
observation_count = 1
e1 = ExploratoryObservation()

In [22]:
# Observation 1: mixed-type columns, tagged as a data-quality observation.
e1.finding = "Quantity, Price and Customer ID attributes have mixed types"
e1.finding_seq = observation_count
e1.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
exp_obs_list.append(e1)

In [23]:
# Advance the sequence number before the next observation is created.
observation_count += 1

In [24]:
# Observation 2: null values, tagged as a data-quality observation.
e2 = ExploratoryObservation()
e2.finding = "Customer ID, Description and Stock Code have null values"
e2.finding_seq = observation_count
e2.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
exp_obs_list.append(e2)

In [25]:
# Observation 3: non-numeric quantities, tagged as a data-quality observation.
observation_count += 1
e3 = ExploratoryObservation()
e3.finding = "Some Quantities are not numbers"
e3.finding_seq = observation_count
e3.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
exp_obs_list.append(e3)

In [26]:
# Observation 4: non-numeric prices, tagged as a data-quality observation.
observation_count += 1
e4 = ExploratoryObservation()
# Built from adjacent literals so the line break cannot eat the space after "or,".
e4.finding = (
    "Some Prices are not numbers, these correspond to transactions that are some kind of adjustment - like returns, or, "
    "denoting some kind of payment adjustment. "
)
e4.finding_seq = observation_count
e4.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
exp_obs_list.append(e4)

In [27]:
# Observation 5: invalid/test products, tagged as a data-quality observation.
observation_count += 1
e5 = ExploratoryObservation()
e5.finding = "Some products are not valid products. For example, some of these products seem to be created by software testers "
e5.finding_seq = observation_count
e5.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
exp_obs_list.append(e5)

In [28]:
# Observation 6: period restriction; the only one tagged as a relevance observation.
observation_count += 1
e6 = ExploratoryObservation()
e6.finding = "For this task, the period considered is first quarter of 2010, other data are not relevant for this report "
e6.finding_seq = observation_count
e6.exploratory_observation_type = ExploratoryTags.RELEVANCE_OBSERVATION.value
exp_obs_list.append(e6)

In [29]:
# Attach the collected observations to the workflow object.
kaw.has_exploratory_observations = exp_obs_list

In [30]:
from owlready2 import *
from utils.path_utils import get_kb_file_path
# get_ontology / get_ontology_path are not imported by name in this notebook;
# presumably they come from the star imports — TODO confirm.
onto = get_ontology(get_ontology_path()).load()
# Write the loaded ontology to the knowledge-base file in RDF/XML format.
onto.save(file=get_kb_file_path(file_name="example_ml_kb_exp_workflow"), format="rdfxml")