In [1]:
# Display the value of every top-level expression in a cell, not only the last one,
# so results appear without having to wrap them in print()
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import pandas as pd
import os
from pathlib import Path

# (re)load the autoreload extension; mode 2 reloads all imported modules before
# each cell is executed, so edits to library code are picked up without a kernel restart
%reload_ext autoreload
%autoreload 2
# for cleaning and discovery
from ds_discovery import Transition

# Set the environment working path as the root of the Jupyter instance.
# PWD is exported by POSIX shells only; when it is missing or empty (e.g. on
# Windows) fall back to the kernel's current working directory instead of
# failing with a KeyError. as_posix() keeps the stored path forward-slashed.
os.environ['DSTU_WORK_PATH'] = Path(os.environ.get('PWD') or os.getcwd()).as_posix()


# Accelerated Machine Learning
## Transitioning Contract

In [2]:
# create or retrieve the named Transition instance
# `tr` is the handle every later cell uses to set contracts, notes and cleaners
tr = Transition('synthetic_agent')

#### Reset the Contract

In [3]:
# NOTE(review): presumably clears any contract previously stored for this instance
# so the cells below start from a clean state — confirm against ds_discovery
tr.reset_transition_contracts()

### Data Source

In [4]:
# Register the source contract: a comma-separated, latin1-encoded CSV file.
# load=False — presumably nothing is read at this point; the frame is loaded
# explicitly in the next cell (TODO confirm).
tr.set_source_contract(
    resource='synthetic_agent.csv',
    sep=',',
    encoding='latin1',
    load=False,
)

### Retrieve & Observations

In [5]:
# load the source canonical
# `df` is the frame the cleaner cells below are called on with inplace=True
df = tr.load_source_canonical()

In [6]:
# profile the raw frame, one row per attribute: dtype, % null, count, unique values
# and either sample values or min/max/mean (see the table below)
tr.canonical_report(df, stylise=True)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,agent,object,0.0%,5.4%,10000,40,Sample: Cleo | Emily | Alexi
1,call_date,object,0.0%,0.0%,10000,9850,Sample: 10-25-2018 19:55:21 | 10-25-2018 10:15:19 | 10-16-2018 13:48:54
2,call_id,int64,0.0%,0.0%,10000,10000,max=9999492 | min=1000583 | mean=5509490.32
3,complaint,object,0.0%,16.5%,10000,29,Sample: Policy amendments not carried out | You did not explain the terms & conditions | You did not...
4,contact,object,0.0%,54.2%,10000,13,Sample: Fax | Retail Voice | Third Party Call
5,customer_id,object,0.0%,0.4%,10000,498,Sample: CU_8399937 | CU_9367831 | CU_1876085
6,duration,object,0.0%,0.3%,10000,1406,Sample: 12:19 | 14:57 | 15:36
7,escalated,int64,0.0%,95.3%,10000,2,max=1 | min=0 | mean=0.05
8,referred,int64,0.0%,95.1%,10000,2,max=1 | min=0 | mean=0.05
9,stat_0,float64,97.5%,94.0%,250,2,max=1.0 | min=0.0 | mean=0.06


In [7]:
# general note with no label: the notes report lists it under section 'overview', label 'notes'
tr.add_notes(text='The file is a synthetic agent data file created for this demonstration')

In [8]:
# provenance notes: both entries share the 'source' label and are listed together in the notes report
tr.add_notes(label='source', text='This was generated using the Discovery Behavioral Synthetic Data Generator')
tr.add_notes(label='source', text='The script to rerun the data generation can be found in the synthetic scripts folder')

In [9]:
# attribute-level notes: all three entries are listed in the 'attribute' section of the notes report
# NOTE(review): add_notes(..., note_type='attribute') and add_attribute_notes(...) appear to be
# equivalent — confirm, then use one form consistently
tr.add_notes(label='stat*', text="are noise within the file", note_type='attribute')
tr.add_attribute_notes(attribute='customer_id', text="This is a weighted list of customers taken from the synthetic customer file")
tr.add_attribute_notes(attribute='customer_id', text="the two files can be joined by this key")

In [10]:
# show every note recorded so far with its section, label, date and text
tr.report_notes(stylise=True)

Unnamed: 0,section,label,date,text
0,overview,notes,2019-06-25 14:20,The file is a synthetic agent data file created for this demonstration
1,,source,2019-06-25 14:20,This was generated using the Discovery Behavioral Synthetic Data Generator
2,,,2019-06-25 14:20,The script to rerun the data generation can be found in the synthetic scripts folder
3,attribute,customer_id,2019-06-25 14:20,This is a weighted list of customers taken from the synthetic customer file
4,,,2019-06-25 14:20,the two files can be joined by this key
5,,stat*,2019-06-25 14:20,are noise within the file


------------
### Selection, Filter and Typing

In [11]:
# auto clean headers
# the value returned by the clean call is handed to set_cleaner; the cleaners report
# further down lists this step as 'auto_clean_header' with replace_spaces=_
tr.set_cleaner(tr.clean.auto_clean_header(df, inplace=True))

In [12]:
# Automatically drop uninformative columns.
# Presumably null_min / predominant_max are the thresholds (fraction of nulls /
# share of the single most common value) beyond which a column is removed, and
# nulls_list names the values counted as null — TODO confirm.
# stat_0 is 97.5% null in the raw report, i.e. below null_min=0.98.
tr.set_cleaner(
    tr.clean.auto_remove_columns(
        df,
        null_min=0.98,
        predominant_max=0.99,
        inplace=True,
        nulls_list=[''],
    )
)

In [13]:
# Automatically type low-cardinality columns as category.
# In the cleaned report agent (40 unique), complaint (29) and contact (13) come out
# as category, while the int64 flags escalated / referred are left unchanged.
# Presumably unique_max / null_max bound the unique count and null fraction of the
# columns that qualify — TODO confirm.
tr.set_cleaner(
    tr.clean.auto_to_category(
        df,
        unique_max=45,
        null_max=0.7,
        inplace=True,
    )
)

In [14]:
# to remove: drop the 'stat*' noise columns (stat_0 in this file) by header pattern
# NOTE(review): the pattern is unanchored; if it is matched with search semantics it would
# also drop any future column whose name merely contains 'stat' (e.g. 'status') —
# confirm and consider r'^stat_'
tr.set_cleaner(tr.clean.to_remove(df, regex=[r'stat'], inplace=True))

In [15]:
# Type the date columns; day_first=False because call_date samples are month-first
# (e.g. 10-25-2018).
# NOTE(review): `duration` holds clock-style strings (e.g. '12:19'). Typed as a date
# it is reported as datetime64[ns] on 2019-06-25 — apparently the run date — and
# 1.5% of its values become null, presumably the ones that fail to parse.
# Consider a timedelta / numeric representation instead — TODO confirm intent.
tr.set_cleaner(
    tr.clean.to_date_type(
        df,
        headers=['call_date', 'duration'],
        day_first=False,
        inplace=True,
    )
)

In [16]:
# Type the identifier columns as strings: call_id is int64 in the raw report and
# object after cleaning; customer_id stays object.
tr.set_cleaner(
    tr.clean.to_str_type(
        df,
        headers=['call_id', 'customer_id'],
        nulls_list=[''],
        inplace=True,
    )
)

In [17]:
# Cleaners report: one row per registered intent with the parameters recorded for it
# NOTE(review): the rows below are in alphabetical order of intent, not the order the
# cleaners were set above — confirm whether this is also the order in which they run
tr.report_cleaners()

Unnamed: 0,level,intent,parameters
0,0.0,auto_clean_header,replace_spaces=_
1,,auto_remove_columns,"null_min=0.98, predominant_max=0.99, nulls_list=['']"
2,,auto_to_category,"null_max=0.7, unique_max=45"
3,,to_date_type,"headers=['call_date', 'duration'], drop=False, as_num=False, day_first=False, year_first=False"
4,,to_remove,"regex=['stat'], re_ignore_case=False"
5,,to_str_type,"headers=['call_id', 'customer_id'], drop=False, nulls_list=['']"


### Report, Persist and Validation

In [18]:
# Create the Excel data dictionary (to_file=True); the report is also displayed below
tr.canonical_report(df, to_file=True)

# save the clean file so it can be loaded back with load_clean_canonical()
tr.save_clean_canonical(df)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,agent,category,0.0%,5.4%,10000,40,Abbie |Alesha |Alexi |Angie|Anita|April|April |Bonnie |Carly|Carolina|Casey|Cleo|Daphne |Daria |Efa ...
1,call_date,datetime64[ns],0.0%,0.0%,10000,9850,max=2018-10-26 22:59:46 | min=2018-10-01 06:00:03 | yr mean= 2018
2,call_id,object,0.0%,0.0%,10000,10000,Sample: 3842386 | 2354642 | 9520984
3,complaint,category,0.0%,16.5%,10000,29,All points not addressed|Customer payment processed incorrectly|FA advice queried|Fund Performance -...
4,contact,category,0.0%,54.2%,10000,13,Account manager|E-mail|E-mail & Phone Call|Fax|Internet|Letter|Letter & Phone Call|MyPortal|Phone Ca...
5,customer_id,object,0.0%,0.4%,10000,498,Sample: CU_9131158 | CU_9816363 | CU_7641905
6,duration,datetime64[ns],1.5%,0.3%,9848,1289,max=2019-06-25 23:59:00 | min=2019-06-25 00:00:00 | yr mean= 2019
7,escalated,int64,0.0%,95.3%,10000,2,max=1 | min=0 | mean=0.05
8,referred,int64,0.0%,95.1%,10000,2,max=1 | min=0 | mean=0.05


In [19]:
# check the save worked by loading back the clean canonical
# note: `df` is rebound here to the reloaded frame; the report should match the one
# produced before saving (only the random samples in Observations differ)
df = tr.load_clean_canonical()
tr.canonical_report(df)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,agent,category,0.0%,5.4%,10000,40,Abbie |Alesha |Alexi |Angie|Anita|April|April |Bonnie |Carly|Carolina|Casey|Cleo|Daphne |Daria |Efa ...
1,call_date,datetime64[ns],0.0%,0.0%,10000,9850,max=2018-10-26 22:59:46 | min=2018-10-01 06:00:03 | yr mean= 2018
2,call_id,object,0.0%,0.0%,10000,10000,Sample: 5367171 | 9093362 | 7690846
3,complaint,category,0.0%,16.5%,10000,29,All points not addressed|Customer payment processed incorrectly|FA advice queried|Fund Performance -...
4,contact,category,0.0%,54.2%,10000,13,Account manager|E-mail|E-mail & Phone Call|Fax|Internet|Letter|Letter & Phone Call|MyPortal|Phone Ca...
5,customer_id,object,0.0%,0.4%,10000,498,Sample: CU_9541147 | CU_6707408 | CU_7766811
6,duration,datetime64[ns],1.5%,0.3%,9848,1289,max=2019-06-25 23:59:00 | min=2019-06-25 00:00:00 | yr mean= 2019
7,escalated,int64,0.0%,95.3%,10000,2,max=1 | min=0 | mean=0.05
8,referred,int64,0.0%,95.1%,10000,2,max=1 | min=0 | mean=0.05


---------
### Re-Running the Pipeline Contract

In [20]:
# loads the raw canonical, runs the contract pipeline, then saves and returns the clean canonical
# the report below should match the two previous clean reports apart from the random samples
df = tr.refresh_clean_canonical()
tr.canonical_report(df)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,agent,category,0.0%,5.4%,10000,40,Abbie |Alesha |Alexi |Angie|Anita|April|April |Bonnie |Carly|Carolina|Casey|Cleo|Daphne |Daria |Efa ...
1,call_date,datetime64[ns],0.0%,0.0%,10000,9850,max=2018-10-26 22:59:46 | min=2018-10-01 06:00:03 | yr mean= 2018
2,call_id,object,0.0%,0.0%,10000,10000,Sample: 4244082 | 7385962 | 4878807
3,complaint,category,0.0%,16.5%,10000,29,All points not addressed|Customer payment processed incorrectly|FA advice queried|Fund Performance -...
4,contact,category,0.0%,54.2%,10000,13,Account manager|E-mail|E-mail & Phone Call|Fax|Internet|Letter|Letter & Phone Call|MyPortal|Phone Ca...
5,customer_id,object,0.0%,0.4%,10000,498,Sample: CU_3357930 | CU_5778616 | CU_9830948
6,duration,datetime64[ns],1.5%,0.3%,9848,1289,max=2019-06-25 23:59:00 | min=2019-06-25 00:00:00 | yr mean= 2019
7,escalated,int64,0.0%,95.3%,10000,2,max=1 | min=0 | mean=0.05
8,referred,int64,0.0%,95.1%,10000,2,max=1 | min=0 | mean=0.05
