In [1]:
%run ../base_setup.ipynb

Foundation: 1.02.030
Transition: 2.03.076


# Accelerated Machine learning
## Transitioning Contract

In [2]:
# create or retrieve the named Transition instance
tr = Transition.from_env('synthetic_customer')

#### Reset the Contract

In [94]:
tr.reset_transition_contracts()

### Data Source

In [95]:
# # Alternative S3
# tr.set_source_contract(resource='data/synthetic/synthetic_customer.csv', connector_type='dsv',
#                        location='ds-discovery', module_name='ds_connectors.handlers.aws_s3_handlers', handler='S3SourceHandler',
#                        delimiter=',', encoding='latin1')
# tr.set_persist_contract()

In [96]:
# set the source contract
tr.set_source_contract(resource='synthetic_agent.csv', connector_type='csv', location=os.environ['DTU_ORIGIN_PATH'], 
                       module_name=tr.MODULE_NAME, handler=tr.SOURCE_HANDLER, sep=',', encoding='latin1')

In [97]:
tr.report_source()

Unnamed: 0,param,Property Source,Data Source
0,connector_name,pm_data_synthetic_customer,origin_connector
1,resource,config_transition_data_synthetic_customer.yaml,synthetic_agent.csv
2,connector_type,yaml,csv
3,location,/Users/doatridge/code/projects/prod/discovery-transition-ds/jupyter/working/contracts/synthetic_customer,/Users/doatridge/code/projects/prod/discovery-transition-ds/jupyter/working/data/0_raw
4,module_name,ds_discovery.handlers.pandas_handlers,ds_discovery.handlers.pandas_handlers
5,handler,PandasPersistHandler,PandasSourceHandler
6,modified,0,0
7,kwargs,,"sep=',' encoding='latin1'"


In [5]:
tr.MODULE_NAME
tr.HANDLER_SOURCE

'ds_discovery.handlers.pandas_handlers'

'PandasSourceHandler'

### Retrieve & Observations

In [98]:
tr.data_pm.get_connector_handler('origin_connector').supported_types()

['parquet', 'csv', 'tsv', 'txt', 'json', 'pickle', 'xlsx']

In [99]:
# load the source canonical
df = tr.load_source_canonical()

In [113]:
tr.canonical_report(df, stylise=True)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,agent,object,0.0%,5.4%,10000,40,Sample: Darcie | Amalia | Antoinette
1,call_date,object,0.0%,0.0%,10000,9842,Sample: 10-16-2018 07:48:57 | 10-16-2018 08:24:49 | 10-24-2018 10:01:43
2,call_id,int64,0.0%,0.0%,10000,10000,max=9999362 | min=1000157 | mean=5478034.44
3,complaint,object,0.0%,16.0%,10000,29,Sample: The performance of my product was poor | You were not helpful | Unhappy with delay
4,contact,category,0.0%,53.2%,10000,13,Account manager|E-mail|E-mail & Phone Call|Fax|Internet|Letter|Letter & Phone Call|MyPortal|Phone Ca...
5,customer_id,object,0.0%,0.4%,10000,499,Sample: CU_9745282 | CU_8563403 | CU_9949317
6,duration,object,0.0%,0.3%,10000,1384,Sample: 13:47 | 00:34 | 05:30


In [101]:
tr.add_notes(text='The file is a synthetic customer data file created for this demonstration')

In [102]:
tr.add_notes(label='source', text='This was generated using the Discovery Behavioral Synthetic Data Generator')
tr.add_notes(label='source', text='The script to rerun the data generation can be found in the synthetic scripts folder')

In [103]:
tr.add_attribute_notes(attribute='start', text="changing this to start_date so it being a date is obvious")

ValueError: The label 'start' is not in the selection list ['call_id', 'customer_id', 'call_date', 'contact', 'complaint', 'agent', 'duration', 'escalated', 'referred', 'stat_0', 'stat_1', 'stat_2', 'stat_3', 'stat_4', 'stat_5', 'stat_6', 'stat_7', 'stat_8', 'stat_9', 'stat_10', 'stat_11', 'stat_12', 'stat_13', 'stat_14', 'stat_15', 'stat_16', 'stat_17', 'stat_18', 'stat_19', 'stat_20', 'stat_21', 'stat_22', 'stat_23', 'stat_24', 'stat_25', 'stat_26', 'stat_27', 'stat_28', 'stat_29', 'stat_30', 'stat_31', 'stat_32', 'stat_33', 'stat_34', 'stat_35', 'stat_36', 'stat_37', 'stat_38', 'stat_39']

In [48]:
tr.report_notes(regex='With')

Unnamed: 0,section,label,date,text
0,attribute,auto_remove,2019-10-03 09:45,"With >90% predominance removes ['single_cat', 'single_num', 'weight_cat', 'weight_num']"
1,,,2019-10-03 09:45,With >99% nulls removes ['null']


------------
### Selection, Filter and Typing

In [104]:
# auto clean headers
tr.set_cleaner(tr.clean.auto_clean_header(df, rename_map={'start': 'start_date'}, inplace=True))

In [105]:
# dynamically capture what will be removed through the predominant filter
report = tr.canonical_report(df, stylise=False, report_header='%_Dom', condition=">0.90")['Attribute'].tolist()
if len(report) > 0:
    tr.add_notes(note_type='attribute', label='auto_remove', text='With >90% predominance removes {}'.format(report))

# dynamically capture what will be removed through the nulls filter
report = tr.canonical_report(df, stylise=False, report_header='%_Null', condition=">0.99")['Attribute'].tolist()
if len(report) > 0:
    tr.add_notes(note_type='attribute', label='auto_remove', text='With >99% nulls removes {}'.format(report))

# auto remove columns
tr.set_cleaner(tr.clean.auto_remove_columns(df, null_min=0.99, predominant_max=0.90,inplace=True, nulls_list=['']))

In [106]:
# auto type categories
tr.set_cleaner(tr.clean.auto_to_category(df, unique_max=20, null_max=0.7, inplace=True))

In [107]:
# Typing Catagories
tr.set_cleaner(tr.clean.to_category_type(df, headers=['gender', 'profession'], inplace=True))

In [108]:
# Typing Dates 
tr.set_cleaner(tr.clean.to_date_type(df, headers='start_date', day_first=True, inplace=True))

In [109]:
# Type boolean
tr.set_cleaner(tr.clean.to_bool_type(df, bool_map={1: True}, headers='online', inplace=True))

In [110]:
# Type Floats
tr.set_cleaner(tr.clean.to_float_type(df, dtype=['float'], precision=3, inplace=True))

In [111]:
# Type Integers
tr.set_cleaner(tr.clean.to_str_type(df, dtype=['object'], nulls_list=['', 'nan'], inplace=True))

In [112]:
# Cleaners report
tr.report_cleaners()

Unnamed: 0,level,intent,parameters
0,0.0,auto_clean_header,"rename_map={'start': 'start_date'}, replace_spaces=_"
1,,auto_remove_columns,"null_min=0.99, predominant_max=0.9, nulls_list=['']"
2,,auto_to_category,"null_max=0.7, unique_max=20"
3,,to_bool_type,"headers=online, drop=False, bool_map={1: True}"
4,,to_category_type,"headers=['gender', 'profession'], drop=False"
5,,to_date_type,"headers=start_date, drop=False, as_num=False, day_first=True, year_first=False"
6,,to_float_type,"dtype=['float'], exclude=False, fillna=nan, errors=coerce, precision=3"
7,,to_str_type,"dtype=['object'], exclude=False, nulls_list=['', 'nan']"


In [87]:
tr.load_clean_canonical()

### Bulk upload the Attribute Dictionary

In [58]:
# load the Transitioning instance
tr_dict = Transition.from_env('synthetic_customer_dictionary')


In [59]:
tr_dict.report_source()

Unnamed: 0,param,Property Source,Persist Source
0,connector_name,pm_data_synthetic_customer_dictionary,persist_connector
1,resource,config_transition_data_synthetic_customer_dictionary.yaml,transition_synthetic_customer_dictionary_v0.00.pickle
2,connector_type,yaml,pickle
3,location,/Users/doatridge/code/projects/prod/discovery-transition-ds/jupyter/working/contracts/synthetic_customer_dictionary,/Users/doatridge/code/projects/prod/discovery-transition-ds/jupyter/working/contracts/synthetic_customer_dictionary
4,module_name,ds_discovery.handlers.pandas_handlers,ds_discovery.handlers.pandas_handlers
5,handler,PandasPersistHandler,PandasPersistHandler
6,modified,0,0
7,kwargs,,


In [28]:
# get the Transitioned Canonical
df_cust_dict = tr_dict.load_clean_canonical()
tr.canonical_report(df_cust_dict, stylise=False)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations


In [29]:
# bulk upload the dictionary into our notes
tr.upload_notes(df_cust_dict, label_header='attribute', text_header='description', note_type='dictionary', selection=df.columns.to_list())

ValueError: The label header attribute can't be found in the DataFrame

In [None]:
tr.report_notes(stylise=True)

### Report, Persist and Validation

In [30]:

# persist Transitioning Contract just to ensure everything in memory is persisted to disk
tr.persist_contract()

# save the clean file
tr.save_clean_canonical(df)

In [33]:
?tr.discover.analyse_association

[0;31mSignature:[0m [0mtr[0m[0;34m.[0m[0mdiscover[0m[0;34m.[0m[0manalyse_association[0m[0;34m([0m[0mdf[0m[0;34m:[0m[0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m [0mcolumns_list[0m[0;34m:[0m[0mlist[0m[0;34m,[0m [0mexclude_associate[0m[0;34m:[0m[0mlist[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Analyses the association of Category against Values and returns a dictionary of resulting weighting
the structure of the columns_list is a list of dictionaries with the key words
    - label: the label or name of the header in the DataFrame
    - dtype: one of category|number|date indicating the origin of the data
    - chunk_size: if the weighting pattern is over the size of the data the number of chunks
    - replace_zero: if a zero reference is returned it can optionally be replaced with a low probability
and example structure might look like:
    [{'label1':

In [31]:
# check the save worked by loading back the clean canonical
df = tr.load_clean_canonical()
tr.canonical_report(df, stylise=True)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,age,object,15.0%,0.5%,425,422,Sample: 45.393 | 79.16 | 57.258
1,balance,object,0.0%,0.4%,500,496,Sample: 100.81 | 133.6 | 119.28
2,forename,object,0.0%,0.4%,500,499,Sample: Jonas | Habiba | Iva
3,gender,category,0.0%,62.2%,500,2,F|M
4,id,object,0.0%,0.2%,500,500,Sample: CU_2313118 | CU_5601884 | CU_2349662
5,last_login,object,0.0%,0.2%,500,500,Sample: 01-22-19 16:36 | 03-29-19 06:23 | 04-17-19 20:00
6,online,bool,0.0%,100.0%,500,1,False
7,profession,category,10.0%,24.2%,500,16,|Actuary|Assistant Manager|Assistant Professor|Dental Hygienist|Design Engineer|Librarian|Nurse Prac...
8,start_date,datetime64[ns],0.0%,1.4%,500,271,max=2018-12-30 00:00:00 | min=2018-01-01 00:00:00 | yr mean= 2018
9,status,category,0.0%,54.6%,500,4,Active|Closed|Pending|Suspended


---------
### Re-Running the Pipeline Contract

In [60]:
# loads the raw canonical, run the contract pipeline, save and return the clean canonical
df = tr.refresh_clean_canonical()
tr.canonical_report(df)

ModuleNotFoundError: The connector 'persist_connector' has not been set in the property manager