In [1]:
# Run the shared setup notebook. `Transition` is used below without being imported
# in this notebook, so it presumably comes from this setup - TODO confirm.
%run ../base_setup.ipynb

Foundation: 1.02.015
Transition: 2.03.026


# Accelerated Machine Learning
## Transitioning Contract

In [2]:
# create or retrieve the named Transition instance;
# `tr` is the object every contract, cleaner and report call below goes through
tr = Transition.from_env('synthetic_customer')

#### Reset the Contract

In [3]:
# reset the transition contracts before they are (re)defined in the cells below
# (presumably discards settings left from a previous run - TODO confirm)
tr.reset_transition_contracts()

### Data Source

In [4]:
# Source contract (S3 variant): a comma-delimited, latin1-encoded file read
# through the S3 source handler. `delimiter` and `encoding` are the extra
# keyword arguments that the source report lists under kwargs.
# NOTE(review): `location` is presumably the bucket name - confirm.
tr.set_source_contract(
    resource='data/synthetic/synthetic_customer.csv',
    connector_type='dsv',
    location='ds-discovery',
    module_name='ds_connectors.handlers.aws_s3_handlers',
    handler='S3SourceHandler',
    delimiter=',',
    encoding='latin1',
)
# Persist contract: called with no arguments, so whatever defaults it has apply
tr.set_persist_contract()

In [5]:
# Alternative source contract (disabled): same calls without the S3 handler arguments.
# Both lines are commented out, so the S3 source contract set in the previous cell is the one in effect.
# tr.set_source_contract(resource='synthetic_customer.dsv', sep=',', encoding='latin1')
# tr.set_persist_contract()

In [6]:
# show the property, data-source and persist connector settings side by side
tr.report_source()

Unnamed: 0,param,Property Source,Data Source,Persist Source
0,connector_name,pm_data_synthetic_customer,origin_connector,persist_connector
1,resource,config_data_synthetic_customer.yaml,data/synthetic/synthetic_customer.csv,transition_synthetic_customer_v0.00.pkl
2,connector_type,yaml,dsv,pickle
3,location,/Users/doatridge/code/projects/prod/discovery-transitioning-utils/jupyter/working/contracts/synthetic_customer,ds-discovery,/Users/doatridge/code/projects/prod/discovery-transitioning-utils/jupyter/working/data/1_transition
4,module_name,ds_discovery.handlers.pandas_handlers,ds_connectors.handlers.aws_s3_handlers,ds_discovery.handlers.pandas_handlers
5,handler,PandasPersistHandler,S3SourceHandler,PandasPersistHandler
6,modified,0,0,0
7,kwargs,,"delimiter=',' encoding='latin1'",


### Retrieve & Observations

In [7]:
# list the connector types the handler behind 'origin_connector' reports as supported
tr.data_pm.get_connector_handler('origin_connector').supported_types()

['csv', 'dsv', 'pickle', 'parquet']

In [8]:
# load the source canonical into `df`; the cleaner cells further down are
# all passed this same `df` with inplace=True
df = tr.load_source_canonical()

In [9]:
# per-attribute profile of the loaded data: dtype, % null, % dominant, count, unique and samples
tr.canonical_report(df, stylise=True)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,age,object,15.0%,0.5%,500,423,Sample: 53.982 | 57.654 | 31.171
1,balance,object,0.0%,0.4%,500,496,Sample: 169.2 | 185.51 | 98.23
2,forename,object,0.0%,0.4%,500,499,Sample: Kyron | Rizwan | Muhammad
3,gender,object,0.0%,62.2%,500,2,Sample: F | M
4,id,object,0.0%,0.2%,500,500,Sample: CU_3823673 | CU_6688783 | CU_3008404
5,last_login,object,0.0%,0.2%,500,500,Sample: 04-15-19 10:26 | 04-14-19 08:29 | 04-08-19 06:54
6,,object,100.0%,0.0%,500,1,Sample:
7,online,object,0.0%,78.8%,500,2,Sample: 0 | 1
8,profession,object,10.0%,24.2%,500,16,Sample: Recruiter | Assistant Professor | Dental Hygienist
9,single cat,object,40.0%,100.0%,500,2,Sample: A |


In [10]:
# general note with no explicit label; the notes report further down lists it under label 'comment'
tr.add_notes(text='The file is a synthetic customer data file created for this demonstration')

In [11]:
# two notes filed under the 'source' label: how the data was generated and where to find the script
tr.add_notes(label='source', text='This was generated using the Discovery Behavioral Synthetic Data Generator')
tr.add_notes(label='source', text='The script to rerun the data generation can be found in the synthetic scripts folder')

In [12]:
# record why 'start' gets renamed; the rename itself is done by the rename_map
# passed to auto_clean_header in the header-cleaning cell
tr.add_attribute_notes(attribute='start', text="changing this to start_date so it being a date is obvious")

In [17]:
# report the notes filtered with the regex 'With'
# NOTE(review): the saved output lists 'auto_remove' notes that are only added by the
# auto-remove cell further down, so this cell was executed out of order. On a fresh
# Restart & Run All the output here will likely differ - confirm, or move this cell
# below the auto-remove step.
tr.report_notes(regex='With')

Unnamed: 0,section,label,date,text
0,attribute,auto_remove,2019-08-20 18:08,"With >90% predominance removes ['single_cat', 'single_num', 'weight_cat', 'weight_num']"
1,,,2019-08-20 18:08,With >99% nulls removes ['null']
2,,start,2019-08-20 18:08,changing this to start_date so it being a date is obvious


------------
### Selection, Filter and Typing

In [11]:
# Clean the column headers in place and rename 'start' to 'start_date';
# the call's result is registered on the contract through set_cleaner.
tr.set_cleaner(
    tr.clean.auto_clean_header(
        df,
        rename_map={'start': 'start_date'},
        inplace=True,
    )
)

In [12]:
# Before anything is dropped, note which attributes the predominance filter reports
report = tr.canonical_report(
    df,
    stylise=False,
    report_header='%_Dom',
    condition=">0.90",
)['Attribute'].tolist()
if report:
    tr.add_notes(
        note_type='attribute',
        label='auto_remove',
        text='With >90% predominance removes {}'.format(report),
    )

# Same capture for the nulls filter
report = tr.canonical_report(
    df,
    stylise=False,
    report_header='%_Null',
    condition=">0.99",
)['Attribute'].tolist()
if report:
    tr.add_notes(
        note_type='attribute',
        label='auto_remove',
        text='With >99% nulls removes {}'.format(report),
    )

# Remove the columns in place; the two thresholds repeat the values used in the
# report conditions above, so keep them in step if either is changed
tr.set_cleaner(
    tr.clean.auto_remove_columns(
        df,
        null_min=0.99,
        predominant_max=0.90,
        inplace=True,
        nulls_list=[''],
    )
)

In [13]:
# Automatically type attributes as category, bounded by unique_max and null_max
tr.set_cleaner(
    tr.clean.auto_to_category(
        df,
        unique_max=20,
        null_max=0.7,
        inplace=True,
    )
)

In [14]:
# Typing categories: explicitly type 'gender' and 'profession' as category
tr.set_cleaner(
    tr.clean.to_category_type(
        df,
        headers=['gender', 'profession'],
        inplace=True,
    )
)

In [15]:
# Typing dates: convert 'start_date' to a date type
# NOTE(review): day_first=True assumes day-before-month strings. The raw 'start'
# values are not shown in this notebook, and the 'last_login' samples look
# month-first - confirm the source format.
tr.set_cleaner(
    tr.clean.to_date_type(
        df,
        headers='start_date',
        day_first=True,
        inplace=True,
    )
)

In [16]:
# Typing booleans: convert 'online' using a map of 1 -> True
tr.set_cleaner(
    tr.clean.to_bool_type(
        df,
        bool_map={1: True},
        headers='online',
        inplace=True,
    )
)

In [17]:
# Typing floats: selection is by dtype ('float') rather than by header, with precision=3
tr.set_cleaner(
    tr.clean.to_float_type(
        df,
        dtype=['float'],
        precision=3,
        inplace=True,
    )
)

In [18]:
# Typing strings: selection is by dtype ('object'); '' and 'nan' are passed as the nulls list
tr.set_cleaner(
    tr.clean.to_str_type(
        df,
        dtype=['object'],
        nulls_list=['', 'nan'],
        inplace=True,
    )
)

In [19]:
# Cleaners report: the intents registered through set_cleaner above, with their parameters
tr.report_cleaners()

Unnamed: 0,level,intent,parameters
0,0.0,auto_clean_header,"rename_map={'start': 'start_date'}, replace_spaces=_"
1,,auto_remove_columns,"null_min=0.99, predominant_max=0.9, nulls_list=['']"
2,,auto_to_category,"null_max=0.7, unique_max=20"
3,,to_bool_type,"headers=online, drop=False, bool_map={1: True}"
4,,to_category_type,"headers=['gender', 'profession'], drop=False"
5,,to_date_type,"headers=start_date, drop=False, as_num=False, day_first=True, year_first=False"
6,,to_float_type,"dtype=['float'], exclude=False, fillna=nan, errors=coerce, precision=3"
7,,to_str_type,"dtype=['object'], exclude=False, nulls_list=['', 'nan']"


### Bulk upload the Attribute Dictionary

In [20]:
# load the Transition instance that holds the attribute dictionary for the customer data
tr_dict = Transition.from_env('synthetic_customer_dictionary')


In [21]:
# show the dictionary instance's connector settings
# NOTE(review): no source contract is set for `tr_dict` in this notebook, so these
# settings presumably come from a contract persisted by an earlier run - confirm.
tr_dict.report_source()

Unnamed: 0,param,Property Source,Data Source,Persist Source
0,connector_name,pm_data_synthetic_customer_dictionary,origin_connector,persist_connector
1,resource,config_data_synthetic_customer_dictionary.yaml,synthetic_customer_dictionary.csv,transition_synthetic_customer_dictionary_v0.00.p
2,connector_type,yaml,csv,pickle
3,location,/Users/doatridge/code/projects/prod/discovery-transitioning-utils/jupyter/working/contracts/synthetic_customer_dictionary,/Users/doatridge/code/projects/prod/discovery-transitioning-utils/jupyter/working/data/0_raw,/Users/doatridge/code/projects/prod/discovery-transitioning-utils/jupyter/working/data/1_transition
4,module_name,ds_discovery.handlers.pandas_handlers,ds_discovery.handlers.pandas_handlers,ds_discovery.handlers.pandas_handlers
5,handler,PandasPersistHandler,PandasSourceHandler,PandasPersistHandler
6,modified,0,1561365596,0
7,kwargs,,"encoding='latin1' sep=','",


In [22]:
# get the transitioned (clean) canonical of the attribute dictionary
# (presumably requires the dictionary transition to have been run and saved beforehand - TODO confirm)
df_cust_dict = tr_dict.load_clean_canonical()
# profile the dictionary frame; the report is called on `tr` but is given the dictionary data
tr.canonical_report(df_cust_dict, stylise=False)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,attribute,object,0.062,0.067,15,15,Sample: start | single num | status
1,description,object,0.0,0.062,16,16,Sample: the prossession of the customer | a single number value | M = M...


In [23]:
# Bulk upload the dictionary into the notes under note_type 'dictionary':
# labels come from the 'attribute' column, text from 'description'.
# `selection` is given the current df column names (presumably restricting the
# upload to attributes still present after cleaning - TODO confirm).
tr.upload_notes(
    df_cust_dict,
    label_header='attribute',
    text_header='description',
    note_type='dictionary',
    selection=df.columns.to_list(),
)

In [24]:
# full notes report: general notes, attribute notes and the uploaded dictionary entries
tr.report_notes(stylise=True)

Unnamed: 0,section,label,date,text
0,notes,comment,2019-08-20 18:08,The file is a synthetic customer data file created for this demonstration
1,,source,2019-08-20 18:08,This was generated using the Discovery Behavioral Synthetic Data Generator
2,,,2019-08-20 18:08,The script to rerun the data generation can be found in the synthetic scripts folder
3,attribute,auto_remove,2019-08-20 18:08,"With >90% predominance removes ['single_cat', 'single_num', 'weight_cat', 'weight_num']"
4,,,2019-08-20 18:08,With >99% nulls removes ['null']
5,,start,2019-08-20 18:08,changing this to start_date so it being a date is obvious
6,dictionary,age,2019-08-20 18:08,Customer age
7,,balance,2019-08-20 18:08,The current customer balance
8,,forename,2019-08-20 18:08,customer forename
9,,gender,2019-08-20 18:08,"M = Male, F = Female"


### Report, Persist and Validation

In [25]:

# persist Transitioning Contract just to ensure everything in memory is persisted to disk
tr.persist_contract()

# save the clean file; `df` is the frame that was passed with inplace=True to each cleaner above
tr.save_clean_canonical(df)

In [26]:
# check the save worked by loading back the clean canonical
# (note: this rebinds `df` to the loaded copy, replacing the in-memory frame)
df = tr.load_clean_canonical()
# profile the reloaded data so the dtypes can be compared with the raw report earlier
tr.canonical_report(df, stylise=True)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,age,float64,15.0%,0.5%,425,422,max=85.381 | min=20.468 | mean=47.12
1,balance,float64,0.0%,0.4%,500,496,max=992.79 | min=31.57 | mean=187.96
2,forename,object,0.0%,0.4%,500,499,Sample: Emanuel | Letitia | Terence
3,gender,category,0.0%,62.2%,500,2,F|M
4,id,object,0.0%,0.2%,500,500,Sample: CU_9812468 | CU_3196880 | CU_6542121
5,last_login,object,0.0%,0.2%,500,500,Sample: 03-12-19 01:23 | 01-18-19 11:25 | 03-28-19 02:08
6,online,bool,0.0%,78.8%,500,2,True | False
7,profession,category,10.0%,24.2%,450,15,Actuary|Assistant Manager|Assistant Professor|Dental Hygienist|Design Engineer|Librarian|Nurse Pract...
8,start_date,datetime64[ns],0.0%,1.4%,500,271,max=2018-12-30 00:00:00 | min=2018-01-01 00:00:00 | yr mean= 2018
9,status,category,0.0%,54.6%,500,4,Active|Closed|Pending|Suspended


---------
### Re-Running the Pipeline Contract

In [27]:
# loads the raw canonical, runs the contract pipeline, saves and returns the clean canonical
# (`df` is rebound to the returned frame)
df = tr.refresh_clean_canonical()
# profile the refreshed data; it should match the report from the reload check above
tr.canonical_report(df)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,age,float64,15.0%,0.5%,425,422,max=85.381 | min=20.468 | mean=47.12
1,balance,float64,0.0%,0.4%,500,496,max=992.79 | min=31.57 | mean=187.96
2,forename,object,0.0%,0.4%,500,499,Sample: Kurtis | Jarrod | Kyan
3,gender,category,0.0%,62.2%,500,2,F|M
4,id,object,0.0%,0.2%,500,500,Sample: CU_3662615 | CU_9775428 | CU_8515098
5,last_login,object,0.0%,0.2%,500,500,Sample: 04-16-19 08:32 | 04-16-19 00:33 | 03-25-19 15:36
6,online,bool,0.0%,78.8%,500,2,True | False
7,profession,category,10.0%,24.2%,450,15,Actuary|Assistant Manager|Assistant Professor|Dental Hygienist|Design Engineer|Librarian|Nurse Pract...
8,start_date,datetime64[ns],0.0%,1.4%,500,271,max=2018-12-30 00:00:00 | min=2018-01-01 00:00:00 | yr mean= 2018
9,status,category,0.0%,54.6%,500,4,Active|Closed|Pending|Suspended
