In [19]:
# Execute the shared setup notebook inside this kernel before anything else runs.
# NOTE(review): `Transition` is used below but never imported in the visible cells —
# presumably base_setup provides it; confirm.
%run ../base_setup.ipynb

DTU: 1.08.031
DBU: 1.01.003


# Accelerated Machine Learning
## Transitioning Contract

In [20]:
# Create the Transition instance that every later cell uses as `tr`.
# NOTE(review): 'synthetic' is presumably the name of the transitioning contract — confirm.
tr = Transition('synthetic')

### Data Source

In [21]:
# Describe the CSV source to the transition, one option per line.
# load=False is passed here; the frame itself is fetched in the next cell
# through tr.get_source_data().
tr.set_source_contract(
    source_name='synthetic.csv',
    source_format='csv',
    sep=',',
    encoding='latin1',
    load=False,
)

### Retrieval & Observations

In [22]:
# Fetch the source data registered via set_source_contract and keep it as `df`,
# the frame that the cleaning cells below operate on.
df = tr.get_source_data()

In [23]:
# Display a per-attribute summary of the raw frame (type, % nulls, count, unique, observations)
# as the baseline to compare against after cleaning.
tr.discover.data_dictionary(df)

Unnamed: 0,Attribute,Type,% Nulls,Count,Unique,Observations
0,age,float64,0.15,4250,4040,max=89.603 | min=20.003 | mean=46.38
1,balance,float64,0.0,5000,4382,max=979.54 | min=8.72 | mean=185.9
2,forename,object,0.0,5000,3925,Sample: Eloise | Enya | Carl
3,gender,object,0.0,5000,2,Sample: M | F
4,id,object,0.0,5000,5000,Sample: CU_7162557 | CU_2028014 | CU_7297233
5,,float64,1.0,0,0,max=nan | min=nan | mean=nan
6,online,float64,0.35,3250,2,max=1.0 | min=0.0 | mean=0.21
7,profession,object,0.1,4500,15,Sample: Human Resources Assistant IV | Safety Technician III | Informat...
8,single cat,object,0.4,3000,1,Sample: A
9,single num,float64,0.2,4000,1,max=1.0 | min=1.0 | mean=1.0


In [24]:
# Add a general note; no label is passed, and the notes report lists this entry under 'notes'.
tr.notes_add(text='The file is a synthetic data file created for this demonstration')

In [25]:
# Record the provenance notes, both filed under the shared 'source' label.
for source_note in (
    'This was generated using the Discovery Behavioral Synthetic Data Generator',
    'The script to rerun the data generation can be found in the synthetic scripts folder',
):
    tr.notes_add(label='source', text=source_note)

In [26]:
# Attribute-level notes as (label, text) pairs, added in their original order.
# NOTE(review): the data dictionary output above lists 'single cat' (one unique value, 'A'),
# not 'weight_cat' — confirm which attribute name these two notes should carry.
attribute_notes = [
    ('attr: null', "Here for demo of removal of nulls"),
    ('attr: weight_cat', "Demonstration of removal of columns with predominant values"),
    ('attr: weight_cat', "the value 'A' is over 95% predominant"),
    ('attr: start', "changing this to start_date so it being a date is obvious"),
]
for note_label, note_text in attribute_notes:
    tr.notes_add(label=note_label, text=note_text)

In [27]:
# Show every note recorded so far as a table of label, date and text.
tr.notes_report(stylise=True)

Unnamed: 0,label,date,text
0,attr: null,2019-05-23 14:15,Here for demo of removal of nulls
1,attr: start,2019-05-23 14:15,changing this to start_date so it being a date is obvious
2,attr: weight_cat,2019-05-23 14:15,Demonstration of removal of columns with predominant values
3,,2019-05-23 14:15,the value 'A' is over 95% predominant
4,notes,2019-05-23 14:15,The file is a synthetic data file created for this demonstration
5,source,2019-05-23 14:15,This was generated using the Discovery Behavioral Synthetic Data Generator
6,,2019-05-23 14:15,The script to rerun the data generation can be found in the synthetic scripts folder


------------
### Selection, Filter and Typing

In [28]:
# Run auto_clean_header on df (inplace=True) with 'start' mapped to 'start_date',
# then hand whatever the cleaner returns to the transition via set_cleaner.
header_clean_result = tr.clean.auto_clean_header(
    df,
    rename_map={'start': 'start_date'},
    inplace=True,
)
tr.set_cleaner(header_clean_result)

In [29]:
# Run auto_remove_columns on df (inplace=True) and register the result with the transition.
# Thresholds: null_min=0.99 and predominant_max=0.90; '' is passed in nulls_list.
# Presumably this is the step that drops the all-null column and the single-valued
# columns, which are absent from the canonical data dictionary further down — confirm.
column_removal_result = tr.clean.auto_remove_columns(
    df,
    null_min=0.99,
    predominant_max=0.90,
    inplace=True,
    nulls_list=[''],
)
tr.set_cleaner(column_removal_result)

In [30]:
# Run auto_to_category on df (inplace=True) with unique_max=20 and null_max=0.7,
# and register the result with the transition.
auto_category_result = tr.clean.auto_to_category(
    df,
    unique_max=20,
    null_max=0.7,
    inplace=True,
)
tr.set_cleaner(auto_category_result)

In [31]:
# Typing categories: 'gender' and 'profession' are typed explicitly by header name.
category_headers = ['gender', 'profession']
category_type_result = tr.clean.to_category_type(
    df,
    headers=category_headers,
    inplace=True,
)
tr.set_cleaner(category_type_result)

# Typing dates: 'start_date' is the name given to 'start' in the header-cleaning cell.
date_type_result = tr.clean.to_date_type(
    df,
    headers='start_date',
    inplace=True,
)
tr.set_cleaner(date_type_result)


In [32]:
# Type 'online' as boolean using the mapping {1: True}, and register the result.
# NOTE(review): 'online' shows 35% nulls in the source data dictionary but 0% nulls as bool
# in the canonical one — missing values appear to end up as False; confirm this is intended.
bool_type_result = tr.clean.to_bool_type(
    df,
    bool_map={1: True},
    headers='online',
    inplace=True,
)
tr.set_cleaner(bool_type_result)

In [33]:
# Float typing first: columns selected by dtype 'float', with precision=3.
float_type_result = tr.clean.to_float_type(
    df,
    dtype=['float'],
    precision=3,
    inplace=True,
)
tr.set_cleaner(float_type_result)

# String typing second: columns selected by dtype 'object'.
str_type_result = tr.clean.to_str_type(
    df,
    dtype=['object'],
    inplace=True,
)
tr.set_cleaner(str_type_result)

### Report, Persist and Validate

In [34]:
# Create the Excel data dictionary for the cleaned frame
tr.create_data_dictionary(df)

# Save the cleaned frame as the canonical file; the next cell reads it back with load_canonical()
tr.save_canonical(df)

In [35]:
# Check the results: reload the saved canonical (this rebinds `df`) and display its
# data dictionary to compare types and null ratios with the raw source summary.
df = tr.load_canonical()
tr.discover.data_dictionary(df)

Unnamed: 0,Attribute,Type,% Nulls,Count,Unique,Observations
0,age,float64,0.15,4250,4040,max=89.603 | min=20.003 | mean=46.38
1,balance,float64,0.0,5000,4382,max=979.54 | min=8.72 | mean=185.9
2,forename,object,0.0,5000,3925,Sample: Marilyn | Edwina | Morton
3,gender,category,0.0,5000,2,F|M
4,id,object,0.0,5000,5000,Sample: CU_4094402 | CU_9497709 | CU_6771030
5,online,bool,0.0,5000,2,True | False
6,profession,category,0.1,4500,15,Accountant I|Assistant Professor|Data Coordiator|Developer I|Food Chemi...
7,start_date,datetime64[ns],0.0,5000,364,max=2018-12-30 00:00:00 | min=2018-01-01 00:00:00 | yr mean= 2018
8,surname,object,0.0,5000,5000,Sample: Lambdin | Sardina | Faulkenberry


---------
### Re-Running the Pipeline Contract

In [36]:
# Rebuild the canonical frame through tr.refresh_canonical() (rebinding `df`) and display
# its data dictionary for comparison with the saved canonical.
# NOTE(review): the recorded output for this cell lists 'start' as object, whereas the saved
# canonical lists 'start_date' as datetime64[ns] — the refreshed run does not appear to
# reproduce the header rename / date typing; investigate before relying on a re-run.
df = tr.refresh_canonical()

tr.discover.data_dictionary(df)

Unnamed: 0,Attribute,Type,% Nulls,Count,Unique,Observations
0,age,float64,0.15,4250,4040,max=89.603 | min=20.003 | mean=46.38
1,balance,float64,0.0,5000,4382,max=979.54 | min=8.72 | mean=185.9
2,forename,object,0.0,5000,3925,Sample: Gerald | Bianka | Sofia
3,gender,category,0.0,5000,2,F|M
4,id,object,0.0,5000,5000,Sample: CU_3701874 | CU_3248979 | CU_1253338
5,online,bool,0.0,5000,2,True | False
6,profession,category,0.1,4500,15,Accountant I|Assistant Professor|Data Coordiator|Developer I|Food Chemi...
7,start,object,0.0,5000,364,Sample: 12-05-18 | 10-22-18 | 06-24-18
8,surname,object,0.0,5000,5000,Sample: Climes | Pearyer | Wene
