In [5]:
# Execute the shared setup notebook in this kernel.
# NOTE(review): `Transition`, used below, is not defined or imported in this
# notebook, so it is presumably provided by base_setup — confirm.
%run ../base_setup.ipynb

DTU: 1.07.013
DBU: 1.00.018


## Transitioning
* Create an instance of the `Transition` class, giving it a reference name
* The reference name identifies the unique transitioning pipeline contract

In [6]:
# 'template' is the reference name for this transitioning pipeline contract;
# every later cell works through this `tr` instance.
tr = Transition('template')

## Pipeline Contract
### Data Source

#### Find the files
* Use the discovery `find_file(...)` to explore the names of the raw files

In [3]:
# Search for '.csv' files, starting from the data path held by the file
# property manager (tr.file_pm.data_path).
files = tr.discover.find_file('.csv', root_dir=tr.file_pm.data_path)
# Display the discovery result so a file can be picked in the next cell.
files

* the output DataFrame can be used to directly build the file path

In [None]:
# `files` is the discovery result produced by the find_file cell.
# Fail with a clear message when nothing was found, instead of the opaque
# IndexError that `.item(0)` raises on an empty array.
if len(files) == 0:
    raise FileNotFoundError(
        "find_file('.csv', ...) returned no files; check tr.file_pm.data_path"
    )

# Take the first row of the result as an array, then use its first value
# as the source file for the contract.
line = files.iloc[0:1].values
file = line.item(0)

#### Build the source contract

In [None]:
# Register the chosen file as the source contract: comma-separated CSV,
# read with latin1 encoding.
# NOTE(review): load=False presumably means no data is loaded here, so the
# value bound to `df` may not be a DataFrame; the next cell rebinds `df`
# from get_source_data() — confirm.
df = tr.set_source_contract(file, source_format='csv', sep=',', encoding='latin1', load=False)

In [None]:
# Fetch the source data through the transition instance; `df` is the frame
# that all discovery and cleaning cells below operate on.
df = tr.get_source_data()

## Data Discovery
* use the discovery tools to visually inspect the raw data
* using `filter_columns(...)` helps us isolate rows in large files


#### Data Dictionary

In [None]:
# Note: an empty 'dtype' list shows all types; list specific dtypes to
# narrow the view.
df_filtered = tr.clean.filter_columns(df, dtype=[])
# Display the data dictionary for the filtered view.
tr.discover.data_dictionary(df_filtered)

#### Stats Dictionary

In [None]:
# Rebuild the filtered view (empty dtype list, as in the data dictionary
# cell) and display the stats dictionary for it.
df_filtered = tr.clean.filter_columns(df, dtype=[])
tr.discover.stat_dictionary(df_filtered)

#### Analysis Dictionary

In [None]:
# Rebuild the filtered view (empty dtype list, as in the data dictionary
# cell) and display the analysis dictionary for it.
df_filtered = tr.clean.filter_columns(df, dtype=[])
tr.discover.analysis_dictionary(df_filtered)

## Enrichment
#### Add any Observations
* Include any general observations
* Note any attribute anomalies or points of interest for future reference
* Capture any Terms of Reference

In [None]:
# tr.clean.filter_headers(df, dtype=[])

In [None]:
# Auto clean
# Each step is applied to `df` in place and its return value is passed to
# tr.set_cleaner (presumably recording the step in the pipeline contract —
# confirm).

tr.set_cleaner(tr.clean.clean_header(df, inplace=True))

# Optional, disabled by default: uncomment both lines of a pair together so
# the in-place change and the setting passed to set_cleaner stay in step.
# _ = tr.clean.auto_remove_columns(df, null_min=0.99, inplace=True)
# tr.set_cleaner(tr.clean.get_auto_remove_settings(null_min=0.99))

# _ = tr.clean.auto_to_category(df, unique_max=20, null_max=0.7, inplace=True)
# tr.set_cleaner(tr.clean.get_auto_category_settings(unique_max=20, null_max=0.7))


In [14]:
# Convert categories and dates
# NOTE(review): headers=[''] looks like a template placeholder — replace it
# with the real column names before running.
tr.set_cleaner(tr.clean.to_category_type(df, headers=[''], inplace=True))

tr.set_cleaner(tr.clean.to_date_type(df, headers=[''], inplace=True))


In [None]:
# Tidy up: manually remove any unwanted columns and convert any booleans.
# NOTE(review): headers=[''] looks like a template placeholder — replace it
# with the real column names before running.
tr.set_cleaner(tr.clean.remove_columns(df, headers=[''], inplace=True))

# Values mapped to True, passed as bool_map to the boolean conversion.
# How unlisted values are treated is not visible here — check to_bool_type.
bm = {1: True, 'Y': True}
tr.set_cleaner(tr.clean.to_bool_type(df, bool_map=bm, headers=[''], inplace=True))


In [None]:
# Finally, format any floats, ints and strings.
# The float and int conversions select columns by dtype; the string
# conversion selects by header.

tr.set_cleaner(tr.clean.to_float_type(df, dtype=['float'], inplace=True))

tr.set_cleaner(tr.clean.to_int_type(df, dtype=['int'], inplace=True))

# NOTE(review): headers=[''] looks like a template placeholder — replace it
# with the real column names before running.
tr.set_cleaner(tr.clean.to_str_type(df, headers=[''], inplace=True))

In [None]:
# Create the Excel data dictionary from the cleaned frame
tr.create_data_dictionary(df)

# Save the cleaned frame as the clean file
tr.save_clean_file(df)

In [None]:
# Check the results: reload the saved clean file (rebinding `df`) and
# display its data dictionary.
df = tr.load_clean_file()
tr.discover.data_dictionary(df)

In [None]:
# Add notes to the contract: the first without a key, the second under the
# 'source' key.
tr.data_pm.add_notes('This is dummy file and not representative')
tr.data_pm.add_notes('generated from DTU synthetic file builder', key='source')
# Save the transition instance after the notes have been added.
tr.save()
