## Synthetic data builder

In [1]:
# Display the value of every top-level expression in a cell (not only the last one),
# so results do not need to be wrapped in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# matplotlib config: select the inline backend and apply the 'ggplot' style sheet
%matplotlib inline
plt.style.use('ggplot')

# Pandas display setup.
# Every option is addressed by its fully qualified 'display.*' key. pandas resolves
# abbreviated keys (e.g. 'max_colwidth') by pattern matching, which can become
# ambiguous and raise if a similarly named option is ever added.
pd.set_option('display.max_colwidth', 200)       # max characters shown per cell
pd.set_option('display.max_rows', 500)           # max rows printed before truncation
pd.set_option('display.max_columns', 99)         # max columns printed before truncation
pd.set_option('display.expand_frame_repr', True)  # allow wide frames to wrap across lines

In [2]:
from ds_discovery import SyntheticBuilder

In [3]:
# Builder instance shared by the cells below (connector registration and saving).
# NOTE(review): presumably from_memory() keeps the builder's configuration in memory
# rather than persisting it -- confirm against the ds_discovery documentation.
sb = SyntheticBuilder.from_memory()

### Mapping

In [4]:
# Two-column lookup table: 'origin' holds the single-letter headers A-E and
# 'other' holds the replacement name paired with each one, row by row.
headers = ['A', 'B', 'C', 'D', 'E']
replace = ['target', 'num1', 'num2', 'str1', 'str2']
df_mapping = pd.DataFrame(dict(origin=headers, other=replace))

# Register the 'mapping' connector against its CSV location, then hand the
# frame to the builder under that same connector name.
sb.add_connector_uri('mapping', 'source/mapping.csv')
sb.save_canonical('mapping', df_mapping)

### Difference

In [5]:
# Two seven-row sample frames with the same columns (A-E).
# Columns A and B are identical in both; the frames differ only in:
#   C at rows 0, 3 and 6
#   D at rows 3 and 6
#   E at rows 1 and 6
df_origin = pd.DataFrame(
    data={
        "A": ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
        "B": [1, 2, 5, 5, 3, 3, 1],
        "C": [0, 2, 3, 3, 3, 2, 1],
        "D": ['B', 'C', 'A', 'A', 'F', 'E', 'G'],
        "E": ['L', 'L', 'M', 'N', 'J', 'K', 'M'],
    }
)
df_other = pd.DataFrame(
    data={
        "A": ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
        "B": [1, 2, 5, 5, 3, 3, 1],
        "C": [27, 2, 3, 4, 3, 2, 0],
        "D": ['B', 'C', 'A', 'B', 'F', 'E', 'GG'],
        "E": ['L', 'DXL', 'M', 'N', 'J', 'K', 'P'],
    }
)

# Register one connector per sample file.
sb.add_connector_uri('origin', 'source/origin_sample.csv')
sb.add_connector_uri('other', 'source/other_sample.csv')

# Hand each frame to the builder under its connector name.
sb.save_canonical('origin', df_origin)
sb.save_canonical('other', df_other)