# Tutorial 1: Hospital
## 1. Specify Input

In [1]:
# Path of the input CSV file that is loaded into the profiler in section 2.
data_path = 'data/hospital.csv'
# Output location: passed as hm_path to run_graphical_lasso (section 3) and
# joined with the constraint file name in section 4.
out_path = 'results/'
# File name of the constraints file; it is joined with out_path in section 4.
out_dc_name = 'hospital_constraints.txt'
# Passed as the `differences` argument of run_graphical_lasso in section 3.
GLD = True

## 2. Initialize Model and Load Data

In [2]:
from profiler.core import Profiler
from profiler.detector.helper import process_heatmap
import datetime
import os

ModuleNotFoundError: No module named 'gensim'

In [None]:
# Session name: used as the Profiler ID below and as a prefix of the
# constraints file written in section 4.
session = 'demo'

In [None]:
# Create the profiler object used by every following cell.
# NOTE(review): use_db=False presumably runs without a database backend -- TODO confirm.
pf = Profiler(use_db=False, ID=session)

In [None]:
# Load the input CSV (data_path) into the profiler.
# dropna: if set to True, drops every tuple that contains a null value
# dropcol: list of column names to drop
# workers: number of processors used for multiprocessing
# NOTE(review): fillna, use_embedding and embedding_size are not described in
# this tutorial; pf.dataEngine.param (section 5) lists the load_data parameters.
pf.load_data(input_type='file', path=data_path, 
             dropna=False, dropcol=['Address3'], fillna=True,
             use_embedding=True, embedding_size=20, workers=10)

### [Optional] Change the Data Type of Attributes
#### Data Types
* **numeric**: reasons about the differences by examining whether |t1.A - t2.A| / max(differences of all pairs of tuples in A) <= error bound, or by examining a user-defined function f: (R^n, R^n) -> [0,1]^n (see Section 5 for an example of a user-defined function)
* **categorical**: reasons about the differences by examining t1.A == t2.A
* **embeddable_categorical**: reasons about the differences by examining cosine_distance(embed(t1.A), embed(t2.A))

#### Change Data Types
* The example dataset contains noise such as "537x5" as a zip code, so no columns are recognized as numeric. 
* In this case, you have the option to force a column back to numeric. Be careful, though: values like "537x5" will then be recognized as NaN. 
* You can also change the data type to 'embeddable_categorical' if you want to use word embeddings for the attribute. 

In [None]:
# Attribute names and their new data types are given as two parallel lists:
# ZipCode -> numeric (values such as "537x5" will be recognized as NaN),
# PhoneNumber -> categorical, State -> embeddable_categorical (word embedding).
pf.change_dtypes(['ZipCode','PhoneNumber','State'], ['numeric','categorical','embeddable_categorical'])

### Load Embedding

In [None]:
# Load the embedding; presumably this is what 'embeddable_categorical'
# attributes are compared with (cosine distance) -- TODO confirm.
pf.load_embedding()

## 3. Run GLD

In [None]:
# Run the graphical lasso and keep the value it returns in model_run_time.
# save_heatmap: options are 'csv' and 'none'
# alpha_cov: parameter controlling the sparsity of cov in glasso
# alpha_corr: parameter controlling the sparsity of corr in glasso
#             (not passed in this call)
# differences: if set to True, performs the training data transformation
# Parameters for GLD only (require differences to be True):
# sample_frac: fraction of samples taken when creating training data in each iteration
# total_frac: fraction of samples taken from the training data after concatenating all training data
# user_defined: user-defined function for numeric attributes (see section 5)
# error_bound: parameter used by the default function for numeric attributes
# NOTE(review): decompose, undirected, use_cov, use_corr and sort_training_data
# are not described in this tutorial -- check the Profiler documentation.
model_run_time = pf.run_graphical_lasso(hm_path=out_path, save_heatmap='csv',
                                        sample_frac=1, total_frac=1,
                                        alpha_cov=0.005, differences=GLD,
                                        decompose=True, undirected=False,
                                        use_cov=True, use_corr=True,
                                        sort_training_data=True,
                                        error_bound=0.0001, user_defined=None)

In [None]:
# Visualize the 'cov' heatmap produced by the previous cell.
# If no output appears, just rerun this cell.
# NOTE(review): with save=False the filename is presumably unused -- TODO confirm.
pf.visualize_heatmap(pf.heatmap['cov'], save=False, title=None, filename="hospital_heatmap.png")

## 4. Generate FDs

In [None]:
# out_dc: if set to True, writes the DCs to the file given by outfile
#         (here: out_path joined with out_dc_name)
# display: if set to True, prints each FD in the format LHS -> RHS

# above_threshold and topk are mutually exclusive options!
# above_threshold: if set to True, filters the heatmap with a threshold, then outputs FDs
# t0: threshold used when above_threshold is set to True
# topk: if set to True, outputs FDs with up to k attributes on the LHS

space, time, count = process_heatmap(pf.heatmap['cov'], out_dc=False, outfile=os.path.join(out_path, out_dc_name),
                                     normalize=False, take_abs=True, subsets=False,
                                     above_threshold=True, t0=0.3, 
                                     topk=False, k=6,
                                     display=True)

In [None]:
# Same thresholds as the previous cell, but with out_dc=True and subsets=True;
# outfile is out_path joined with the session name, '_' and out_dc_name.
space, time, count = process_heatmap(pf.heatmap['cov'], out_dc=True, outfile=os.path.join(out_path, session + '_' +out_dc_name),
                                     normalize=False, take_abs=True, subsets=True,
                                     above_threshold=True, t0=0.3, 
                                     topk=False, k=6,
                                     display=True)

## 5. Miscellaneous
### Parameters available for load_data

In [None]:
# Display the parameters available for load_data.
pf.dataEngine.param

### Example of user-defined function for numerical attribute
* In the returned list: 
    * if directed is set, 1 means different and 0 means same;
    * if undirected is set, -1 means different and 1 means same.
* There is no need to worry about NaN; it will be handled by the system.

In [None]:
import numpy as np


def func(a, b, error_bound=0.001):
    """Compare two numeric columns by their normalised squared difference.

    The squared difference of every pair is divided by the largest squared
    difference, and the result is compared against ``error_bound``.

    Args:
        a: array of shape (num_samples, 1).
        b: array of shape (num_samples, 1).
        error_bound: largest normalised squared difference that still yields
            True. Defaults to 0.001.

    Returns:
        Boolean array with the shape of ``a - b``; True where the normalised
        squared difference is ``<= error_bound``. Entries whose difference is
        NaN are always False.
    """
    sqr = np.square(a - b)
    # Normalise by the largest non-NaN value. The builtin max() depends on
    # where a NaN sits in the array and raises on empty input.
    finite = sqr[~np.isnan(sqr)]
    peak = finite.max() if finite.size else 0.0
    if peak == 0:
        # Every pair is identical (or there is nothing to compare). Dividing
        # would give 0/0 = NaN and report all pairs as outside the bound.
        return sqr <= error_bound
    return sqr / peak <= error_bound