# The DoWhy package provides several utilities for synthesizing data.
### Knowing how data is generated is a good starting point when learning a new methodology or algorithm.

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import dowhy.datasets 

# Define auxiliary functions

In [None]:
def plot_gml(gml_graph):
    """Draw the causal graph described by a GML string.

    The GML text is parsed into a networkx graph, laid out with a spring
    layout and rendered with labelled dark-orange nodes.

    Returns whatever ``plt.show()`` returns.
    """
    graph = nx.parse_gml(gml_graph)
    layout = nx.spring_layout(graph)
    nx.draw_networkx(
        graph,
        pos=layout,
        with_labels=True,
        node_size=1000,
        node_color="darkorange",
    )
    return plt.show()

In [None]:
def describe_synthetic_data(synthetic_data):
    """Print a summary of a synthetic dataset and return its DataFrame.

    Parameters
    ----------
    synthetic_data : dict
        Dictionary holding the keys read below: ``"df"``, ``"gml_graph"``,
        ``"treatment_name"``, ``"outcome_name"``, ``"common_causes_names"``,
        ``"instrument_names"``, ``"effect_modifier_names"`` and
        ``"frontdoor_variables_names"``.

    Returns
    -------
    The object stored under ``synthetic_data["df"]``. The function prints a
    ``Head`` banner last but does not print the rows itself; callers are
    expected to display ``.head()`` of the returned value.
    """
    # Identity check: only skip the plot when no graph was provided at all.
    if synthetic_data['gml_graph'] is not None:
        plot_gml(synthetic_data["gml_graph"])
    synthetic_data_df = synthetic_data["df"]

    print('------- Variables --------')
    print('Treatment vars:', synthetic_data['treatment_name'])
    print('Outcome vars:', synthetic_data['outcome_name'])
    print('Common causes vars:', synthetic_data['common_causes_names'])
    print('Instrument vars:', synthetic_data['instrument_names'])
    print('Effect Modifier vars:', synthetic_data['effect_modifier_names'])
    print('Frontdoor vars:', synthetic_data['frontdoor_variables_names'])
    # The outcome name is reported once, under its own label only.

    print('-------- Corr -------')
    print(synthetic_data_df.corr())
    print('------- Head --------')
    return synthetic_data_df

# linear_dataset 

```
 dowhy.datasets.linear_dataset(beta,
                              num_common_causes,
                              num_samples,
                              num_instruments=0,
                              num_effect_modifiers=0,
                              num_treatments=1,
                              num_frontdoor_variables=0,
                              treatment_is_binary=True,
                              treatment_is_category=False,
                              outcome_is_binary=False,
                              stochastic_discretization=True,
                              num_discrete_common_causes=0,
                              num_discrete_instruments=0,
                              num_discrete_effect_modifiers=0,
                              stddev_treatment_noise=1,
                              stddev_outcome_noise=0.01,
                              one_hot_encode=False
                              )
```

```
Outputs a dictionary
{
        "df": data,
        "treatment_name": treatments,
        "outcome_name": outcome,
        "common_causes_names": common_causes,
        "instrument_names": instruments,
        "effect_modifier_names": effect_modifiers,
        "frontdoor_variables_names": frontdoor_variables,
        "dot_graph": dot_graph,
        "gml_graph": gml_graph,
        "ate": ate
}
```

```
The function generates a data set with [num_samples] records.
v variables - are the treatments, where abs(*beta*) defines their magnitude if continuous
y - is the outcome variable where abs(*beta*) defines its magnitude. Basically:
 y = normal(0, stddev_outcome_noise) + t @ beta [where @ is numpy matrix multiplication, allowing beta to be a vector]

The W variables are common causes of both the treatment and the outcome and are iid. If continuous, they are Norm(mu = Unif(-1,1), sigma = 1)

Instrument variables are labeled Z and each one affects all treatments, i.e. if there is one instrument and two treatments then z0->v0, z0->v1

X are the effect modifiers. Their number should be 0 or equal to the number of treatments (bug). They are Norm(mu = Unif(-1,1), sigma = 1)


Quartiles are used when discretized variables are specified. They can be one-hot encoded.

## *beta* - defines the causal effect of interest. It affects the magnitude of the values of the continuous treatment and outcome ('y') variables, and thus also the magnitude of the resulting ate.
### In most cases the absolute value is considered

In [None]:
# Create a dataset with 10 observations: one binary treatment and a
# continuous outcome, both affected by a single common cause.
synthetic_data = dowhy.datasets.linear_dataset(
    beta=100,
    num_common_causes=1,
    num_samples=10,
)
describe_synthetic_data(synthetic_data).head()

In [None]:
# Two continuous treatments, no common cause, one instrumental variable and
# two effect modifiers - linearly added appropriately.
synthetic_data = dowhy.datasets.linear_dataset(
    # Effect size and sample count
    beta=100,
    num_samples=20,
    # Number of variables per role
    num_common_causes=0,
    num_instruments=1,
    num_effect_modifiers=2,
    num_treatments=2,
    num_frontdoor_variables=0,
    # Variable types
    treatment_is_binary=False,
    treatment_is_category=False,
    outcome_is_binary=False,
    # Discretization
    stochastic_discretization=True,
    num_discrete_common_causes=0,
    num_discrete_instruments=0,
    num_discrete_effect_modifiers=0,
    one_hot_encode=False,
    # Noise levels
    stddev_treatment_noise=1,
    stddev_outcome_noise=0.01,
)
describe_synthetic_data(synthetic_data).head()

In [None]:
# Hot Encoding
synthetic_data = dowhy.datasets.linear_dataset(
                              beta                          = 100,
                              num_common_causes             =   2,
                              num_samples                   =  20,
                              num_instruments               =   1,
                              num_effect_modifiers          =   1, 
                              num_treatments                =   1,
                              num_frontdoor_variables       =   1,
                              treatment_is_binary           = False,
                              treatment_is_category         = False,
                              outcome_is_binary             = False,
                              stochastic_discretization     = True,
                              num_discrete_common_causes    =   1, #of the total num_common_causes
                              num_discrete_instruments      =   1,
                              num_discrete_effect_modifiers =   1,
                              stddev_treatment_noise        =   1,
                              stddev_outcome_noise          =  0.01,
                              one_hot_encode                = True
                                               )
describe_synthetic_data(synthetic_data).head()