# Causality

In [None]:
from datascience import *
%matplotlib inline
path_data = '../../../assets/data/'
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
import warnings

# NumPy 2.0 removed the top-level alias np.VisibleDeprecationWarning; the class
# lives in np.exceptions (added in NumPy 1.25). Resolve it from whichever
# location the installed NumPy provides so this cell runs on old and new versions.
try:
    visible_deprecation_warning = np.exceptions.VisibleDeprecationWarning
except AttributeError:
    visible_deprecation_warning = np.VisibleDeprecationWarning

# Ignore this warning category for the rest of the session.
warnings.simplefilter(action='ignore', category=visible_deprecation_warning)

## Comparing Two Samples ##

In [None]:
# Read baby.csv into a Table bound to `births`.
# NOTE(review): the file name is relative and `path_data` from the setup cell is
# not used here — confirm that baby.csv is expected next to the notebook.
births = Table.read_table('baby.csv')

In [None]:
# Display the loaded table for inspection.
births

In [None]:
# Keep only the two columns used in this comparison: the group label
# ('Maternal Smoker') and the numerical variable ('Birth Weight').
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')

In [None]:
# Group rows by 'Maternal Smoker'. No aggregation function is passed;
# presumably the result lists the number of rows in each group.
smoking_and_birthweight.group('Maternal Smoker')

In [None]:
# Histogram of 'Birth Weight', drawn separately for each 'Maternal Smoker' group.
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')

# Test Statistic

[Question] What values of our statistic are in favor of the alternative: positive or negative?

In [None]:
# Average the remaining column within each 'Maternal Smoker' group,
# then display the resulting table.
means_table = smoking_and_birthweight.group('Maternal Smoker', np.average)
means_table

In [None]:
# Column index 1 of means_table holds the group averages
# (presumably column 0 holds the group labels).
means = means_table.column(1)
# Test statistic: mean of the second listed group minus mean of the first,
# in the order the groups appear in means_table.
observed_difference = means.item(1) - means.item(0)
observed_difference

In [None]:
def difference_of_means(table, group_label, value_label='Birth Weight'):
    """Return the difference between the mean values of two groups.

    Takes:
        table: a Table containing the columns named by the two labels below
        group_label: label of the column that indicates which group
            each row relates to
        value_label: label of the numerical column to average; defaults to
            'Birth Weight', which reproduces the original behavior
    Returns: mean of the group listed second in the grouped table minus the
        mean of the group listed first. Only the first two groups are used,
        so the grouping column is expected to have exactly two distinct values.
    """

    # table with the two relevant columns
    reduced = table.select(value_label, group_label)

    # table containing one row per group with that group's mean
    means_table = reduced.group(group_label, np.average)

    # array of group means; the averaged column is labeled '<value_label> average'
    means = means_table.column(value_label + ' average')

    return means.item(1) - means.item(0)

In [None]:
# Apply the function to the full table with the real labels; this repeats the
# same computation that produced observed_difference.
difference_of_means(births, 'Maternal Smoker')

# Random Permutation (Shuffling)

In [None]:
# Small four-row table (names and ages) used to demonstrate sampling and shuffling.
staff = Table().with_columns(
    'Names', make_array('Jim', 'Pam', 'Dwight', 'Michael'),
    'Ages', make_array(29, 28, 34, 41)
)

In [None]:
# Sample rows using the library defaults.
# NOTE(review): presumably this samples with replacement, so rows can repeat —
# contrast with the explicit with_replacement=False call in the next cell.
staff.sample()

In [None]:
# Sample without replacement; presumably every row appears exactly once in a
# random order, i.e. the rows are shuffled.
staff.sample(with_replacement = False)

In [None]:
# Shuffle the rows and keep only the 'Names' column from the shuffled result.
shuffled_names = staff.sample(with_replacement = False).column('Names')
# Show the shuffled names next to the original rows; the result is displayed
# only, `staff` itself is not reassigned.
staff.with_column('Shuffled', shuffled_names)

# Simulation Under Null Hypothesis

In [None]:
# Display the two-column table again before shuffling its labels.
smoking_and_birthweight

In [None]:
# Shuffle the rows and read off the 'Maternal Smoker' column, giving the
# original labels in a random order.
shuffled_labels = smoking_and_birthweight.sample(with_replacement=False
                                                ).column('Maternal Smoker')

In [None]:
# Attach the shuffled labels as a third column next to the original label and
# birth weight columns.
original_and_shuffled = smoking_and_birthweight.with_column(
    'Shuffled Label', shuffled_labels
)

In [None]:
# Display original and shuffled labels side by side.
original_and_shuffled

In [None]:
# Statistic computed with the shuffled labels: one simulated value under the
# null hypothesis.
difference_of_means(original_and_shuffled, 'Shuffled Label')

In [None]:
# Statistic computed with the original labels, for comparison with the
# shuffled-label value.
difference_of_means(original_and_shuffled, 'Maternal Smoker')

# Permutation Test

In [None]:
def one_simulated_difference_of_means():
    """Simulate one value of the test statistic with shuffled labels.

    Reads the notebook-level table `births`.
    Returns: Difference of mean birth weights between the two groups
    obtained after shuffling the 'Maternal Smoker' labels"""

    # shuffle the rows of births, then read the label column in its new order
    permuted_rows = births.sample(with_replacement=False)
    permuted_labels = permuted_rows.column('Maternal Smoker')

    # original birth weights paired with the shuffled labels
    weights_only = births.select('Birth Weight')
    relabeled = weights_only.with_column('Shuffled Label', permuted_labels)

    return difference_of_means(relabeled, 'Shuffled Label')

In [None]:
# One simulated value; each call reshuffles the labels, so the result varies
# from run to run.
one_simulated_difference_of_means()

In [None]:
# Collect 2500 simulated differences under the null hypothesis.
# The result array is allocated once and filled in place: growing it with
# np.append would copy the whole array on every iteration, making the loop
# quadratic in the number of repetitions.
differences = np.zeros(2500)

for i in np.arange(len(differences)):
    new_difference = one_simulated_difference_of_means()
    differences[i] = new_difference

In [None]:
# Histogram of the simulated differences.
Table().with_column('Difference Between Group Means', differences).hist()
# Print the observed statistic so it can be compared with the histogram's range.
print('Observed Difference:', observed_difference)
# Title applies to the current figure, i.e. the histogram drawn above.
plots.title('Prediction Under the Null Hypothesis');

## Randomized Controlled Experiment ##

In [None]:
# Read bta.csv into a Table bound to `rct`; the file name is relative, like baby.csv above.
rct = Table.read_table('bta.csv')
# show() is called with no row limit — presumably this displays every row.
rct.show()

In [None]:
# Cross-tabulate 'Group' against 'Result'.
# NOTE(review): presumably one row per 'Group' value, one column per 'Result'
# value, and cells holding row counts — confirm against the Table.pivot docs.
rct.pivot('Result', 'Group')

In [None]:
# Average of the other column(s) within each 'Group'. The docstring of
# difference_of_proportions below treats 'Result' as 0/1, in which case this
# average is the proportion of 1's per group.
rct.group('Group', np.average)

## Testing the Hypotheses ##

In [None]:
# Shuffle the rows of rct and read off the 'Group' column.
# Note: this rebinds `shuffled_labels`, which previously held the shuffled
# 'Maternal Smoker' labels.
shuffled_labels = rct.sample(with_replacement=False).column('Group')

In [None]:
# Attach the shuffled group labels next to the original columns and display.
# Note: this rebinds `original_and_shuffled` from the birth weight example.
original_and_shuffled = rct.with_column('Shuffled Label', shuffled_labels)
original_and_shuffled

In [None]:
# Average 'Result' within each original 'Group'.
original_and_shuffled.select('Result', 'Group').group(
    'Group', np.average)

In [None]:
# Average 'Result' within each shuffled label, for comparison with the
# original-group averages.
original_and_shuffled.select('Result', 'Shuffled Label').group(
    'Shuffled Label', np.average)

In [None]:
def difference_of_proportions(table, group_label, value_label='Result'):
    """Return the difference between the proportions of 1's in two groups.

    Takes:
        table: a Table containing the columns named by the two labels below
        group_label: label of the column that indicates which group
            each row relates to
        value_label: label of the column of 0's and 1's to average; defaults
            to 'Result', which reproduces the original behavior
    Returns: proportion for the group listed second in the grouped table minus
        the proportion for the group listed first. Only the first two groups
        are used, so the grouping column is expected to have exactly two
        distinct values.
    """

    # table with the two relevant columns
    reduced = table.select(value_label, group_label)

    # table containing one row per group; the average of a 0/1 column is the
    # proportion of 1's
    proportions_table = reduced.group(group_label, np.average)

    # array of group proportions; the averaged column is labeled
    # '<value_label> average'
    proportions = proportions_table.column(value_label + ' average')

    return proportions.item(1) - proportions.item(0)

In [None]:
# Observed test statistic: difference of proportions using the real group labels.
observed_diff = difference_of_proportions(rct, 'Group')
observed_diff

In [None]:
def one_simulated_difference_of_proportions():
    """Simulate one value of the test statistic with shuffled labels.

    Reads the notebook-level table `rct`.
    Returns: Difference of proportions of 1's between the two groups
    obtained after shuffling the 'Group' labels"""

    # shuffle the rows of rct, then read the label column in its new order
    permuted_rows = rct.sample(with_replacement=False)
    permuted_labels = permuted_rows.column('Group')

    # original results paired with the shuffled labels
    results_only = rct.select('Result')
    relabeled = results_only.with_column('Shuffled Label', permuted_labels)

    return difference_of_proportions(relabeled, 'Shuffled Label')

In [None]:
# One simulated value; each call reshuffles the labels, so the result varies
# from run to run.
one_simulated_difference_of_proportions()

In [None]:
# Collect 20000 simulated differences under the null hypothesis.
# The result array is allocated once and filled in place: growing it with
# np.append would copy the whole array on every iteration, making the loop
# quadratic in the number of repetitions.
simulated_diffs = np.zeros(20000)

for i in np.arange(len(simulated_diffs)):
    sim_diff = one_simulated_difference_of_proportions()
    simulated_diffs[i] = sim_diff

In [None]:
# Histogram of the simulated differences with the observed value marked.
col_name = 'Difference between Treatment and Control'
# Bin edges run from -0.6 in steps of 0.15; the stop value 0.81 lies just past
# 0.75 so that 0.75 is the last edge.
Table().with_column(col_name, simulated_diffs).hist(bins=np.arange(-0.6, 0.81, 0.15))
# Lower y-limit is slightly below zero — presumably so the marker drawn at
# y=0 is not clipped by the axis.
plots.ylim(-0.1, 2)
# Red dot on the horizontal axis at the observed difference.
plots.scatter(observed_diff, 0, color='red', s=40);

In [None]:
# p-value: fraction of simulated differences that are at least as large as the
# observed difference (upper tail only). Summing the boolean comparison array
# counts its True entries.
sum(simulated_diffs >= observed_diff)/len(simulated_diffs)