# Testing association of compliance and cholesterol values

We have 164 rows, each representing a person taking a medication. For
each row (each person) we have a pair of measurements. The first is
their percent compliance in taking their prescribed medication. The
second is the decrease in their blood cholesterol. We want to see
whether the observed association between these values is plausible
under the null hypothesis — that there is no association.

In [None]:
# Load the Numpy library for arrays.
import numpy as np
# Load the Pandas library for loading and selecting data.
import pandas as pd
# Plotting library.
import matplotlib.pyplot as plt

# Random number generator, used later for the permutations.
rnd = np.random.default_rng()

# Read the data file and copy the two columns of interest into arrays.
df = pd.read_csv('data/cholost.csv')
compliance = np.array(df['percent_compliance'])
cholesterol = np.array(df['cholesterol_decrease'])

# Short names for the two sequences, matching the description in the text.
x, y = compliance, cholesterol

# Step 1 above: multiply each compliance value by its paired cholesterol
# value, then add up the products to get the observed sum.
actual_prod = np.multiply(x, y)
# Note: actual_sum = 439,141
actual_sum = actual_prod.sum()
# Display the observed sum of products.
actual_sum

In [None]:
# How many permutation trials to run.
n_trials = 10_000

# One slot per trial, to hold that trial's sum of products.
results = np.zeros(n_trials)

# Repeat the experiment n_trials times (step 4 above).
for i in range(n_trials):
    # Step 2 above: a randomly permuted copy of y, which breaks any
    # pairing between the x and y values.
    y_random = rnd.permuted(y)
    # Step 3 above: sum of products for the permuted pairing.
    fake_prod = np.multiply(x, y_random)
    fake_sum = fake_prod.sum()
    # Record this trial's sum.
    results[i] = fake_sum

# Step 5 above: count the trials whose sum is at least as large as the
# observed sum, and express the count as a proportion of all trials.
k = (results >= actual_sum).sum()
kk = k / n_trials

print('Proportion product sums >= observed:', kk)