# Sum of squared differences for pig rations

Here we are working on the question: are the observed differences in
measured weight gain between the different pig rations greater than we
would expect to see as a result of random sampling in the null-world?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Make the random number generator used for shuffling further down.
rnd = np.random.default_rng()

# Read the pig rations data file into a data frame.
rations_df = pd.read_csv('data/pig_rations.csv')
# Display the top five rows as the cell output.
rations_df.head(n=5)

In [None]:
# Pull out the weight gains for each ration as separate NumPy arrays.
# The ration label column is looked up once and reused for every mask.
ration_col = rations_df['ration']

# Ration A
a_rows = rations_df.loc[ration_col == 'A']
a_weights = np.array(a_rows['weight_gain'])

# Ration B
b_rows = rations_df.loc[ration_col == 'B']
b_weights = np.array(b_rows['weight_gain'])

# Ration C
c_rows = rations_df.loc[ration_col == 'C']
c_weights = np.array(c_rows['weight_gain'])

# Ration D
d_rows = rations_df.loc[ration_col == 'D']
d_weights = np.array(d_rows['weight_gain'])

# Join the four arrays end to end, in A, B, C, D order.
all_weights = np.concatenate((a_weights, b_weights, c_weights, d_weights))
# Display the combined array as the cell output.
all_weights

In [None]:
# Number of shuffles (trials) to run in the null-world.
n_trials = 10_000

# Threshold that each trial's statistic is compared against; presumably the
# sum of squared differences observed in the real data.
# NOTE(review): this value is hard-coded rather than computed from the
# ration arrays — confirm it matches the actual data.
observed_ssd = 5299

# An array to store the result of each trial.
results = np.zeros(n_trials)

# Run n_trials trials.
for i in range(n_trials):
    # Shuffle all the weight gains.
    shuffled = rnd.permuted(all_weights)
    # Split into 4 now random samples.
    # NOTE(review): the slices assume 12 values per ration; the last group
    # takes whatever remains after the first 36 — confirm against the data.
    fake_a = shuffled[:12]
    fake_b = shuffled[12:24]
    fake_c = shuffled[24:36]
    fake_d = shuffled[36:]
    # Sum the weight gains for the 4 resamples.
    sum_a = np.sum(fake_a)
    sum_b = np.sum(fake_b)
    sum_c = np.sum(fake_c)
    sum_d = np.sum(fake_d)
    # Find the differences between all the possible pairs of resamples.
    a_b = sum_a - sum_b
    a_c = sum_a - sum_c
    a_d = sum_a - sum_d
    b_c = sum_b - sum_c
    b_d = sum_b - sum_d
    c_d = sum_c - sum_d
    # Put the differences into an array.
    fake_diffs = np.array([a_b, a_c, a_d, b_c, b_d, c_d])
    # Square them to give six squared differences.
    sq_fake_diffs = fake_diffs ** 2
    # Sum the squares.
    sum_sq_fake_diffs = np.sum(sq_fake_diffs)
    # Keep track of the total for each trial.
    results[i] = sum_sq_fake_diffs
    # End one trial, go back and repeat until all trials are complete.

# Produce a histogram of the trial results.
plt.hist(results, bins=25)
plt.title('Null distribution of sum of squared differences')
plt.xlabel('Sum of squared differences in null world')

# Find out how many trials produced differences among groups as great as
# or greater than those observed.
k = np.sum(results >= observed_ssd)
# Convert the count to a proportion of all trials.
kk = k / n_trials
# Print the result. The value shown is the proportion (kk), not the raw
# count (k), so the label says "Proportion".
print(f'Proportion of sum of squared differences >= {observed_ssd}:', kk)