In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
# Use matplotlib's 'fivethirtyeight' style sheet for the figures in this notebook.
plots.style.use('fivethirtyeight')

## Part 1: Setting up the data ##

In [None]:
# Read the September 2024 weather file and keep only the 'Date',
# 'Maximum Temperature' and 'Minimum Temperature' columns.
weather_2024 = (
    Table.read_table('merced_weather_2024_september.csv')
    .select('Date', 'Maximum Temperature', 'Minimum Temperature')
)
weather_2024

# Optional quick look at the distribution of the maximum temperatures:
#weather_2024.hist('Maximum Temperature')

In [None]:
# Read the September 2004 weather file and keep only the 'Date',
# 'Maximum Temperature' and 'Minimum Temperature' columns.
weather_2004 = (
    Table.read_table('merced_weather_2004_september.csv')
    .select('Date', 'Maximum Temperature', 'Minimum Temperature')
)
weather_2004


In [None]:
# Combine the sets of weather data into a new table with labels.
# Every maximum temperature is tagged 'new' (2024 table) or 'old' (2004 table)
# so the two years can be compared as two groups of a single table.

weather_columns = np.append(
    weather_2024.column('Maximum Temperature'),
    weather_2004.column('Maximum Temperature'),
)

# Size each run of labels from its own table instead of hard-coding 30, so
# the labels stay aligned with the temperatures even when a file does not
# contain exactly 30 rows.
labels = np.append(
    weather_2024.num_rows * ['new'],
    weather_2004.num_rows * ['old'],
)

weather_data = Table().with_columns(
    'Temperature', weather_columns,
    'Label', labels
)

#weather_data.show()

# Overlaid histograms of temperature, one per label.
weather_data.hist('Temperature', group = 'Label')

In [None]:
# Compare the averages: one row per label, aggregated with np.average.
averages_by_label = weather_data.group('Label', np.average)
averages_by_label

In [None]:
# Display the 2004 table sorted on its 'Maximum Temperature' column.
weather_2004_by_high = weather_2004.sort('Maximum Temperature')
weather_2004_by_high.show()

## Part 2: The permutation test ##

In [None]:
def difference_of_means(data_table, column):
    """Return the difference between the averages of two groups.

    Parameters:
        data_table: a table object; its rows are grouped by `column` and
            the remaining values are aggregated with np.average.
        column: label of the column that defines the groups.

    Returns:
        The first group's average minus the second group's average, taken
        from column index 1 of the grouped table, in whatever order
        `group` lists the groups.
    """
    averages = data_table.group(column, np.average).column(1)
    first_mean = averages.item(0)
    second_mean = averages.item(1)
    return first_mean - second_mean

In [None]:
# Difference between the two label groups' averages in the original
# (unshuffled) data.
difference_of_means(data_table=weather_data, column='Label')

In [None]:
# Shuffle the labels by drawing rows without replacement from the table
# and reading off the 'Label' column of the result.

shuffled_rows = weather_data.sample(with_replacement=False)
shuffled_labels = shuffled_rows.column('Label')

# Attach the shuffled labels alongside the original ones for comparison.
original_and_shuffled = weather_data.with_column('Shuffled Label', shuffled_labels)

original_and_shuffled

In [None]:
# Keep only the temperatures and their shuffled labels, show the group
# averages for this one shuffle, and compute their difference.

shuffled_only = original_and_shuffled.drop('Label')
shuffled_averages = shuffled_only.group('Shuffled Label', np.average)
shuffled_averages.show()

difference_of_means(data_table=shuffled_only, column='Shuffled Label')

In [None]:
# Now we're ready to implement the permutation test!
# Each repetition shuffles the labels, recomputes the difference of group
# means on the shuffled data, and records it.
# NOTE(review): no random seed is set in the visible code, so `differences`
# (and anything derived from it) will vary from run to run.

repetitions = 20000

# Accumulate in a Python list and convert to an array once at the end:
# np.append returns a fresh copy of the whole array on every call, which
# makes growing an array inside the loop quadratic in `repetitions`.
simulated_differences = []

for trial in range(repetitions):
    shuffled_labels = weather_data.sample(with_replacement = False).column(1)
    shuffled_data = weather_data.with_column(
        'Shuffled Label', shuffled_labels).drop('Label')

    simulated_differences.append(
        difference_of_means(shuffled_data, 'Shuffled Label'))

differences = np.array(simulated_differences)

In [None]:
# Histogram of the simulated differences. This represents the distribution
# UNDER THE NULL HYPOTHESIS that the samples come from the same underlying
# distribution; the dot marks the difference observed in the original data.

observed_difference = difference_of_means(weather_data, 'Label')

simulated_table = Table().with_column('Difference between group averages', differences)
simulated_table.hist(bins=20)
plots.scatter(observed_difference, .025, c='r', s=100)

## Part 3: Computing the p-value and percentiles ##

In [None]:
# The p-value is the fraction of simulated differences that are at least as
# large as the observed one. A simulated value equal to the observed value
# counts as "at least as extreme", so the comparison is >= rather than >.
# NOTE(review): this is a one-sided (upper-tail) p-value; confirm that
# matches the intended alternative hypothesis.

# Divide by the number of simulated values actually present so the
# proportion stays correct even if `repetitions` was changed afterwards.
p_value = np.count_nonzero(differences >= observed_difference) / len(differences)
print('Reported p-value for the difference: ' + str(p_value))

In [None]:
# Print the 25th, 50th, 75th, 95th and 99th percentiles of the simulated
# distribution of differences, one per line.
for rank in (25, 50, 75, 95, 99):
    print(str(rank) + 'th percentile: ' + str(percentile(rank, differences)))