In [2]:
# Import dependencies.
# To install the pinned TensorFlow build into this kernel's environment, uncomment:
# import sys
# !{sys.executable} -m pip install --user tensorflow==2.8.0
import numpy as np
import pandas as pd
import glacierml as gl
import matplotlib.pyplot as plt
# Load the data set through the project helper and unpack its four return values.
# NOTE(review): the recorded output of this cell is
#   AttributeError: module 'glacierml' has no attribute 'load_LOO_data'
# so with that glacierml version none of these names get bound — confirm the helper's
# current name before relying on Restart & Run All.
# NOTE(review): later cells read a DataFrame called `data`, which this cell does not
# define — TODO confirm where it is supposed to come from.
E_delta_a, areas, cols, train = gl.load_LOO_data(include_train = True)
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


AttributeError: module 'glacierml' has no attribute 'load_LOO_data'

In [None]:
# Preview only the first rows instead of rendering the whole table.
# NOTE(review): `data` is not assigned in any visible cell — confirm its source.
data.head()

# Plot the glacier volume cumulative distribution function (CDF)

In [None]:
# Per-glacier ice volume = thickness estimate * area, for the two thickness estimates.
# NOTE(review): `FMT` looks like it could abbreviate 'Farinotti Mean Thickness' (a column
# of that name is read in a later cell), yet the mean over `cols` is the series labelled
# 'Farinotti Volume' below — confirm the two legend labels are not swapped.
data_1 = np.mean(data[cols], axis=1) * data['Area']
data_2 = data['FMT'] * data['Area']

# Empirical CDF: sorted volumes on x, rank / (n - 1) on y so that y runs from 0 to 1.
x_1 = np.sort(data_1)
y_1 = np.arange(len(x_1)) / (len(x_1) - 1)

x_2 = np.sort(data_2)
y_2 = np.arange(len(x_2)) / (len(x_2) - 1)

# Single-axes figure; everything is drawn through `ax` so the figure state is explicit.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
fig.patch.set_facecolor('w')
fig.suptitle('RGI Volume Cumulative Distribution Function', fontsize=18, y=0.93)

ax.plot(x_1, y_1, c='blue', label='Farinotti Volume')
ax.plot(x_2, y_2, c='orange', label='Edasi Volume')

# Reference curves of the form 1 - exp(-(x / lambda)**k) for visual comparison.
# They are labelled and drawn before the legend is built so they show up in it.
k = 1
for lambd in (1, 10, 100):
    ax.plot(
        x_1,
        1 - np.exp(-(x_1 / lambd) ** k),
        '--',
        label=f'1 - exp(-(x/{lambd})**{k})',
    )

ax.set_xscale('log')
ax.set_xlabel('Ice Volume')
# y holds fractions in [0, 1] (not percentages): the share of glaciers with volume <= x.
ax.set_ylabel('Fraction of Glaciers with Volume <= x')
ax.legend()
# plt.savefig('figs/cdf/cdf.svg')
plt.show()

# Plot probability density function (PDF)

In [None]:
# Histogram log10(volume) of both estimates on one set of bin edges so the bars are
# directly comparable; semi-transparent bars keep the first series visible under the second.
log_v1 = np.log10(data_1)
log_v2 = np.log10(data_2)
bin_lo = min(log_v1.min(), log_v2.min())
bin_hi = max(log_v1.max(), log_v2.max())
bins = np.linspace(bin_lo, bin_hi, 251)  # 251 edges -> 250 bins, as before

fig, ax = plt.subplots()
ax.hist(log_v1, bins, alpha=0.5, label='Farinotti Volume')
ax.hist(log_v2, bins, alpha=0.5, label='Edasi Volume')
ax.set_yscale('log')
ax.set_xlabel('log10(Ice Volume)')
ax.set_ylabel('Number of Glaciers')
ax.set_title('Distribution of per-glacier ice volume')
ax.legend()
plt.show()

# Use the KS test to show that the two distributions are significantly different

In [None]:
from scipy.interpolate import interp1d

In [None]:
# Wrap each empirical CDF in an interpolating function so that both can be
# evaluated at the same x positions.
f1, f2 = (interp1d(xs, ys) for xs, ys in ((x_1, y_1), (x_2, y_2)))

In [None]:
# log10 of the smallest value in x_2 (displayed as the cell output).
np.log10(np.min(x_2))

In [None]:
# Two-sample Kolmogorov-Smirnov test on the per-glacier volumes.
# ks_2samp works on the raw samples, so it needs neither the interpolated CDFs nor a
# hand-picked evaluation grid: the earlier fixed np.logspace(-0.96, 6, 100) grid only
# approximated the maximum CDF distance, raised inside interp1d whenever a grid point
# fell outside either sample's range, and produced no p-value.
from scipy.stats import ks_2samp

ks_result = ks_2samp(data_1, data_2)
D = ks_result.statistic  # maximum distance between the two empirical CDFs
print(f'KS statistic D = {D:.4f}')
print(f'KS p-value = {ks_result.pvalue:.3g}')

# Resample to test difference of sums

We want to address the question, "Is our total ice volume significantly different from published estimates?"  The following shows that the answer is yes.

Another way to ask the question is, "What is the probability that the published estimate is actually less than our value?"

Let's randomly re-sample 90% of each distribution 1000 times and, each time, calculate the sum of the sample.

In [None]:
from tqdm import tqdm

# Both volume vectors must have the same length; check before anything depends on it.
assert len(data_1) == len(data_2)
N1 = len(data_1)
resample_rate = 0.9
Nsample = int(N1 * resample_rate)

# Convert to NumPy once, outside the loop, instead of on every iteration.
volumes_1 = data_1.to_numpy()
volumes_2 = data_2.to_numpy()

# Seeded generator so that Restart & Run All reproduces the same resamples.
rng = np.random.default_rng(0)

# X collects, for every iteration, (sum of a random subset of data_1) minus
# (sum of an independently drawn random subset of data_2); subsets are drawn
# without replacement and contain Nsample elements each.
X = []
for i in tqdm(range(1000)):
    sum1 = np.sum(volumes_1[rng.choice(N1, size=Nsample, replace=False)])
    sum2 = np.sum(volumes_2[rng.choice(N1, size=Nsample, replace=False)])

    X.append(sum1 - sum2)

In [None]:
# Divide the resampled differences by the sampling fraction and by 1e6
# (the x-axis label reports the result as x1000 cu km), then draw a
# 20-bin density-normalised histogram.
scaled_differences = np.array(X) / resample_rate / 1e6

fig, ax = plt.subplots()
ax.hist(scaled_differences, 20, density=True)
ax.set_title(r'Resampled difference between the $V_F$ and $V_E$')
ax.set_ylabel('Frequentist Probability')
ax.set_xlabel('Ice Volume (x1000 cu km)')
plt.show()

In [None]:
# Running mean of the resampled differences after 1, 2, ..., len(X) iterations.
# np.cumsum yields every prefix sum in a single pass. The earlier per-point loop
# averaged X[0:i], which is empty at i == 0 (NaN plus a RuntimeWarning), always
# lagged one sample behind, and issued one plot call per point.
iteration = np.arange(1, len(X) + 1)
running_mean = np.cumsum(X) / iteration / 1e6 / resample_rate

fig, ax = plt.subplots()
ax.plot(iteration, running_mean, 'ok')
ax.set_title('Convergence of the mean')
ax.set_xlabel('Iteration')
ax.set_ylabel('Mean (x1000 cu km)')
plt.show()

### The distribution looks approximately normal (ignore the skew for now). So what is the probability that $V_F == V_E$ given the resampled difference mean and standard deviation?

In [None]:
from scipy.stats import norm

# Mean and standard deviation of the resampled differences, divided by the
# sampling fraction and by 1e6 (same scaling as the histogram of X).
mu = np.mean(X) / resample_rate / 1e6
sig = np.std(X) / resample_rate / 1e6
print(f'Mean of Differences = {mu:.2f}')
print(f'Std of Differences = {sig:.2f}')

value = 0  # this corresponds to zero difference

# norm.pdf is a probability *density*, not a probability, so it is reported as such.
# logpdf stays finite where log10(pdf) would underflow to log10(0) = -inf.
log10_density = norm.logpdf(value, loc=mu, scale=sig) / np.log(10)
print(f'Probability density at zero difference = 10**{log10_density:.2f}')

# An actual probability: the two-sided tail mass of N(mu, sig) lying at least as far
# from the mean as `value` does. Computed in log space so tiny values remain printable.
z_score = abs(value - mu) / sig
log10_probability = (np.log(2) + norm.logsf(z_score)) / np.log(10)
probability = 2 * norm.sf(z_score)  # may underflow to 0.0; use log10_probability for display
print(f'Two-sided tail probability of zero difference = 10**{log10_probability:.2f}')

### So the probability is extremely low that the two estimates are the same!
This is similar to the Z-test with large N.  Let's calculate what the Z-test gives us to see whether the MC resampling added any value.

In [None]:
# Z statistic comparing the two volume totals.
# NOTE(review): mu1 / mu2 are *sums* over all glaciers, while the denominator is the
# standard error of a difference of *means* (std**2 / N). The two do not obviously
# belong together — confirm the intended statistic.
# NOTE(review): std1 / std2 are hard-coded and their source and units are not shown in
# this notebook. 41 matches the uncertainty commonly quoted for a published global
# volume estimate in 10^3 km^3; if these are uncertainties of the *totals*, dividing
# them by N is not appropriate and mu1 / mu2 would need the same units — TODO confirm.
mu1 = np.sum(data_1.to_numpy())
mu2 = np.sum(data_2.to_numpy())
std1 = 41
std2 = 42.19
N = len(data_2.to_numpy())
Z = np.abs(mu1 - mu2) / np.sqrt(std1**2 / N + std2**2 / N)
print(f'log10(Z) = {np.log10(Z)}')

# One-sided tail probability of the standard normal beyond |Z|.
print(f'p  = {norm.sf(abs(Z))}')
# norm.sf underflows to exactly 0.0 for large |Z|; logsf keeps the magnitude.
print(f'log10(p) = {norm.logsf(abs(Z)) / np.log(10)}')

For such a small p-value (i.e. such a large Z), scipy just rounds the probability to zero. I think that settles it.  The differences are significant.

# Next level up:  Account for the uncertainty as quantified in the bootstraps

In [None]:
# Per-glacier volume estimates (thickness column * area) and the Edasi volume spread.
m_F = (data['Farinotti Mean Thickness'] * data['Area']).to_numpy()
m_E = (data['Edasi Mean Thickness'] * data['Area']).to_numpy()
s_E = (data['Edasi Thickness Std Dev'] * data['Area']).to_numpy()

# Size everything from the arrays used in this cell rather than from variables
# left over from earlier cells.
N = len(m_F)
resample_rate = 0.9
Nsample = int(N * resample_rate)

# Seeded generator so that Restart & Run All reproduces the same draws.
rng = np.random.default_rng(0)

X = []
for i in tqdm(range(1000)):

    # First, draw from the Bootstrap EDF: one normal draw per glacier with mean m_E and
    # standard deviation s_E (vectorised; no per-element Python call needed).
    edasi_sample = rng.normal(m_E, s_E)

    # Then take random subsets without replacement. The Edasi subset is taken from the
    # perturbed sample drawn above, so the bootstrap uncertainty actually enters the sum
    # (the draw used to be discarded and m_E resampled instead).
    mc_resample_F = m_F[rng.choice(N, size=Nsample, replace=False)]
    mc_resample_E = edasi_sample[rng.choice(N, size=Nsample, replace=False)]

    sum_F = np.sum(mc_resample_F)
    sum_E = np.sum(mc_resample_E)

    X.append(sum_F - sum_E)

In [None]:
# Scatter the raw resampled differences against their iteration index (black circles).
fig, ax = plt.subplots()
ax.plot(X, 'ok')
plt.show()

In [None]:
# Histogram of the resampled differences after dividing by 1e6 and by the sampling fraction.
scaled_differences = np.array(X) / 1e6 / resample_rate
fig, ax = plt.subplots()
ax.hist(scaled_differences)
plt.show()

In [None]:
# Mean and standard deviation of the resampled differences, divided by the
# sampling fraction and by 1e6.
differences = np.asarray(X)
mu = differences.mean() / resample_rate / 1e6
sig = differences.std() / resample_rate / 1e6
for line in (f'Mean of Differences = {mu:.2f}', f'Std of Differences = {sig:.2f}'):
    print(line)