In [None]:
# import dependencies
# import sys
# !{sys.executable} -m pip install mpl-scatter-density
import matplotlib as mplt
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import glacierml as gl
import numpy as np
import warnings
from tensorflow.python.util import deprecation
import os
import logging
import seaborn as sns
from tqdm import tqdm
from IPython.display import display, HTML
import geopy
from sklearn.cluster import KMeans
from scipy.stats import shapiro
import chart_studio as cs
from matplotlib.ticker import PercentFormatter
from scipy.stats import gaussian_kde
import mpl_scatter_density
from scipy.stats import gaussian_kde
import matplotlib.colors as mcolors
# --- Reproducibility -------------------------------------------------------
# Fix TensorFlow's global random seed.
tf.random.set_seed(42)

# --- Console noise ---------------------------------------------------------
# display(HTML("<style>.container { width:85% !important; }</style>"))
# NOTE(review): TensorFlow is already imported at this point; the
# TF_CPP_MIN_LOG_LEVEL variable is usually read when the native library
# loads, so it may need to be set before `import tensorflow` to have any
# effect -- confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel(logging.ERROR)
deprecation._PRINT_DEPRECATION_WARNINGS = False
for _warning_cls in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_warning_cls)

# --- pandas display / behaviour -------------------------------------------
# Show every column of wide frames and disable the chained-assignment check.
pd.set_option('display.max_columns', None)
pd.set_option('mode.chained_assignment', None)

# Random states iterated over by the plotting cells (0 through 24).
RS = range(25)

print(f'currently running tensorflow version: {tf.__version__}')

# ML Analysis

In [None]:
# Identifier of the dataset parameterization to analyse. It is passed to
# gl.parameterize_data and is also used to build the result file paths.
parameterization = '4'

# Human-readable threshold label (used in figure titles) for every supported
# parameterization id.
THRESHOLD_LABELS = {
    '1': 'No Threshold',
    '2': '0.25',
    '3': '0.50',
    '4': '0.75',
}

# Fail loudly on an unknown id. A chain of independent `if` statements would
# leave `threshold` undefined -- or, in a live kernel, silently keep the value
# from a previous run and mislabel every figure.
if parameterization not in THRESHOLD_LABELS:
    raise ValueError(
        'Unknown parameterization ' + repr(parameterization) +
        '; expected one of ' + ', '.join(sorted(THRESHOLD_LABELS))
    )
threshold = THRESHOLD_LABELS[parameterization]

dataset = gl.parameterize_data(parameterization)
dataset

In [None]:
# Read the model statistics table written for the chosen parameterization
# and derive the two extra columns used for model selection.
rootdir = 'zults/'
predictions = pd.DataFrame()
file_reader = pd.read_csv(rootdir + 'model_statistics_' + parameterization + '.csv')

# Columns holding counts; they are cast to integers.
count_columns = ['total parameters', 'trained parameters', 'total inputs']

statistics = (
    file_reader
    # 'Unnamed: 0' is dropped unconditionally
    # (presumably the index column saved with the CSV -- TODO confirm).
    .drop(columns='Unnamed: 0')
    .astype({column: int for column in count_columns})
    .assign(**{
        # absolute gap between test and train mean absolute error
        'test - train': lambda df: (df['test mae avg'] - df['train mae avg']).abs(),
        # trained parameters per model input
        'paramater ratio': lambda df: df['trained parameters'] / df['total inputs'],
    })
)
statistics

In [None]:
# Scatter of bootstrap test MAE against the trained-parameter / input ratio,
# one point per row of the statistics table.
AXIS_FONTSIZE = 18

fig, ax = plt.subplots(figsize=(10, 10))
fig.subplots_adjust(hspace=0.5)
fig.patch.set_facecolor('w')
fig.suptitle(
    'MAE vs. Model Inputs\n' + 'Size and Distance Threshold = ' + threshold,
    fontsize=25,
    y=.97,
)

ax.scatter(statistics['paramater ratio'], statistics['test mae avg'])
# ax.set_xscale('log')
ax.set_ylabel('Bootstrap Test Mean Absolute Error (m)', fontsize=AXIS_FONTSIZE)
ax.set_xlabel('Trainable Parameters / Total Inputs', fontsize=AXIS_FONTSIZE)

# Tick label size is set through pyplot on the current (only) axes.
plt.yticks(fontsize=AXIS_FONTSIZE)
plt.xticks(fontsize=AXIS_FONTSIZE)
plt.show()
# plt.savefig('figs/elbow/75.png')

In [None]:
# Load the saved training history of every random state for one layer
# architecture and plot the loss curves on a 5 x 5 grid.

# Row of the statistics table to inspect (0 = first row).
# To choose interactively instead: selection = int(input())
selection = 0
arch = statistics['layer architecture'].loc[selection]
print(arch)

# Directory holding one history CSV per random state, named by the state.
rootdir_1 = 'saved_results/' + parameterization + '/' + arch + '/'
dnn_history = {}

# Start from an empty figure. Creating it with plt.subplots(1, 1) would add a
# full-size Axes underneath the grid; recent Matplotlib versions no longer
# remove overlapping Axes automatically, so its frame and ticks would show
# through behind the 25 panels.
fig = plt.figure(figsize=(15, 15))
fig.patch.set_facecolor('w')
fig.suptitle(
    'Model Loss\nSize and Distance Thresholds = ' + threshold +
    ', Layer Architecture ' + arch,
    fontsize=18,
    y=1,
)

for n, rs in enumerate(RS):
    # plt.subplot also makes the new panel the current pyplot axes.
    ax = plt.subplot(5, 5, n + 1)
    model_name = str(rs)

    dnn_history[model_name] = pd.read_csv(rootdir_1 + model_name)

    ax.set_title('Random State: ' + str(rs))
    # Optional filter (disabled): skip runs whose final loss and val_loss
    # differ by 3 or more.
    # if abs(dnn_history[model_name]['loss'].iloc[-1]
    #        - dnn_history[model_name]['val_loss'].iloc[-1]) >= 3:
    #     continue
    # presumably draws onto the current pyplot axes -- verify in glacierml
    gl.plot_loss(dnn_history[model_name])
plt.tight_layout()

In [None]:

# Reload the trained model of every random state, predict over the full
# dataset (train + test), and plot predictions against the true labels on a
# 5 x 5 grid. The per-model predictions are collected in `estimates`
# (one row per random state, one column per sample).
dnn_model = {}
rootdir_2 = 'saved_models/' + parameterization + '/' + arch + '/'

train_features, test_features, train_labels, test_labels = gl.split_data(dataset)
features = pd.concat([train_features, test_features], ignore_index = True)
labels = pd.concat([train_labels, test_labels], ignore_index = True)

fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(hspace=0.5)
fig.suptitle(
    'Volume Estimates\nSize and Distance Thresholds = ' + threshold +
    ', Layer Architecture ' + arch,
    fontsize=18,
    y=1,
)
fig.patch.set_facecolor('w')

# Rows are gathered in a list and turned into a DataFrame once after the
# loop: DataFrame.append no longer exists in pandas >= 2.0, and growing a
# frame row by row copies it on every iteration.
estimate_rows = []

# tqdm wraps RS (not the enumerate object) so the bar knows the total.
for n, rs in enumerate(tqdm(RS)):
    ax = plt.subplot(5, 5, n + 1)
    model_name = str(rs)

    dnn_model[model_name] = tf.keras.models.load_model(rootdir_2 + model_name)
    dnn_history[model_name] = pd.read_csv(rootdir_1 + model_name)

    # Optional filter (disabled): skip runs whose final loss and val_loss
    # differ by 3 or more.
    # if abs(dnn_history[model_name]['loss'].iloc[-1]
    #        - dnn_history[model_name]['val_loss'].iloc[-1]) >= 3:
    #     continue

    x = labels
    y = dnn_model[model_name].predict(features, verbose = 0).flatten()

    # Colour each point by its kernel density estimate in log10-log10 space.
    # NOTE(review): np.log10 yields NaN/-inf for non-positive values, which
    # gaussian_kde cannot handle -- assumes labels and predictions are > 0.
    xy = np.vstack([np.log10(x), np.log10(y)])
    z = gaussian_kde(xy)(xy)

    ax.scatter(x, y, marker = '.', c = z, s = 20, cmap = 'viridis')
    # 1:1 reference line from the origin to the largest true value
    ax.plot(
        (0, labels.max()),
        (0, labels.max()),
        '-',
        color = 'orange'
    )
    ax.set_xlabel('True Thickness (m)')
    ax.set_ylabel('Model Thickness (m)')
    ax.set_title('Random State ' + str(rs))
    # ax.set_xlim((0, 100))
    # ax.set_ylim((0, 100))

    estimate_rows.append(pd.Series(y, name = rs))

# One row per random state (row label = the state), one column per sample.
estimates = pd.DataFrame(estimate_rows)

plt.tight_layout()
plt.show()

In [None]:
# Residual of the ensemble estimate for every sample: the column-wise mean of
# `estimates` (i.e. the mean over its rows) minus the true label.
residuals = estimates.mean() - labels

plt.figure(figsize=(10, 10))
plt.hist(residuals, bins=20)
plt.title('Histogram of Model Residuals')
plt.xlabel('Thickness Difference')
plt.ylabel('Number of Elements')

# Summary statistics of the residual distribution
# (`mode` is a Series, since several values can tie).
mean = residuals.mean()
med = residuals.median()
mode = residuals.mode()

# Optional overlays (disabled):
# plt.title('Log Percent Difference of Edasi & Lipovsky and Farinotti Glacier Volume')
# plt.axvline(mean, color='k', linestyle='dashed', linewidth=1)
# # plt.text(mean, 'Mean: {:.2f}'.format(mean))
# plt.axvline(med, color='k', linestyle='dashed', linewidth=1)
# # plt.text(med, 'Median: {:.2f}'.format(med))
# plt.axvline(mode[0], color='k', linestyle='dashed', linewidth=1)
# # plt.text((mode[0]), 'Mode: ' + str(np.round(mode[0], 2)))