Imports:

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from torch import optim
from torch.utils.data import DataLoader

from benchmark.benchmark_results import cache_preds, generate_submission_file, cache_merged_submission_file, benchmark_results
from consts import CORE_SITES, JULY
from models.fit_to_data import ensemble_models, Ensemble_Type
from models.fitters import LSTMModel, lstm_fitter, StreamflowModel
from models.lstm_utils import HypParams, load_checkpoint, features2seqs, pad_collate_fn, train_lstm

In [2]:
def inv_scale_data(pred, mean, std):
    return pred * std + mean


Load model and data:

In [3]:
with open(os.path.join('..', 'assets', 'data', 'global_data_for_lstm_debugging_200sites_new_incl_test.pkl'), 'rb') as f:
    train_features, train_gt, val_features, val_gt, test_features = pickle.load(f)
n_feats = train_features.shape[1]

In [4]:
with open(os.path.join('..', 'assets', 'data', 'gt_mean_and_std.pkl'), 'rb') as f:
    gt_mean, gt_std = pickle.load(f)

In [6]:
hyperparams = HypParams(lr=1e-3, lr_step_size=5, lr_gamma=0.3, bs=32, n_epochs=50, n_hidden=2, hidden_size=512,
                        dropout_prob=0.3)
full_model = LSTMModel(input_size=80)
optimizer = optim.Adam(full_model.parameters(), lr=hyperparams.lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=hyperparams.lr_step_size, gamma=hyperparams.lr_gamma)

In [7]:
load_checkpoint(full_model, optimizer, scheduler,
                os.path.join('..', 'outputs', 'combined_set_lstm', 'epoch_30.pth.tar'))

30

In [8]:
hyperparams = HypParams(lr=1e-4, lr_step_size=50, lr_gamma=0.3, bs=2, n_epochs=5, n_hidden=2, hidden_size=512,
                        dropout_prob=0.3)

In [9]:
val_set = features2seqs(val_features, val_gt)
val_dloader = DataLoader(val_set, batch_size=hyperparams.bs, shuffle=True, collate_fn=pad_collate_fn)

In [10]:
full_model = train_lstm(val_dloader, None, full_model, hyperparams=hyperparams,
                        save_dir='combined_set_lstm_finetune')


Epoch [1/5], Training Loss: 0.1642
Epoch [2/5], Training Loss: 0.1456
Epoch [3/5], Training Loss: 0.1431
Epoch [4/5], Training Loss: 0.1426
Epoch [5/5], Training Loss: 0.1416
Saving final model...


Generate submission:

In [11]:
fitter = lstm_fitter
train_site_id_col = train_features.site_id.reset_index(drop=True)
val_site_id_col = val_features.site_id.reset_index(drop=True)
test_site_id_col = test_features.site_id
log_transform = False
gt_col = 'volume'

In [12]:
test_dfs = {}
val_dfs = {}
train_dfs = {}

In [13]:
def lstm_feat_adapter(X):
        dataset = features2seqs(X)
        dataloader = DataLoader(dataset, collate_fn=pad_collate_fn)

        return dataloader

model = StreamflowModel(full_model, adapter=lstm_feat_adapter)

In [14]:
for site_id in CORE_SITES:
    results_id = f'global_{fitter.__name__}_{site_id}'
    train_site = train_features[train_site_id_col == site_id]
    train_site_gt = train_gt[train_gt.site_id == site_id]
    val_site = val_features[val_site_id_col == site_id]
    val_site_gt = val_gt[val_gt.site_id == site_id]
    test_site = test_features[test_site_id_col == site_id]

    # Get relevant dates
    test_mask = test_site.date.dt.month <= JULY
    test_vals = test_site[test_mask]
    test_dates = test_vals.date.reset_index(drop=True).unique()
    val_mask = val_site.date.dt.month <= JULY
    val_vals = val_site[val_mask]
    val_dates = val_vals.date.reset_index(drop=True).unique()
    train_mask = train_site.date.dt.month <= JULY
    train_vals = train_site[train_mask]
    train_dates = train_vals.date.reset_index(drop=True).unique()

    # todo fix this for sitewise validation tests
    train_pred = pd.DataFrame()
    val_pred = pd.DataFrame()
    test_pred = pd.DataFrame()

    if not train_site.empty:
        train_pred = model(train_site)
    if not val_site.empty:
        val_pred = model(val_site)  # supposed to be train_only_model
    if not test_site.empty:
        test_pred = model(test_site)

    # rescaling data
    if log_transform:
        if not train_site.empty:
            train_pred = np.exp(inv_scale_data(train_pred, log_mean, log_std))
            train_site_gt[gt_col] = np.exp(inv_scale_data(train_site_gt[gt_col], log_mean, log_std))
        if not val_site.empty:
            val_pred = np.exp(inv_scale_data(val_pred, log_mean, log_std))
            val_site_gt[gt_col] = np.exp(inv_scale_data(val_site_gt[gt_col], log_mean, log_std))
        if not test_site.empty:
            test_pred = np.exp(inv_scale_data(test_pred, log_mean, log_std))
    else:
        if not train_site.empty:
            train_pred = inv_scale_data(train_pred, gt_mean, gt_std)
            train_site_gt[gt_col] = inv_scale_data(train_site_gt[gt_col], gt_mean, gt_std)
        if not val_site.empty:
            val_pred = inv_scale_data(val_pred, gt_mean, gt_std)
            val_site_gt[gt_col] = inv_scale_data(val_site_gt[gt_col], gt_mean, gt_std)
        if not test_site.empty:
            test_pred = inv_scale_data(test_pred, gt_mean, gt_std)

    train_site_gt = train_site_gt.reset_index(drop=True)
    val_site_gt = val_site_gt.reset_index(drop=True)

    # todo make sitewise validation work, how should we track the validation losses?
    benchmark_results(train_pred=train_pred, train_gt=train_site_gt[gt_col], val_pred=val_pred,
                      val_gt=val_site_gt[gt_col], benchmark_id=results_id)
    if not test_pred.empty:
        cache_preds(pred=test_pred, cache_id=results_id, site_id=site_id, pred_dates=test_dates, set_id='pred')
    if not val_pred.empty:
        cache_preds(pred=val_pred, cache_id=results_id, site_id=site_id, pred_dates=val_dates, set_id='val')
    if not train_pred.empty:
        cache_preds(pred=train_pred, cache_id=results_id, site_id=site_id, pred_dates=train_dates,
                    set_id='train')
print('Generating global model submission file...')
df_test = generate_submission_file(ordered_site_ids=CORE_SITES, model_id='global',
                                   fitter_id=fitter.__name__,
                                   set_id='pred')
train_ordered_site_ids = pd.Series(CORE_SITES)
train_ordered_site_ids = train_ordered_site_ids[train_ordered_site_ids.isin(train_features.site_id.unique())]
val_ordered_site_ids = pd.Series(CORE_SITES)
val_ordered_site_ids = val_ordered_site_ids[val_ordered_site_ids.isin(val_features.site_id.unique())]

df_val = generate_submission_file(ordered_site_ids=val_ordered_site_ids, model_id='global',
                                  fitter_id=fitter.__name__,
                                  set_id='val')
df_train = generate_submission_file(ordered_site_ids=train_ordered_site_ids, model_id='global',
                                    fitter_id=fitter.__name__,
                                    set_id='train')
test_dfs[f'global_{fitter.__name__}'] = df_test
val_dfs[f'global_{fitter.__name__}'] = df_val
train_dfs[f'global_{fitter.__name__}'] = df_train


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_site_gt[gt_col] = inv_scale_data(train_site_gt[gt_col], gt_mean, gt_std)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_site_gt[gt_col] = inv_scale_data(val_site_gt[gt_col], gt_mean, gt_std)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_site_gt[gt_col] = inv_scale_data(train_site_g

Generating global model submission file...


In [15]:
test_val_train_global_dfs = [test_dfs, val_dfs, train_dfs]

In [16]:
labels = ['pred', 'val', 'train']
for idx, df in enumerate(test_val_train_global_dfs):
    label = labels[idx]

    full_dfs = df  # | global/local

    print(f'Dataset: {label}')

    final_df_dict = ensemble_models(full_dfs, 'final', ensemble_type=Ensemble_Type.BEST_PREDICTION)
    final_df = final_df_dict['final']
    cache_merged_submission_file(final_df, label)

Dataset: pred
hungry_horse_reservoir_inflow: global_lstm_fitter
snake_r_nr_heise: global_lstm_fitter
pueblo_reservoir_inflow: global_lstm_fitter
sweetwater_r_nr_alcova: global_lstm_fitter
missouri_r_at_toston: global_lstm_fitter
animas_r_at_durango: global_lstm_fitter
yampa_r_nr_maybell: global_lstm_fitter
libby_reservoir_inflow: global_lstm_fitter
boise_r_nr_boise: global_lstm_fitter
green_r_bl_howard_a_hanson_dam: global_lstm_fitter
taylor_park_reservoir_inflow: global_lstm_fitter
dillon_reservoir_inflow: global_lstm_fitter
ruedi_reservoir_inflow: global_lstm_fitter
fontenelle_reservoir_inflow: global_lstm_fitter
weber_r_nr_oakley: global_lstm_fitter
san_joaquin_river_millerton_reservoir: global_lstm_fitter
merced_river_yosemite_at_pohono_bridge: global_lstm_fitter
american_river_folsom_lake: global_lstm_fitter
colville_r_at_kettle_falls: global_lstm_fitter
stehekin_r_at_stehekin: global_lstm_fitter
detroit_lake_inflow: global_lstm_fitter
virgin_r_at_virtin: global_lstm_fitter
skagit