In [None]:
import os
import sys
import warnings
# Seed Python's `random` and NumPy's global RNG with the shared SEED for reproducibility
import random as rn
import numpy as np
import matplotlib.pyplot as plt
SEED = 1234
np.random.seed(SEED)
rn.seed(SEED)
import datetime

# Suppress every warning for the rest of the session
warnings.filterwarnings("ignore")
import pandas as pd
import csv
import lightning.pytorch as pl
# Seed through Lightning with the same SEED (workers=True is passed through;
# NOTE(review): presumably this also seeds dataloader workers - confirm in the Lightning docs)
pl.seed_everything(SEED, workers=True)
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer, MultiNormalizer
from lightning.pytorch.loggers import CSVLogger
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
from pytorch_forecasting.metrics import MAE, SMAPE, MAPE, PoissonLoss, QuantileLoss
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from lightning.pytorch.strategies import FSDPStrategy
import torch
# Seed PyTorch's RNG explicitly with the same SEED
torch.manual_seed(SEED)
from pathlib import Path
import optuna
from optuna.study import MaxTrialsCallback
from optuna.trial import TrialState
import gc
# Step up to the parent directory unless the process already runs from the HPC project root
if os.getcwd() != '/home/USACE_Modeling':
    os.chdir(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

# Parent Directory: used as the root for every data/output path and added to the import path
par = Path(os.getcwd())
sys.path.append(str(par))

In [None]:
prediction_folder = 'Result_3'

# Every available data file, keyed by file number
data_model_dict = {
    1: 'UARK_WCS_AIS_Compiled_NewData_No_Aggregation.csv',
    2: 'UARK_WCS_AIS_Compiled_NewData_Mixed.csv',
    3: 'UARK_WCS_AIS_Compiled_NewData_Self-Propelled, Dry Cargo.csv',
    4: 'UARK_WCS_AIS_Compiled_NewData_Self-Propelled, Tanker.csv',
    5: 'UARK_WCS_AIS_Compiled_NewData_Tug_Tow.csv',
}

# Restrict this run to file 1 only; this replaces the full mapping above
data_model_dict = {1: data_model_dict[1]}

# Per-file prediction frames are collected in the list; the frame starts empty
prediction_combined_list = list()
prediction_combined_df = pd.DataFrame()


# Decide once, before looping, which file numbers to process.
# On the HPC each job handles exactly one file, selected by the first command-line
# argument. Resolving it here (instead of inside the loop) keeps a multi-file
# data_model_dict from re-processing that same file once per dictionary key.
if os.getcwd() == '/home/USACE_Modeling':
    file_numbers = [int(sys.argv[1])] #For HPC
else:
    file_numbers = list(data_model_dict.keys())

for i_filenumber in file_numbers:
    data_iden = i_filenumber

    # The study name is the data file name without its .csv extension
    study_name = data_model_dict[data_iden].replace('.csv','')
    def dataset(data_model_dir=data_model_dict, data_num=data_iden, par_dir=par):
        """Build the TFT datasets and dataloaders for one data file.

        Parameters
        ----------
        data_model_dir : dict
            Maps a file number to a CSV file name located in ``par_dir / 'Data'``.
        data_num : int
            Key into ``data_model_dir`` selecting the file to load.
        par_dir : pathlib.Path
            Project root; all input CSVs are read from its ``Data`` sub-directory.

        Returns
        -------
        dict
            Keys ``'train_dataloader'``, ``'val_dataloader'``, ``'training'``
            (the training ``TimeSeriesDataSet``), ``'test'`` and ``'train'``
            (long-format DataFrames split at 2020Q1).
        """
        # Show all columns in dataframe (this changes a global pandas display option)
        pd.set_option('display.max_columns', None)

        first_test_quarter = '2020Q1'
        last_test_quarter = '2020Q4'
        loader_batch_size = 32  # set this between 32 to 128
        horizon = 4   # prediction length, in time_idx steps
        lookback = 4  # encoder length, in time_idx steps
        data_folder = par_dir / 'Data'

        # Main data: only used to build the terminal (sub_folder) -> port (folder) lookup
        compiled = pd.read_csv(data_folder / 'UARK_WCS_AIS_Compiled_NewData.csv')
        consistent_locations = pd.read_csv(data_folder / 'location_with_consistent_data_newdata.csv')
        compiled = compiled.merge(consistent_locations, how='left', on='sub_folder')
        port_lookup = compiled[['sub_folder', 'folder']].drop_duplicates()

        # Selected data file, with every column whose name starts with 'dwell_' removed
        frame = pd.read_csv(data_folder / data_model_dir[data_num])
        dwell_columns = [name for name in frame.columns if name.lower().startswith('dwell_')]
        frame = frame.drop(columns=dwell_columns)

        # Keep quarters up to the end of testing and attach the port of each terminal
        frame = frame[frame["quarter"] <= last_test_quarter]
        frame = frame.merge(port_lookup, on="sub_folder", how="left")

        # Split 'YYYYQn' into integer year / quarter parts; time_idx counts quarters
        # relative to the year found in the first row of the frame
        year_quarter = frame["quarter"].str.split("Q", expand=True).astype(int)
        years = year_quarter[0]
        quarter_numbers = year_quarter[1]
        frame["year"] = years
        frame["time_idx"] = (years - years.iloc[0]) * 4 + quarter_numbers - 1
        frame = frame.rename(
            columns={"sub_folder": "terminal", "QuarterOfTheYear": "quarter_of_year", "folder": "port"}
        )

        target_columns = [name for name in frame.columns if name.startswith('C_')]
        ais_features = [
            name for name in frame.columns
            if name.startswith(("stop_count", "dwell_per_stop"))
        ]

        # Long format: one row per (terminal, quarter, commodity) with its volume
        identifier_columns = ["terminal", "port", "quarter_of_year", "quarter", "year", "time_idx"]
        long_frame = frame.melt(
            id_vars=identifier_columns + ais_features,
            value_vars=target_columns,
            var_name="commodity",
            value_name="volume",
        )

        # Series identifier: port|terminal|commodity
        port_text, terminal_text, commodity_text = (
            long_frame[part].astype(str) for part in ("port", "terminal", "commodity")
        )
        long_frame["key"] = port_text + "|" + terminal_text + "|" + commodity_text

        outlier_terminals = pd.read_csv(data_folder / 'outlier_terminals.csv')
        outlier_terminals_commodity = pd.read_csv(data_folder / 'outlier_terminals_commodity.csv')

        # Drop whole outlier terminals first, then individual outlier series
        terminal_is_outlier = long_frame['terminal'].isin(outlier_terminals['terminal'])
        long_frame = long_frame[~terminal_is_outlier]
        key_is_outlier = long_frame['key'].isin(outlier_terminals_commodity['key'])
        long_frame = long_frame[~key_is_outlier]

        # The key now carries port/terminal/commodity; quarter_of_year is fed in as a string category
        long_frame = long_frame.drop(columns=["port", "terminal", "commodity"])
        long_frame = long_frame.assign(quarter_of_year=long_frame['quarter_of_year'].astype(str))

        # Split into train and test on the quarter label, then drop the label
        train_df = long_frame[long_frame['quarter'] < first_test_quarter].drop(columns=['quarter'])
        test_df = long_frame[long_frame['quarter'] >= first_test_quarter].drop(columns=['quarter'])

        # Hold back the last `horizon` training steps for validation
        training_cutoff = train_df["time_idx"].max() - horizon
        fit_rows = train_df[train_df["time_idx"] <= training_cutoff]

        # Create training dataset
        training_ret = TimeSeriesDataSet(
            fit_rows,
            time_idx="time_idx",
            target="volume",
            group_ids=["key"],
            min_encoder_length=lookback,
            max_encoder_length=lookback,
            min_prediction_length=horizon,
            max_prediction_length=horizon,
            static_categoricals=["key"],
            time_varying_known_categoricals=["quarter_of_year"],
            time_varying_known_reals=ais_features + ["time_idx"],
            time_varying_unknown_reals=["volume"],
            target_normalizer=GroupNormalizer(groups=["key"], transformation="relu"),
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
        )

        # Validation set (predict=True): predict the last `horizon` points in time for each series
        validation_ret = TimeSeriesDataSet.from_dataset(
            training_ret,
            train_df,
            predict=True,
            stop_randomization=True,
        )

        # Dataloaders for the model
        train_dataloader_ret = training_ret.to_dataloader(
            train=True, batch_size=loader_batch_size, num_workers=0
        )
        val_dataloader_ret = validation_ret.to_dataloader(
            train=False, batch_size=loader_batch_size, num_workers=0
        )

        return {
            'train_dataloader': train_dataloader_ret,
            'val_dataloader': val_dataloader_ret,
            'training': training_ret,
            'test': test_df,
            'train': train_df,
        }

    
    #Get values from dataset
    data_dict = dataset()
    train_dataloader = data_dict['train_dataloader']
    val_dataloader = data_dict['val_dataloader']
    training = data_dict['training']
    train_set = data_dict['train']
    test_set = data_dict['test']


    #Clear memory. This stays best effort, but the failure is now reported instead of
    #being silently discarded, and KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        print("Clearing data_dict", flush=True)
        del data_dict
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as cleanup_error:
        print(f"Memory cleanup skipped: {cleanup_error!r}", flush=True)
    # ### Load Best Model
    #Find file name that starts with study_name and ends with .ckpt.
    #The match list is rebuilt on every iteration, so a missing checkpoint can no longer
    #fall back to the file found for a previous study (or fail with a NameError).
    checkpoint_dir = par / 'Outputs' / 'TFT_Outputs' / prediction_folder
    checkpoint_names = sorted(
        file for file in os.listdir(checkpoint_dir)
        if file.startswith(study_name) and file.endswith('.ckpt')
    )
    if not checkpoint_names:
        raise FileNotFoundError(
            "No checkpoint starting with '{}' and ending with '.ckpt' found in {}".format(study_name, checkpoint_dir)
        )
    if len(checkpoint_names) > 1:
        #os.listdir order is arbitrary; sorting makes the choice reproducible
        print("Multiple checkpoints match {}; using the last one in sorted order".format(study_name), flush=True)
    file_name = checkpoint_names[-1]

    print(file_name)

    tft = TemporalFusionTransformer.load_from_checkpoint(checkpoint_dir / file_name)
    
    #Append the test rows after the training rows; any missing value becomes 0
    test = pd.concat([train_set, test_set], ignore_index=True).fillna(0)

    #Prediction dataset and dataloader built from the training dataset's configuration
    test_dataset = TimeSeriesDataSet.from_dataset(
        training,
        test,
        predict=True,
        stop_randomization=True,
    )
    test_dataloader = test_dataset.to_dataloader(train=False, batch_size=32, num_workers=0)

    #Actual targets, concatenated in the order the dataloader yields its batches
    actuals = torch.cat([targets[0] for _, targets in test_dataloader])
    actual_values = actuals.tolist()

    #Model predictions plus the index frame that identifies each series
    prediction = tft.predict(test_dataloader, return_index=True)
    prediction_values = prediction.output.tolist()
    index_keys = prediction.index.key.tolist()

    #One row per series; Actuals, Predictions and quarter each hold one entry per forecast quarter
    forecast_quarters = ["2020Q1", "2020Q2", "2020Q3", "2020Q4"]
    prediction_df = pd.DataFrame(
        {"key": index_keys, "Actuals": actual_values, "Predictions": prediction_values}
    )
    prediction_df["quarter"] = [forecast_quarters for _ in range(len(prediction_df))]

    #Recover port / terminal / commodity from the series key, then discard the key
    key_parts = prediction_df["key"].str.split("|", expand=True)
    prediction_df[["folder", "sub_folder", "Commodity"]] = key_parts
    prediction_df = prediction_df.drop(columns=["key"])

    #Unnest to one row per series and quarter
    prediction_df = prediction_df.explode(["Actuals", "Predictions", "quarter"]).reset_index(drop=True)
    prediction_df['Model'] = 'TFT'
    prediction_df['Aggregation'] = study_name.split('NewData_')[1]

    #Convert Actuals and Predictions to int, then drop folder
    prediction_df = prediction_df.astype({'Actuals': int, 'Predictions': int})
    prediction_df = prediction_df.drop(columns=['folder'])

    #Append to list
    prediction_combined_list.append(prediction_df)

# Combine the per-file prediction frames. Each frame carries its own 0..n-1 index, so
# ignore_index=True gives the combined frame unique row labels when several files are processed.
prediction_combined_df = pd.concat(prediction_combined_list, ignore_index=True)
prediction_combined_df.head()

In [None]:
#Export to csv in Outputs folder. Change mode to 'w' if you want to overwrite
###prediction_combined_df.to_csv(par / 'Outputs' / 'TFT_Outputs' / prediction_folder / 'TFT_Predictions.csv', index=False, mode='x')

In [None]:
#Print model hyperparameters
#Open the Optuna journal file for this prediction folder and load the study by name
storage_path = "Outputs/TFT_Outputs/" + prediction_folder + "/TFT_study.log"
journal_file = optuna.storages.JournalFileStorage(storage_path)
storage_url = optuna.storages.JournalStorage(journal_file)
study = optuna.study.load_study(study_name=study_name, storage=storage_url)
study_temp_df = study.trials_dataframe()

#Get best trial(s): every row whose objective value equals the minimum
lowest_value = study_temp_df['value'].min()
best_trial = study_temp_df[study_temp_df['value'] == lowest_value]

#Keep the first of those rows
best_hp = best_trial.iloc[0]

#Print best hyperparameters
print('Best hyperparameters:')
for field, setting in best_hp.to_dict().items():
    print('{}: {}'.format(field, setting))

In [None]:
#Feature importance
#Raw network output on the test dataloader, interpreted with reduction="sum" and plotted
raw_predictions = tft.predict(
    test_dataloader,
    mode="raw",
    return_index=True,
)
raw_output = raw_predictions.output
interpretation = tft.interpret_output(raw_output, reduction="sum")
tft.plot_interpretation(interpretation)