In [None]:
#Run this on HPC

import os
import sys
import pandas as pd
from pathlib import Path
import neuralforecast
import optuna
import torch
import torch.nn
from optuna.trial import TrialState
from optuna.samplers import TPESampler
from neuralforecast.auto import AutoTCN
from neuralforecast.models import TCN
from neuralforecast import NeuralForecast
from neuralforecast.losses.pytorch import HuberLoss
import lightning.pytorch as pl
import numpy as np
import csv
import pytorch_lightning as pl
import pickle

#Lift pandas' display truncation so every row and every column is shown
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [None]:
#Step up to the parent directory unless we are already in the project root
if os.getcwd() != '/home/USACE_Modeling':
    os.chdir(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

#Parent Directory: the working directory after the optional move above
par = Path(os.getcwd())
#Make modules that live in the parent directory importable
sys.path.append(str(par))

In [None]:
#List of data files, keyed by file number
data_model_dict = {
    1: 'UARK_WCS_AIS_Compiled_NewData_No_Aggregation.csv',
    2: 'UARK_WCS_AIS_Compiled_NewData_Mixed.csv',
    3: 'UARK_WCS_AIS_Compiled_NewData_Self-Propelled, Dry Cargo.csv',
    4: 'UARK_WCS_AIS_Compiled_NewData_Self-Propelled, Tanker.csv',
    5: 'UARK_WCS_AIS_Compiled_NewData_Tug_Tow.csv',
}

#Restrict the run to file 1 only; this reassignment replaces the full mapping above
data_model_dict = {1: data_model_dict[1]}

In [None]:
class MetricsCallback(pl.Callback):
    """Callback that records train/validation loss at each validation run and saves them to CSV.

    After every validation run the values stored under ``"train_loss"`` and
    ``"valid_loss"`` in ``trainer.callback_metrics`` are appended to
    ``a_trn_loss`` / ``a_val_loss``. When training ends, the collected values
    are written to ``<save_path>/metrics.csv``.

    The class subclasses ``pl.Callback`` only. It must not also inherit from
    ``pl.Trainer``: that would make ``super().__init__()`` construct a full
    Trainer for every callback instance.

    Parameters
    ----------
    save_path : str or os.PathLike
        Directory that will receive ``metrics.csv``.
    """

    def __init__(self, save_path):
        super().__init__()
        self.metrics = []
        self.save_path = save_path  # Directory in which metrics.csv is written
        self.epoch = []        # trainer.current_epoch at each recorded validation run
        self.a_trn_loss = []   # "train_loss" value at each recorded validation run
        self.a_val_loss = []   # "valid_loss" value at each recorded validation run
        self.a_val_MAE = {}
        self.a_val_RMSE = {}

    def on_init_end(self, trainer):
        """Start from empty loss histories.

        The histories must stay plain lists because ``on_validation_end``
        appends to them; pre-allocating numpy arrays here breaks ``.append``
        and fails outright when ``trainer.max_epochs`` is ``None``.
        """
        # NOTE(review): recent Lightning releases appear to no longer call this hook - confirm.
        self.a_trn_loss = []
        self.a_val_loss = []

    @staticmethod
    def _metric_value(trainer, name):
        """Return the logged metric ``name`` as a Python number, or NaN when it is not logged."""
        value = trainer.callback_metrics.get(name)
        if value is None:
            return float("nan")
        return value.item() if hasattr(value, "item") else float(value)

    def on_validation_end(self, trainer, pl_module):
        """Append the current epoch and the latest train/validation loss."""
        # Skip the pre-training sanity check: it is not part of the training history.
        if getattr(trainer, "sanity_checking", False):
            return
        self.epoch.append(trainer.current_epoch)
        # A missing metric is stored as NaN so that the three histories stay aligned.
        self.a_trn_loss.append(self._metric_value(trainer, "train_loss"))
        self.a_val_loss.append(self._metric_value(trainer, "valid_loss"))

    def on_train_end(self, trainer, pl_module):
        """Persist the collected metrics once training has finished."""
        self.save_metrics_to_csv()

    def save_metrics_to_csv(self):
        """Write one CSV row per recorded validation run to ``<save_path>/metrics.csv``.

        The ``epochs`` column holds the 0-based position of the validation run
        in the history (not ``trainer.current_epoch``).
        """
        save_dir = os.fspath(self.save_path)  # accepts both str and Path
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        csv_save_path = os.path.join(save_dir, 'metrics.csv')
        with open(csv_save_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["epochs", "train_loss", "valid_loss"])
            for row_idx, (trn_loss, val_loss) in enumerate(zip(self.a_trn_loss, self.a_val_loss)):
                writer.writerow([row_idx, trn_loss, val_loss])

In [None]:
#Collect one tidy results frame per data file, then stack them at the end
prediction_combined_list = []
for i_filenumber in data_model_dict.keys():
    data_iden = i_filenumber

    study_name = data_model_dict[data_iden].replace('.csv','')

    #Defined inside the loop so that the default arguments capture the current file number
    def dataset(data_model_dir = data_model_dict,  data_num=data_iden, par_dir=par):
        """Build the long-format train/test frames for one data file.

        Reads the CSVs under ``par_dir / 'Data'``, keeps quarters up to
        2020Q4, melts the target columns ``C_1`` ... ``C_9`` into a single
        ``y`` column, removes the listed outlier terminals / series, adds
        quarter dummies, and splits the rows at 2020Q1.

        Parameters
        ----------
        data_model_dir : dict
            Maps file number to CSV file name.
        data_num : int
            Key of the file to load from ``data_model_dir``.
        par_dir : pathlib.Path
            Directory that contains the ``Data`` folder.

        Returns
        -------
        dict
            ``train_df`` (quarters before 2020Q1), ``test_df`` (2020Q1 and
            later), ``fut_exog_features`` and ``past_exog_features``.
        """
        begin_testing = '2020Q1'
        end_testing = '2020Q4'
        data_dir = par_dir / 'Data'

        #Read Main Data; it is only used to map each terminal (sub_folder) to its port (folder)
        wcs_df = pd.read_csv(data_dir / 'UARK_WCS_AIS_Compiled_NewData.csv')
        cons_loc = pd.read_csv(data_dir / 'location_with_consistent_data_newdata.csv')
        wcs_df = pd.merge(wcs_df, cons_loc, how='left', on='sub_folder')
        port_terminal = wcs_df[['sub_folder', 'folder']].drop_duplicates()

        #Read Data
        df = pd.read_csv(data_dir / data_model_dir[data_num])
        #Drop columns that start with 'dwell_' (case-insensitive)
        df = df.drop(columns=[col for col in df.columns if col.lower().startswith('dwell_')])

        #Quarters are 'YYYYQn' strings, so string comparison follows chronological order
        df = df[df["quarter"] <= end_testing]
        df = pd.merge(df, port_terminal, on="sub_folder", how="left")
        df["ds"] = pd.PeriodIndex(df["quarter"], freq="Q").to_timestamp()
        # Rename some columns
        df = df.rename(columns={"sub_folder": "terminal", "QuarterOfTheYear": "quarter_of_year", "folder": "port"})

        targets = [f"C_{i}" for i in range(1, 10)]
        #The dwell_* columns were dropped above, so in practice only stop_count* columns match
        ais_features = [col for col in df.columns if col.startswith(("stop_count", "dwell_per_stop"))]

        # Melt the DataFrame 'df' to a long format: one row per terminal, quarter and commodity
        data = pd.melt(
            df,
            id_vars=["terminal", "port", "ds", "quarter"] + ais_features,
            value_vars=targets,
            var_name="commodity",
            value_name="y",
        )

        # Series key that combines the 'port', 'terminal', and 'commodity' columns
        data["unique_id"] = data["port"].astype(str) + "|" + data["terminal"].astype(str) + "|" + data["commodity"].astype(str)

        #Removing outliers
        outlier_terminals = pd.read_csv(data_dir / 'outlier_terminals.csv')
        outlier_terminals_commodity = pd.read_csv(data_dir / 'outlier_terminals_commodity.csv')
        keep_rows = (
            ~data['terminal'].isin(outlier_terminals['terminal'])
            & ~data['unique_id'].isin(outlier_terminals_commodity['unique_id'])
        )
        #Copy so that the column assignments below act on an independent frame, not a slice
        data = data[keep_rows].copy()

        # Create four binary columns (Q1, Q2, Q3, Q4) based on the "ds" column
        for quarter_number in range(1, 5):
            data[f"Q{quarter_number}"] = (data["ds"].dt.quarter == quarter_number).astype(int)

        fut_exog_features = ais_features + ["Q1", "Q2", "Q3", "Q4"]
        past_exog_features = fut_exog_features + ["y"]

        #Drop the columns that are already encoded in unique_id
        data = data.drop(columns=["terminal", "port", "commodity"])

        #Split on the quarter label, then drop it; drop() returns new frames, so no slice is mutated
        is_test = data["quarter"] >= begin_testing
        train_df = data[~is_test].drop(columns=["quarter"])
        test_df = data[is_test].drop(columns=["quarter"])

        return dict(
            train_df=train_df,
            test_df=test_df,
            fut_exog_features=fut_exog_features,
            past_exog_features=past_exog_features,
        )

    dataset_return = dataset()
    train_df = dataset_return['train_df']
    test_df = dataset_return['test_df']
    fut_exog_features = dataset_return['fut_exog_features']
    past_exog_features = dataset_return['past_exog_features']

    #Find the saved-model folder whose name starts with the study name
    result_folder = par / 'Outputs' / 'TCN_Outputs'
    #Sorted so that the choice is deterministic when several folders match
    matching_folders = sorted(
        f for f in os.listdir(result_folder)
        if f.startswith(study_name) and os.path.isdir(os.path.join(result_folder, f))
    )
    if not matching_folders:
        raise FileNotFoundError(
            "No folder starting with '" + study_name + "' found in " + str(result_folder)
        )
    best_trial_folder = matching_folders[0]
    best_trial_path = result_folder / best_trial_folder

    print('Best Trial is ' + str(best_trial_folder))

    #Load the best model
    load_tcn = NeuralForecast.load(path=best_trial_path, verbose=True)
    predictions = load_tcn.predict(futr_df=test_df, verbose=True)
    # NOTE(review): presumably older neuralforecast returns unique_id as the index and newer
    # versions as a column - confirm. Only reset when it is not already a column, so that no
    # stray 'index' column ends up in the results.
    if "unique_id" not in predictions.columns:
        predictions = predictions.reset_index()

    results_df = pd.merge(predictions, test_df[["ds", "unique_id", "y"]], on=["ds", "unique_id"], how="left")
    #Rename y to Actuals, TCN to Predictions
    results_df = results_df.rename(columns={"y": "Actuals", "TCN": "Predictions"})
    #Break unique_id into PSA, sub_folder, Commodity
    results_df[["PSA", "sub_folder", "Commodity"]] = results_df["unique_id"].str.split("|", expand=True)
    #Add column called Model
    results_df["Model"] = "TCN"
    #Add column Aggregation (the part of the study name after 'NewData_')
    results_df["Aggregation"] = study_name.split('NewData_')[1]
    #Convert ds to quarter
    results_df["quarter"] = pd.PeriodIndex(results_df["ds"], freq="Q")
    #Drop the helper columns that are no longer needed
    results_df = results_df.drop(columns=["ds", "unique_id", "PSA"])

    #Append to prediction_combined_list
    prediction_combined_list.append(results_df)

#Concatenate all dataframes in prediction_combined_list
prediction_combined_df = pd.concat(prediction_combined_list)
print(prediction_combined_df.head())



In [None]:
#Export to csv. Change mode to 'w' if you want to overwrite
####prediction_combined_df.to_csv(par / 'Outputs' / 'TCN_Outputs' / 'TCN_Predictions.csv', index=False, mode='x')