# Tutorial 2.1
This notebook walks through the critical steps of the causal feature selection workflow on the SHIPS developmental data, similar to the first notebook of Part 1. Please run this notebook first: the results it produces are reused by the subsequent Part 2 notebooks.

In [None]:
from tqdm.auto import tqdm 
import pandas as pd
import numpy as np
import xarray as xr
import netCDF4 as nf
from netCDF4 import Dataset
%matplotlib inline
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import ast,gc,pickle
from copy import deepcopy
import pickle
import gc
from tqdm import tqdm
import numpy as np
import os

# Custom packages
import read_config
from util.data_process import read_vars, proc_dataset, miss
from util.models import performance_scores,train_baseline,causal_settings,train_PC1

  from .autonotebook import tqdm as notebook_tqdm


## Useful functions and settings

In [None]:
def save_models(models, filename):
    """Serialize ``models`` to ``filename`` with pickle.

    Parameters
    ----------
    models : object
        Any picklable object (for example a list or dict of fitted models).
    filename : str or path-like
        Destination file. It is opened in binary write mode, so an existing
        file at this location is overwritten.
    """
    with open(filename, mode='wb') as out_file:
        pickle.dump(models, out_file)


def read_pickle(filepath=None):
    """Load and return the object stored in the pickle file at ``filepath``.

    Parameters
    ----------
    filepath : str or path-like
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only read files you trust.
    """
    with open(filepath, mode="rb") as in_file:
        return pickle.load(in_file)


def flatten(xss):
    """Flatten one level of nesting.

    Parameters
    ----------
    xss : iterable of iterables
        The nested collection, e.g. a list of lists.

    Returns
    -------
    list
        The elements of every inner iterable, in their original order.
    """
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat

In [4]:
# Read configuration file
config_set = read_config.read_config()
# Alternative: point at an explicit configuration file instead.
#config_set = read_config.read_config('./config.ini')

# Define Target: map the configured target lag onto the name of the target variable.
TARGET_BY_LAG = {
    20: 'delv120',
    16: 'delv96',
    12: 'delv72',
    8: 'delv48',
    4: 'delv24',
}
lag_steps = int(config_set['target_lag'])
# Fail loudly on an unsupported lag instead of leaving `target` undefined
# (or silently keeping a stale value from an earlier run of this cell).
if lag_steps not in TARGET_BY_LAG:
    raise ValueError(
        f"Unsupported target_lag {lag_steps!r}; expected one of {sorted(TARGET_BY_LAG)}"
    )
target = TARGET_BY_LAG[lag_steps]

## Loop through the 7 splits of the SHIPS+ data (with causal predictors), run PC-stable in Tigramite for each pc_alpha value, and save the results as pickle files in results/4/shipsnew/

In [None]:
# Significance levels swept for every split, to test how sensitive the results are to pc_alpha.
PC_ALPHAS = [0.0001, 0.00015, 0.001, 0.0015, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08,
             0.09, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]

for split in range(7):  # Splits 0 through 6
    # Load the processed time series data for the current split.
    # NOTE(review): the input folder is fixed to delv24 while the output folder below follows
    # config_set['target_lag'] -- confirm target_lag is 4 before running this cell.
    # NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
    split_path = f'./proc/pickle/delv24/dict_split{split}.pkl'
    with open(split_path, 'rb') as f:
        TIDATA = pickle.load(f)

    train_frames = TIDATA['Xnorml']['train']
    # Tigramite needs the column names (variable names) of our data;
    # they are taken from the first training time series.
    var_names = next(iter(train_frames.values())).columns

    # Define the initial causal relationships for the experiment. Refer to the tigramite tutorial
    # if you would like to learn how to create a causal relationship dictionary of your own.
    first_aligned = next(iter(TIDATA['aligned_train'].values()))
    onlyships_lag = causal_settings.link_onlyships(
        numvar=first_aligned.shape[1],
        lag=4,  # 24 hours
        target_ind=[0],  # Convention of our processed time series: the target (delv24) is always in the first column
    )

    results = []
    # Loop through the pc_alpha settings; the progress bar is labelled with the current split.
    for pc_alpha in tqdm(PC_ALPHAS, desc=f'shipsnew split {split}'):
        # Tigramite does not recognize np.nan, so missing values are changed to -999.0
        # (which tigramite recognizes as missing values). Only the training series are
        # converted because they are the only input handed to the pipeline. They are rebuilt
        # for every pc_alpha so each run starts from untouched arrays (the pipeline code is
        # not visible here, so whether it modifies its input is unknown).
        train_arrays = {
            ind: np.asarray(frame.replace(np.nan, -999.0))
            for ind, frame in enumerate(train_frames.values())
        }
        # Run tigramite with all of our settings. Our prediction task only uses information
        # gathered at a given time and not past information, so tau_min and tau_max are
        # expected to be the same in the configuration file.
        result = train_PC1.Pipeline(
            train_arrays,
            pc_alpha,
            pc_type='run_pcstable',
            tau_min0=int(config_set['tau_min']),
            tau_max0=int(config_set['tau_max']),
            var_name=var_names,
            link_assumptions=onlyships_lag
        ).run_tigramite()

        # Release the converted arrays before the next run to keep memory usage flat.
        del train_arrays
        gc.collect()
        results.append(result)

    # Bundle the normalized dataframes, one result per pc_alpha (same order as PC_ALPHAS),
    # and the variable names for the subsequent Part 2 notebooks.
    savetos = {
        'dataframes': TIDATA['Xnorml'],
        'PC1_results': results,
        'var_names': var_names
    }

    output_dir = f'results/{int(config_set["target_lag"])}/shipsnew/'
    output_path = f'{output_dir}results_fold_{split}.pkl'

    os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'wb') as handler:
        pickle.dump(savetos, handler)

100%|██████████| 24/24 [01:00<00:00,  2.50s/it]
100%|██████████| 24/24 [01:03<00:00,  2.65s/it]
100%|██████████| 24/24 [01:05<00:00,  2.73s/it]
100%|██████████| 24/24 [00:57<00:00,  2.40s/it]
100%|██████████| 24/24 [01:06<00:00,  2.78s/it]
100%|██████████| 24/24 [00:59<00:00,  2.47s/it]
100%|██████████| 24/24 [00:51<00:00,  2.14s/it]


## Loop through the 7 splits of the SHIPS developmental data, run PC-stable in Tigramite for each pc_alpha value, and save the results as pickle files in results/4/shipsold/

The loop below is identical to the loop that creates the SHIPS+ results. The only difference is that Tigramite is run on the original SHIPS developmental data (`olddict_split*.pkl`) rather than on the new SHIPS+ data (`dict_split*.pkl`) with the additional predictors.

In [9]:
# Significance levels swept for every split, to test how sensitive the results are to pc_alpha.
PC_ALPHAS = [0.0001, 0.00015, 0.001, 0.0015, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08,
             0.09, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]

for split in range(7):  # Splits 0 through 6
    # Load the current split of the original SHIPS developmental data.
    # NOTE(review): the input folder is fixed to delv24 while the output folder below follows
    # config_set['target_lag'] -- confirm target_lag is 4 before running this cell.
    # NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
    split_path = f'proc/pickle/delv24/olddict_split{split}.pkl'
    with open(split_path, 'rb') as f:
        TIDATA = pickle.load(f)

    train_frames = TIDATA['Xnorml']['train']
    # Variable names for Tigramite, taken from the first training time series.
    var_names = next(iter(train_frames.values())).columns

    # Initial causal relationships passed to Tigramite as link assumptions.
    first_aligned = next(iter(TIDATA['aligned_train'].values()))
    onlyships_lag = causal_settings.link_onlyships(
        numvar=first_aligned.shape[1],
        lag=4,
        target_ind=[0],
    )

    results = []
    # Loop through the pc_alpha settings; the progress bar is labelled with the current split.
    for pc_alpha in tqdm(PC_ALPHAS, desc=f'shipsold split {split}'):
        # Replace np.nan with -999.0 before handing the data to the pipeline. Only the
        # training series are converted because they are the only input the pipeline receives.
        # They are rebuilt for every pc_alpha so each run starts from untouched arrays (the
        # pipeline code is not visible here, so whether it modifies its input is unknown).
        train_arrays = {
            ind: np.asarray(frame.replace(np.nan, -999.0))
            for ind, frame in enumerate(train_frames.values())
        }

        result = train_PC1.Pipeline(
            train_arrays,
            pc_alpha,
            pc_type='run_pcstable',
            tau_min0=int(config_set['tau_min']),
            tau_max0=int(config_set['tau_max']),
            var_name=var_names,
            link_assumptions=onlyships_lag
        ).run_tigramite()

        # Release the converted arrays before the next run to keep memory usage flat.
        del train_arrays
        gc.collect()
        results.append(result)

    # Bundle the normalized dataframes, one result per pc_alpha (same order as PC_ALPHAS),
    # and the variable names for the subsequent Part 2 notebooks.
    savetos = {
        'dataframes': TIDATA['Xnorml'],
        'PC1_results': results,
        'var_names': var_names
    }

    output_dir = f'results/{int(config_set["target_lag"])}/shipsold/'
    output_path = f'{output_dir}results_fold_{split}.pkl'

    os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'wb') as handler:
        pickle.dump(savetos, handler)

100%|██████████| 24/24 [00:44<00:00,  1.85s/it]
100%|██████████| 24/24 [00:42<00:00,  1.78s/it]
100%|██████████| 24/24 [00:40<00:00,  1.68s/it]
100%|██████████| 24/24 [00:33<00:00,  1.38s/it]
100%|██████████| 24/24 [00:49<00:00,  2.08s/it]
100%|██████████| 24/24 [00:41<00:00,  1.73s/it]
100%|██████████| 24/24 [00:37<00:00,  1.57s/it]
