In [1]:
import os

import rtdl
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Load the data
import pandas as pd

# Paths of the training CSV and the dev_in CSV (the latter is read in a later cell).
train_path, dev_in_path = '../../zipped/train.csv', '../../zipped/dev_in.csv'

# Read the training CSV and preview its first rows.
df_train = pd.read_csv(filepath_or_buffer=train_path)
df_train.head()


Unnamed: 0,fact_time,fact_latitude,fact_longitude,fact_temperature,fact_cwsm_class,climate,topography_bathymetry,sun_elevation,climate_temperature,climate_pressure,...,cmc_0_1_66_0_grad,cmc_0_1_66_0_next,cmc_0_1_67_0_grad,cmc_0_1_67_0_next,cmc_0_1_68_0_grad,cmc_0_1_68_0_next,gfs_2m_dewpoint_grad,gfs_2m_dewpoint_next,gfs_total_clouds_cover_low_grad,gfs_total_clouds_cover_low_next
0,1543321000.0,26.9688,-99.248901,2.0,0.0,dry,127.0,-17.526443,14.613571,754.263405,...,0.0,0.0,0.0,0.0,0.0,0.0,-2.600006,-2.750006,0.0,0.0
1,1538776000.0,29.374201,-100.927002,31.0,20.0,mild temperate,297.0,41.531032,26.992143,733.117168,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.600006,17.950006,-12.0,11.0
2,1552115000.0,22.149599,113.592003,17.0,10.0,mild temperate,-1.0,43.916531,18.842143,761.571076,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.233978,21.450006,1.0,8.0
3,1549566000.0,34.678699,-86.684799,24.0,20.0,mild temperate,193.0,40.240955,8.303571,747.52491,...,0.0,0.0,0.0,0.0,0.0,0.0,0.059448,16.150018,-58.0,41.0
4,1552910000.0,46.066667,41.966667,9.0,20.0,dry,90.0,30.39466,6.451429,753.168113,...,0.0,0.0,0.0,0.0,0.0,0.0,0.400024,3.150018,18.0,92.0


In [3]:
# Read the dev_in CSV and preview its first rows.
df_dev_in = pd.read_csv(filepath_or_buffer=dev_in_path)
df_dev_in.head()

Unnamed: 0,fact_time,fact_latitude,fact_longitude,fact_temperature,fact_cwsm_class,climate,topography_bathymetry,sun_elevation,climate_temperature,climate_pressure,...,cmc_0_1_66_0_grad,cmc_0_1_66_0_next,cmc_0_1_67_0_grad,cmc_0_1_67_0_next,cmc_0_1_68_0_grad,cmc_0_1_68_0_next,gfs_2m_dewpoint_grad,gfs_2m_dewpoint_next,gfs_total_clouds_cover_low_grad,gfs_total_clouds_cover_low_next
0,1539162000,-40.35,-9.88,11.0,10,tropical,-843.0,31.78249,10.070714,765.631228,...,0.0,0.0,0.0,0.0,0.0,0.0,0.505035,2.647577,2.0,2.0
1,1545006600,53.421299,-6.27007,4.0,10,mild temperate,67.0,-59.691521,7.005,752.897615,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.400024,1.249994,0.0,0.0
2,1540094400,-19.7577,63.361,26.0,10,dry,6.0,35.250889,23.327143,763.115016,...,0.0,0.0,0.0,0.0,0.0,0.0,0.100006,21.050012,-1.0,1.0
3,1552611600,35.245899,47.009201,5.0,10,mild temperate,1390.0,-23.755615,3.109286,609.419333,...,1.69672,5.1653,0.0,4.9e-05,0.0,0.0,-1.5,-0.349982,-12.0,81.0
4,1545631200,26.633333,118.15,14.0,20,mild temperate,210.0,33.040438,12.172143,734.678037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.102081,11.513879,-15.0,83.0


In [4]:
# Remaining CSVs; note that the eval_* names point at the test_* files.
dev_out_path = '../../data/dev_out.csv'
eval_in_path = '../../data/test_in.csv'
eval_out_path = '../../data/test_out.csv'

# Read the three files in the order dev_out, eval_in, eval_out.
df_dev_out, df_eval_in, df_eval_out = (
    pd.read_csv(csv_path)
    for csv_path in (dev_out_path, eval_in_path, eval_out_path)
)

In [5]:
# Identify the categorical features: every df_dev_in column whose values,
# de-duplicated through dict keys, number fewer than 20.
cat_features = [
    name
    for name in df_dev_in
    if len(dict.fromkeys(df_dev_in[name].tolist())) < 20
]
print(cat_features)

['fact_cwsm_class', 'climate', 'cmc_available', 'gfs_available', 'gfs_soil_temperature_available', 'gfs_timedelta_s', 'wrf_available']


In [6]:
# !pip install scipy
from scipy import stats
import numpy as np

# For every training column pick the value that will stand in for NaNs:
# the most frequent value for categorical columns, the mean otherwise.
nan_replacements = {}
for col in df_train:
    if col in cat_features:
        # Series.mode() ignores NaNs, works on non-numeric columns such as
        # 'climate', and returns its result sorted, so .iloc[0] is the smallest
        # of any tied modes. The previous scipy.stats.mode(...)[0][0] only works
        # on old SciPy releases: newer ones return a scalar for 1-D input and
        # reject non-numeric data.
        modes = df_train[col].mode()
        # An all-NaN column has no mode; keep NaN so the later fillna is a no-op.
        nan_replacements[col] = modes.iloc[0] if len(modes) else np.nan
    else:
        nan_replacements[col] = np.mean(df_train[col].dropna().to_numpy())
print(nan_replacements.values())

dict_values([1545219635.2602534, 27.3491997995972, -19.454721235367906, 15.030736293991083, 0.0, 'mild temperate', 309.93658796685, -2.7249874731371273, 15.053651434869764, 731.2756530851661, 289.1659257771938, 0.0513848381221949, 287.04881832634976, 287.0853175734476, 287.0339327353252, 258.9922501121575, 274.97475431188, 282.77351197349384, 285.8909051108904, 281.3276019215079, 7.660611531165116, 5.534873822269684, 14.704511863943937, 12.971739936778564, 8.78866233485156, 7.2775056043593676, 0.009529923266778687, 0.00859610917297107, 6.413124162323838, 0.2673820249969054, 0.027512247376225554, 0.008222988603064724, 3.866966535018241e-05, -0.0020003342605871715, -0.05948973513452053, 10.42531664950694, 5.2929354486362215, 1.8376276146288066, 0.4768480677913728, -0.09701087752828826, -0.14489588205685547, 0.22814633531193007, 0.2219977889627698, 0.30237136633108197, 0.18710502254215602, 97345.58898593165, 97344.2322319218, 101661.80541725569, 137.95607755299883, 5728.521789532874, 3094

In [7]:
# Replace nans: every frame gets the same per-column replacement value that was
# derived from the training data.
for col in df_train:
    fill_value = nan_replacements[col]
    for frame in (df_train, df_dev_in, df_dev_out, df_eval_in, df_eval_out):
        frame[col] = frame[col].fillna(fill_value)

In [8]:
# !pip install scikit-learn

from typing import Dict

import numpy as np
import sklearn.preprocessing


def normalize(
    X: Dict[str, np.ndarray], normalization: str, seed: int, noise: float = 1e-3
) -> Dict[str, np.ndarray]:
    '''
    Fit a normaliser on X['train'] and apply it to every split in X.

    X: mapping of split name to a 2-D array, e.g.
       {'train': <train_size x n_features>, 'val': ..., 'test': ...};
       the 'train' entry is required.
    normalization: 'standard' (StandardScaler) or 'quantile'
       (QuantileTransformer with a normal output distribution).
    seed: random state of the quantile transformer and of the noise generator.
    noise: only used for 'quantile'. When non-zero, Gaussian noise scaled per
       feature by noise / max(std, noise) is added to a copy of the training
       data before fitting; the arrays in X themselves are left unchanged.

    Returns a dict with the same keys as X holding the transformed arrays.
    Raises ValueError for an unknown normalization name.
    '''
    X_train = X['train']
    if normalization == 'standard':
        normalizer = sklearn.preprocessing.StandardScaler()
    elif normalization == 'quantile':
        normalizer = sklearn.preprocessing.QuantileTransformer(
            output_distribution='normal',
            # One quantile per 30 training rows, clamped to [10, 1000].
            n_quantiles=max(min(X['train'].shape[0] // 30, 1000), 10),
            # Must be an int: newer scikit-learn releases validate parameter
            # types and reject the float 1e9.
            subsample=int(1e9),
            random_state=seed,
        )
        if noise:
            # Work on a copy so the caller's training array is not perturbed.
            X_train = X_train.copy()
            stds = np.std(X_train, axis=0, keepdims=True)
            noise_std = noise / np.maximum(stds, noise)
            X_train += noise_std * np.random.default_rng(seed).standard_normal(
                X_train.shape
            )
    else:
        raise ValueError(f'unknown normalization: {normalization}')
    normalizer.fit(X_train)
    # The transform is applied to the original (noise-free) arrays.
    return {k: normalizer.transform(v) for k, v in X.items()}

In [31]:
# Set Seed: the same value seeds NumPy's global RNG and torch (CPU and all CUDA devices).
seed = 10
np.random.seed(seed)
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

In [32]:
# Normalise using train data stats
# Quantile normalisation is used (maps to a normal distribution)
# Features are every column from position 6 onwards; the evaluated split is df_eval_out.
df_test = df_eval_out
X_train_np, X_test_np = (
    np.asarray(frame.iloc[:, 6:]) for frame in (df_train, df_test)
)
X = normalize(
    {'train': X_train_np, 'test': X_test_np}, normalization='quantile', seed=seed
)
X_test_np = X['test']
X_test = torch.FloatTensor(X_test_np)

In [33]:
# Preprocess into tensors

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

def get_lab_to_ind(data_df):
    '''
    Prepare a label to index map

    data_df: object indexable by 'fact_cwsm_class' that yields the labels
             (e.g. a DataFrame).
    Returns a dict mapping each distinct label to an index 0..n_labels-1.

    Indices follow the sorted label order, so the mapping is reproducible
    across runs and processes instead of depending on set iteration order.
    NOTE(review): a checkpoint trained with a different label ordering would
    need its output indices remapped - confirm against the training notebook.
    '''
    labels = sorted(set(data_df['fact_cwsm_class']))
    return {lab: ind for ind, lab in enumerate(labels)}

# The label map is built from the training labels; batches hold 256 rows.
lab_to_ind = get_lab_to_ind(df_train)
batch_size = 256

# Test: encode the test labels as class indices, then wrap features and labels
# in an unshuffled DataLoader.
test_label_inds = [lab_to_ind[lab] for lab in df_test['fact_cwsm_class']]
y_test = torch.LongTensor(np.asarray(test_label_inds))

test_ds = TensorDataset(X_test, y_test)
test_dl = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)

In [34]:
# Get the device

def get_default_device():
    '''
    Return the CUDA device when one is available, otherwise the CPU device,
    printing which of the two was selected.
    '''
    if not torch.cuda.is_available():
        print("No CUDA found")
        return torch.device('cpu')
    print("Got CUDA!")
    return torch.device('cuda')

device = get_default_device()

Got CUDA!


In [35]:
# Create the Feature Transformer Model

# Build an FT-Transformer with default hyper-parameters: every input column is
# treated as numerical (no categorical cardinalities) and there is one output
# per entry of the label map.
model = rtdl.FTTransformer.make_default(
    n_num_features=X_test.shape[1],
    cat_cardinalities=None,
    last_layer_query_idx=[-1],
    d_out=len(lab_to_ind)
)

# The checkpoint file name is keyed by the seed.
model_path = f'./trained_models/FTTransformer/model{seed}.th'
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved from a GPU run still loads when only the CPU is available.
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval().to(device)

FTTransformer(
  (feature_tokenizer): FeatureTokenizer(
    (num_tokenizer): NumericalFeatureTokenizer()
  )
  (cls_token): CLSToken()
  (transformer): Transformer(
    (blocks): ModuleList(
      (0): ModuleDict(
        (attention): MultiheadAttention(
          (W_q): Linear(in_features=192, out_features=192, bias=True)
          (W_k): Linear(in_features=192, out_features=192, bias=True)
          (W_v): Linear(in_features=192, out_features=192, bias=True)
          (W_out): Linear(in_features=192, out_features=192, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (ffn): FFN(
          (linear_first): Linear(in_features=192, out_features=512, bias=True)
          (activation): ReGLU()
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_second): Linear(in_features=256, out_features=192, bias=True)
        )
        (attention_residual_dropout): Dropout(p=0.0, inplace=False)
        (ffn_residual_dropout): Dropout(p=0.0, inplace=False)


In [36]:
# Create pipeline to apply model
def apply_model(model, x_num, x_cat=None):
    '''
    Call the model on a batch.
    An rtdl.FTTransformer is given the numerical and categorical inputs as two
    separate arguments; any other model is called with the numerical input only.
    '''
    if isinstance(model, rtdl.FTTransformer):
        return model(x_num, x_cat)
    return model(x_num)

In [37]:
@torch.no_grad()
def eval(val_loader, model, device):
    '''
    Run evaluation

    val_loader: iterable yielding (inputs, targets) batches; the targets are
                not used here.
    model: module passed to apply_model; it is switched to eval mode.
    device: device the input batches are moved to before the forward pass.

    Returns the model outputs for all batches, in loader order, as a list
    with one list of floats per sample.

    Note: the name shadows the builtin eval(); it is kept so existing calls
    keep working.
    '''
    # switch to eval mode
    model.eval()
    preds = []

    # Targets are ignored, so they are not copied to the device.
    for x, _target in val_loader:
        # Forward pass; gradients are already disabled by the decorator.
        logits = apply_model(model, x.to(device))
        preds.extend(logits.cpu().tolist())

    return preds

In [38]:
# Performance metrics

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve

def metric_accuracy(preds, targets):
    '''
    Accuracy (via accuracy_score) of the arg-max prediction.
    preds: [num_samples x num_classes] scores
    targets: [num_samples] class indices
    '''
    pred_inds = np.asarray(preds).argmax(axis=1)
    return accuracy_score(np.asarray(targets), pred_inds)

def get_avg_f1(preds, labels):
    '''
    Average of the best one-vs-all F1 score per class.

    For every class index that occurs in `labels`, that class's column of
    `preds` is used as the score, F1 is evaluated at every point of the
    precision-recall curve and the maximum is kept. The mean of those maxima
    over the classes is returned.

    preds: [num_samples x num_classes]
    labels: [num_samples] class indices
    '''
    scores = np.asarray(preds)
    label_inds = np.asarray(labels)

    f1s = []
    for class_ind in np.unique(label_inds):
        # One-vs-all targets and scores for this class, built without a
        # per-sample Python loop.
        y_true = (label_inds == class_ind).astype(int)
        y_pred = scores[:, class_ind]
        precision, recall, _ = precision_recall_curve(y_true, y_pred)
        # precision + recall can be 0 at some curve points; the resulting NaNs
        # are dropped below, so the expected divide warnings are silenced.
        with np.errstate(divide='ignore', invalid='ignore'):
            f_scores = (2 * precision * recall) / (precision + recall)
        f_scores_clean = f_scores[np.logical_not(np.isnan(f_scores))]
        f1s.append(np.amax(f_scores_clean))
    return np.mean(np.asarray(f1s))
    

In [39]:
# evaluate on test set: model outputs per sample plus the matching target indices
preds = eval(val_loader=test_dl, model=model, device=device)
targets = [lab_to_ind[label] for label in df_test['fact_cwsm_class']]

In [40]:
out_dir = './predictions/'
dataset = 'eval_out/'
# np.save does not create missing directories, so make sure the target exists.
save_dir = os.path.join(out_dir, dataset)
os.makedirs(save_dir, exist_ok=True)
# save targets
np.save(os.path.join(save_dir, 'targets.npy'), np.asarray(targets))
# save predictions to file (one file per seed)
np.save(os.path.join(save_dir, f'{seed}.npy'), np.asarray(preds))

In [41]:
# Accuracy and class-averaged best F1 for the evaluated split, printed one per line.
accuracy, f_macro = metric_accuracy(preds, targets), get_avg_f1(preds, targets)
print(accuracy, f_macro, sep='\n')



0.4642073024802905
0.2876467428294247


