In [89]:
import joblib
import os
import pandas as pd
from pathlib import Path
import signal
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [92]:
config = dict(
        epochs=1200,
        learning_rate=0.001,
        weight_decay=1e-5,
        dropout=0.05,
        shuffle=True,
        nprocs_filter=False,
        random_seed=1234,
        num_folds = 5,
        test_size=0.2
    )

class ConfigStruct:
    def __init__(self, **entries):
        self.__dict__.update(entries)

config = ConfigStruct(**config)
config

<__main__.ConfigStruct at 0x2b52dd117df0>

In [94]:
MODEL_FILENAME = "SmoothL1Loss_fixed_Adamax_fewer_neurons_0.2_testSize_new_StandardScaler_2048_batch_0.05_dropout_pytorch_v1.12.tar"
MODEL_DIR = rf"/home/thes1067/models/blue_waters"
MODEL_PATH = Path(MODEL_DIR, MODEL_FILENAME)

In [95]:
DATASET_DIR = r"/home/thes1067/data/claix_dataset/data/claix"
DATASET_NAME = "claix_posix_npb_4_16_64_nprocs_Ciao_C_1288"
DATASET_PATH = Path(DATASET_DIR, DATASET_NAME).with_suffix(".csv")
SYNTH_DATASET_NAME = "posix_synth_TVAE_70_batch_900_epochs.csv"
SYNTH_DATASET_PATH = Path(DATASET_DIR, SYNTH_DATASET_NAME).with_suffix(".csv")

In [97]:
MODEL_PATH.is_file()

True

## Load the data

In [101]:
df_claix_posix = pd.read_csv(DATASET_PATH)

In [103]:
df_claix_posix.head()

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_FDSYNCS,...,POSIX_F_VARIANCE_RANK_TIME,POSIX_F_VARIANCE_RANK_BYTES,uid,start_time,end_time,nprocs,jobid,lib_ver,hints,bandwidth
0,2288,2106,0,21053,26218,26541,185,-133,0,0,...,1.941565e-06,89324900.0,42282,2022-01-29 12:07:26,2022-01-29 12:11:42,24,209310,3.3.1,romio_no_indep_rw=true;cb_nodes=4,4420.63043
1,16112,12718,0,389198,35189,236638,1933,-43,0,0,...,27.83937,4.17609e+16,36826,2021-09-06 14:14:32,2021-09-06 14:15:38,240,23029701,3.1.8,romio_no_indep_rw=true;cb_nodes=4,407.677717
2,646,322,0,5184,161,1458,0,-163,0,0,...,2.515119e-06,0.0,36826,2021-09-06 11:00:54,2021-09-06 11:00:55,162,23022111,3.1.8,romio_no_indep_rw=true;cb_nodes=4,0.084217
3,2288,2106,0,21053,26218,26541,185,-133,0,0,...,4.719168e-07,89324900.0,42282,2022-01-31 14:36:36,2022-01-31 14:40:34,24,175062,3.3.1,romio_no_indep_rw=true;cb_nodes=4,41753.913256
4,2288,2106,0,21053,26218,26541,185,-133,0,0,...,3.402062e-07,89324900.0,42282,2022-01-10 11:02:43,2022-01-10 11:06:41,24,217152,3.3.1,romio_no_indep_rw=true;cb_nodes=4,50749.073598


### Drop the non-invariant columns

In [104]:
df_claix_posix = df_claix_posix.drop(['uid', 'jobid', 'hints', 'start_time', 'end_time', 'lib_ver'],
                                                               axis=1)

### Drop columns to match the Blue Waters dataset on which the model was trained

In [105]:
df_claix_posix = df_claix_posix.drop(['POSIX_FDSYNCS',
                                        'POSIX_RENAMED_FROM',
                                        'POSIX_F_VARIANCE_RANK_TIME',
                                        'POSIX_F_VARIANCE_RANK_BYTES'],
                                        axis=1)			

In [108]:
q25, q75 = np.percentile(df_claix_posix, 25), np.percentile(df_claix_posix, 75)
iqr = q75 - q25

In [110]:
bandwidth_q1 = df_claix_posix.bandwidth.quantile(0.25)
bandwidth_q3 = df_claix_posix.bandwidth.quantile(0.75)
bandwidth_iqr = bandwidth_q3 - bandwidth_q1

filter = (df_claix_posix.bandwidth >= bandwidth_q1 - 1.5 * bandwidth_iqr) & (df_claix_posix.bandwidth <= bandwidth_q3 + 1.5 * bandwidth_iqr)

bandwidth_outliers = df_claix_posix.loc[(filter)==False]
bandwidth_outliers

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_READ_TIME,POSIX_F_WRITE_TIME,POSIX_F_META_TIME,POSIX_TOTAL_TIME,POSIX_F_MAX_READ_TIME,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,nprocs,bandwidth


### Load synth data

In [112]:
df_synth = pd.read_csv(SYNTH_DATASET_PATH)
df_bandwidth_synth = df_synth.pop('bandwidth')

### Separate bandwidth from input features

In [113]:
df_bandwidth_real = df_claix_posix.pop('bandwidth')

### Import Pytorch

In [114]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch

In [115]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

Using cpu


In [117]:
torch.manual_seed(1234)

<torch._C.Generator at 0x2b5237d5f850>

### Load the pre-trained model

In [118]:
model = nn.Sequential(
    nn.Linear(97, 2048),
    nn.Dropout(p=config.dropout),
    nn.ReLU(),
    nn.Linear(2048, 512),
    nn.Dropout(p=config.dropout),
    nn.ReLU(),
    nn.Linear(512, 128),
    nn.Dropout(p=config.dropout),
    nn.ReLU(),
    nn.Linear(128, 1),
).to(device)

In [119]:
model.modules

<bound method Module.modules of Sequential(
  (0): Linear(in_features=97, out_features=2048, bias=True)
  (1): Dropout(p=0.05, inplace=False)
  (2): ReLU()
  (3): Linear(in_features=2048, out_features=512, bias=True)
  (4): Dropout(p=0.05, inplace=False)
  (5): ReLU()
  (6): Linear(in_features=512, out_features=128, bias=True)
  (7): Dropout(p=0.05, inplace=False)
  (8): ReLU()
  (9): Linear(in_features=128, out_features=1, bias=True)
)>

In [120]:
if Path(MODEL_PATH).is_file():
    print("Loading pretrained model...")

    checkpoint = torch.load(MODEL_PATH, map_location=torch.device(device))
    model.load_state_dict(checkpoint['model_state_dict'])
    model_epoch = checkpoint['epoch']

    print(f"Current epoch: {model_epoch}")

Loading pretrained model...
Current epoch: 599


### Reset the weights for the output layer for fine-tuning

In [None]:
output_layer = list(model.children())[-1]
output_layer.reset_parameters()

# Do the transfer learning

In [125]:
config = dict(
    epochs=1200,
    learning_rate=0.001,
    weight_decay=1e-5,
    dropout=0.05,
    scaling="StandardScaler",
    dataset="CLAIX",
    outliers=False,
    shuffle=True,
    loss="SmoothL1Loss",
    smooth_l1_loss_beta=1.0,
    optimizer="Adam",
    learning_rate_scheduler=True,
    pytorch_version=torch.__version__,
    test_size=0.2,
    fixed_test_loss_calc=True,
    drop_last=False,
    batch_norm=False,
    train_synth_data=True,
    synth_dataset=SYNTH_DATASET_NAME
)

## Split the Claix data into the train/test pair

In [127]:
X_train = df_synth
X_test = df_claix_posix
y_train = df_bandwidth_synth
y_test = df_bandwidth_real

In [129]:
len(X_train)

200

In [130]:
# ### Scale the input features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [131]:
tensor_X_train = torch.Tensor(X_train_scaled).to(device)
tensor_y_train = torch.Tensor(y_train.values).view(-1, 1).to(device) #Transform to a 2D array to avoid shape mismatch (gives errors)

training_dataset = TensorDataset(tensor_X_train, tensor_y_train)
# The dataset is so small we can process it whole in one batch
training_dataloader = DataLoader(training_dataset, batch_size=len(training_dataset), shuffle=config.shuffle)

In [132]:
# ### Scale the input features
X_test_scaled = scaler.transform(X_test)

In [133]:
tensor_X_test = torch.Tensor(X_test_scaled).to(device)
tensor_y_test = torch.Tensor(y_test.values).view(-1, 1).to(device) #Transform to a 2D array to avoid shape mismatch (gives errors)

test_dataset = TensorDataset(tensor_X_test, tensor_y_test)
# The dataset is so small we can process it whole in one batch
test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset))

In [134]:
len(training_dataloader)

1

In [135]:
len(test_dataloader)

1

In [136]:
loss_fn = nn.SmoothL1Loss(reduction="sum").to(device)

In [138]:
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)

In [139]:
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True)

In [140]:
model_epoch = 0
model.train()

Sequential(
  (0): Linear(in_features=97, out_features=2048, bias=True)
  (1): Dropout(p=0.05, inplace=False)
  (2): ReLU()
  (3): Linear(in_features=2048, out_features=512, bias=True)
  (4): Dropout(p=0.05, inplace=False)
  (5): ReLU()
  (6): Linear(in_features=512, out_features=128, bias=True)
  (7): Dropout(p=0.05, inplace=False)
  (8): ReLU()
  (9): Linear(in_features=128, out_features=1, bias=True)
)

In [143]:
def train(epoch):
    size = len(training_dataloader)
    for batch, (X, y) in enumerate(training_dataloader):
        y_pred = model(X)
        
        # Divide the summed loss by the number of elements in the current batch to get the average loss
        loss = loss_fn(y, y_pred) / len(X)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss = loss.item()

    model.train()

In [144]:
def test():
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X, y in test_dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item() 

    # Divide the summed test loss by the number of elements in the whole test dataset to get the average loss
    test_loss /= len(test_dataloader.dataset)

    return test_loss

In [145]:
for epoch in range(model_epoch, config.epochs):
    train(epoch)
    test_loss = test()
    print(f"Epoch {epoch+1} loss: Avg loss: {test_loss:>8f} \n")

    scheduler.step(test_loss)

    model_epoch = epoch

Epoch 1 loss: Avg loss: 25241.164894 

Epoch 2 loss: Avg loss: 24555.566489 

Epoch 3 loss: Avg loss: 23831.835106 

Epoch 4 loss: Avg loss: 23076.898936 

Epoch 5 loss: Avg loss: 22282.118351 

Epoch 6 loss: Avg loss: 21446.789894 

Epoch 7 loss: Avg loss: 20585.776596 

Epoch 8 loss: Avg loss: 19706.800532 

Epoch 9 loss: Avg loss: 18810.631649 

Epoch 10 loss: Avg loss: 17883.035904 

Epoch 11 loss: Avg loss: 16931.659574 

Epoch 12 loss: Avg loss: 16021.521277 

Epoch 13 loss: Avg loss: 15194.454787 

Epoch 14 loss: Avg loss: 14435.513298 

Epoch 15 loss: Avg loss: 13702.368351 

Epoch 16 loss: Avg loss: 12981.253989 

Epoch 17 loss: Avg loss: 12255.242021 

Epoch 18 loss: Avg loss: 11577.553191 

Epoch 19 loss: Avg loss: 11122.646941 

Epoch 20 loss: Avg loss: 11084.817819 

Epoch 21 loss: Avg loss: 11583.542553 

Epoch 22 loss: Avg loss: 12487.958777 

Epoch 23 loss: Avg loss: 13613.811170 

Epoch 24 loss: Avg loss: 15059.134309 

Epoch 25 loss: Avg loss: 16501.797872 

Epoch 26 