Typically, you split your data into training and test sets, train the model on the training set, and then evaluate it on the test set. But a single evaluation cannot tell you whether a good result reflects real skill or just a lucky split. Evaluating the model several times, each time on a different held-out portion of the data, gives you more confidence in the model's generalizability and helps you detect overfitting.

***K*-fold cross-validation:** 
- Shuffle the dataset randomly
- Split the dataset into *k* subsets (folds); the example below uses *k* = 3
- Train the model on *k*-1 folds, validate on the remaining fold, and repeat the process *k* times
  - Model 1: trained on Folds 1 and 2, tested on Fold 3, save the evaluation score
  - Model 2: trained on Folds 2 and 3, tested on Fold 1, save the evaluation score
  - Model 3: trained on Folds 3 and 1, tested on Fold 2, save the evaluation score
- Summarize the skill of the model using the sample of model evaluation scores (e.g., their mean and spread across the *k* folds)

In [None]:
import torch
import warnings
import numpy as np
import xarray as xr
import proplot as pplt
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import r2_score,mean_squared_error
from torch.utils.data import TensorDataset,DataLoader,Subset
warnings.filterwarnings('ignore')

In [6]:
# Notebook-wide configuration shared by the cells below.
# Fraction of time steps held out as the test set (default `test_size` in `load`).
TESTSIZE    = 0.2
# Seed handed to `train_test_split` and `KFold` so the data splits are repeatable.
RANDOMSTATE = 42
# Number of cross-validation folds (k).
NSPLITS     = 6
# Samples per mini-batch for every DataLoader.
BATCHSIZE   = 322
# Widths of the MLP input, output, and hidden layers.
INPUTSIZE   = 1
OUTPUTSIZE  = 1
HIDDENSIZE  = 64
# Activation module passed to the MLP constructor.
ACTIVATION  = torch.nn.ReLU()
# Learning rate given to the Adam optimizer.
LEARNING    = 0.005
# Training epochs run for each fold.
EPOCHS      = 6
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE      = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
def load(filename,xname,yname,testsize=TESTSIZE,randomstate=RANDOMSTATE,
         filedir='/global/cfs/cdirs/m4334/sferrett/monsoon-pod/data/processed'):
    """Read two variables from a NetCDF file and split them into train/test sets along time.

    Args:
        filename: Name of the NetCDF file inside `filedir`.
        xname: Name of the input (predictor) variable in the dataset.
        yname: Name of the target variable in the dataset.
        testsize: Fraction of time steps assigned to the test set.
        randomstate: Seed for `train_test_split`, making the split repeatable.
        filedir: Directory containing `filename`; defaults to the project's
            processed-data directory.

    Returns:
        Tuple `(xtrain,ytrain,xtest,ytest)` of DataArrays selected along `time`.
    """
    # The context manager closes the file handle; `.load()` has already pulled
    # the values into memory, so the arrays stay usable afterwards.
    with xr.open_dataset(f'{filedir}/{filename}') as data:
        x = data[xname].load()
        y = data[yname].load()
    timeidxs = np.arange(x.sizes['time'])
    trainidxs,testidxs = train_test_split(timeidxs,test_size=testsize,random_state=randomstate)
    # Select by dimension name rather than by position, so the split is along
    # `time` even if it is not the leading axis.
    # NOTE(review): assumes `y` also has a `time` dimension matching `x` — confirm.
    xtrain = x.isel(time=trainidxs)
    ytrain = y.isel(time=trainidxs)
    xtest  = x.isel(time=testidxs)
    ytest  = y.isel(time=testidxs)
    return xtrain,ytrain,xtest,ytest

def normalize(array,mean=None,std=None):
    """Standardize `array` by subtracting a mean and dividing by a standard deviation.

    Args:
        array: Values to standardize.
        mean: Mean to subtract; computed from `array` when None.
        std: Standard deviation to divide by; computed from `array` when None.

    Returns:
        Tuple `(normalized,mean,std)`. `mean` and `std` are returned unchanged
        (or as computed) so they can be reused for other data splits.
    """
    if mean is None:
        mean = np.mean(array)
    if std is None:
        std  = np.std(array)
    # A constant input has std == 0; dividing by it would fill the result with
    # NaN/inf. Divide by 1 instead so the output is all zeros, while still
    # returning the true std to the caller.
    divisor = np.where(np.asarray(std)==0,1.0,std)
    return (array-mean)/divisor,mean,std

def preprocess(x,y,training=True,normparams=None):
    """Flatten, standardize, and convert an input/target pair to float32 tensors.

    Args:
        x: Input data exposing a `.values` array.
        y: Target data exposing a `.values` array.
        training: When True, normalization statistics are computed from `x`
            and `y` themselves; when False, the statistics in `normparams`
            are applied instead.
        normparams: Dict with keys 'xmean', 'xstd', 'ymean', 'ystd'. Ignored
            when `training` is True; required otherwise.

    Returns:
        `(xtensor,ytensor,normparams)` when `training` is True, otherwise
        `(xtensor,ytensor)`.

    Raises:
        ValueError: If `training` is False and `normparams` is missing or empty.
    """
    xarray = x.values.flatten().astype(np.float64)
    yarray = y.values.flatten().astype(np.float64)
    if training:
        xnorm,xmean,xstd = normalize(xarray)
        ynorm,ymean,ystd = normalize(yarray)
        normparams = {'xmean':xmean,'xstd':xstd,'ymean':ymean,'ystd':ystd}
    else:
        # Reject any falsy placeholder (None, False, {}), not just None, so a
        # missing set of statistics fails here with a clear message.
        if not normparams:
            raise ValueError("'normparams' must be provided for validation and test sets.")
        xnorm,_,_ = normalize(xarray,normparams['xmean'],normparams['xstd'])
        ynorm,_,_ = normalize(yarray,normparams['ymean'],normparams['ystd'])
    # Statistics are computed in float64; the tensors are stored as float32.
    xtensor = torch.as_tensor(xnorm,dtype=torch.float32)
    ytensor = torch.as_tensor(ynorm,dtype=torch.float32)
    return (xtensor,ytensor,normparams) if training else (xtensor,ytensor)

class MLP(torch.nn.Module):
    """Fully connected network with two hidden layers of equal width.

    The layer stack is Linear -> activation -> Linear -> activation -> Linear,
    stored as a `torch.nn.Sequential` under `self.layers`. The same
    `activation` module instance is used after both hidden layers.
    """
    def __init__(self,inputsize,hiddensize,outputsize,activation):
        super().__init__()
        stack = [
            torch.nn.Linear(inputsize,hiddensize),
            activation,
            torch.nn.Linear(hiddensize,hiddensize),
            activation,
            torch.nn.Linear(hiddensize,outputsize),
        ]
        self.layers = torch.nn.Sequential(*stack)
    def forward(self,x):
        """Pass `x` through the layer stack and return the network output."""
        return self.layers(x)

def train(model,dataloader,criterion,optimizer,device):
    """Run one optimization pass over `dataloader`.

    Puts `model` in training mode, then for every batch moves the data to
    `device`, clears gradients, computes the loss, backpropagates, and takes
    an optimizer step.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    batchlosses = []
    for features,labels in dataloader:
        features = features.to(device)
        labels   = labels.to(device)
        optimizer.zero_grad()
        batchloss = criterion(model(features),labels)
        batchloss.backward()
        optimizer.step()
        batchlosses.append(batchloss.item())
    return sum(batchlosses)/len(dataloader)

def evaluate(model,dataloader,criterion,device):
    """Score `model` on every batch of `dataloader` without updating it.

    Puts `model` in eval mode and disables gradient tracking while iterating.

    Args:
        model: Network to evaluate.
        dataloader: Iterable of `(inputs,targets)` batches supporting `len()`.
        criterion: Loss function called as `criterion(outputs,targets)`.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple `(meanloss,r2,rmse,targets,outputs)`: the per-batch loss
        averaged over batches, the R² score and RMSE over all samples, and
        the stacked targets and model outputs as NumPy arrays.

    Raises:
        ValueError: If `dataloader` yields no batches.
    """
    model.eval()
    totalloss    = 0
    targetchunks = []
    outputchunks = []
    with torch.no_grad():
        for inputs,targets in dataloader:
            inputs,targets = inputs.to(device),targets.to(device)
            outputs = model(inputs)
            totalloss += criterion(outputs,targets).item()
            # Keep one array per batch; stacking once at the end avoids building
            # a Python list with a separate tiny array for every sample.
            targetchunks.append(targets.cpu().numpy())
            outputchunks.append(outputs.cpu().numpy())
    if not targetchunks:
        raise ValueError('dataloader yielded no batches to evaluate.')
    alltargets = np.concatenate(targetchunks)
    alloutputs = np.concatenate(outputchunks)
    r2   = r2_score(alltargets,alloutputs)
    rmse = np.sqrt(mean_squared_error(alltargets,alloutputs))
    return totalloss/len(dataloader),r2,rmse,alltargets,alloutputs

In [8]:
# Load the predictor ('subsat') and target ('bl') variables and split them into train/test sets.
xtrain,ytrain,xtest,ytest = load(filename='LR_ERA5_IMERG_pr_bl_terms.nc',xname='subsat',yname='bl')
# Normalization statistics come from the training split only; `normparams` is an
# output here, so nothing is passed in for it.
xtraintensor,ytraintensor,normparams = preprocess(xtrain,ytrain,training=True)
# The test split reuses the training statistics.
xtesttensor,ytesttensor   = preprocess(xtest,ytest,training=False,normparams=normparams)

In [9]:
# Fold generator for cross-validation over the training samples.
kfold        = KFold(shuffle=True,n_splits=NSPLITS,random_state=RANDOMSTATE)
# `unsqueeze(1)` turns each flat tensor of length N into shape (N,1), giving every sample a feature axis.
traindataset = TensorDataset(*(tensor.unsqueeze(1) for tensor in (xtraintensor,ytraintensor)))
testdataset  = TensorDataset(*(tensor.unsqueeze(1) for tensor in (xtesttensor,ytesttensor)))
# Training batches are reshuffled on each pass; test batches keep their order.
trainloader  = DataLoader(dataset=traindataset,batch_size=BATCHSIZE,shuffle=True)
testloader   = DataLoader(dataset=testdataset,batch_size=BATCHSIZE,shuffle=False)

In [10]:
# Per-fold bookkeeping: final metrics, loss curves, and final-epoch validation targets/predictions.
results        = []
alltrainlosses = []
allvallosses   = []
allyactual     = []
allypred       = []
for fold,(trainidx,validx) in enumerate(kfold.split(traindataset)):
    print(f'Fold {fold+1}/{NSPLITS} ________________________________________')
    trainsubset = Subset(traindataset,trainidx)
    valsubset   = Subset(traindataset,validx)
    # The fold loader gets its own name so it does not overwrite the
    # full-training-set `trainloader` built in the previous cell.
    foldtrainloader = DataLoader(trainsubset,batch_size=BATCHSIZE,shuffle=True)
    valloader       = DataLoader(valsubset,batch_size=BATCHSIZE,shuffle=False)
    # Every fold starts from a freshly initialized model and optimizer.
    model     = MLP(INPUTSIZE,HIDDENSIZE,OUTPUTSIZE,ACTIVATION).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(),lr=LEARNING)
    criterion = torch.nn.MSELoss()
    trainlosses = []
    vallosses   = []
    for epoch in range(EPOCHS):
        trainloss = train(model,foldtrainloader,criterion,optimizer,DEVICE)
        valloss,valr2,valrmse,valtargets,valpreds = evaluate(model,valloader,criterion,DEVICE)
        trainlosses.append(trainloss)
        vallosses.append(valloss)
        print(f'Epoch {epoch+1}/{EPOCHS} - Train Loss: {trainloss:.4f}, Val Loss: {valloss:.4f}')
    alltrainlosses.append(trainlosses)
    allvallosses.append(vallosses)
    # Keep the final-epoch validation targets and predictions for this fold.
    allyactual.append(valtargets)
    allypred.append(valpreds)
    # The recorded metrics are those from the last epoch of the fold.
    results.append({'fold':fold,'loss':valloss,'r2':valr2,'rmse':valrmse})

Fold 1/6 ________________________________________
Epoch 1/6 - Train Loss: 0.0363, Val Loss: 0.0360
Epoch 2/6 - Train Loss: 0.0352, Val Loss: 0.0350
Epoch 3/6 - Train Loss: 0.0351, Val Loss: 0.0347
Epoch 4/6 - Train Loss: 0.0351, Val Loss: 0.0350
Epoch 5/6 - Train Loss: 0.0350, Val Loss: 0.0371
Epoch 6/6 - Train Loss: 0.0350, Val Loss: 0.0349
Fold 2/6 ________________________________________
Epoch 1/6 - Train Loss: 0.0362, Val Loss: 0.0348
Epoch 2/6 - Train Loss: 0.0353, Val Loss: 0.0350
Epoch 3/6 - Train Loss: 0.0352, Val Loss: 0.0350
Epoch 4/6 - Train Loss: 0.0351, Val Loss: 0.0347
Epoch 5/6 - Train Loss: 0.0350, Val Loss: 0.0347
Epoch 6/6 - Train Loss: 0.0349, Val Loss: 0.0347
Fold 3/6 ________________________________________
Epoch 1/6 - Train Loss: 0.0367, Val Loss: 0.0349
Epoch 2/6 - Train Loss: 0.0352, Val Loss: 0.0352
Epoch 3/6 - Train Loss: 0.0351, Val Loss: 0.0358
Epoch 4/6 - Train Loss: 0.0351, Val Loss: 0.0352


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7ffb24060190>>
Traceback (most recent call last):
  File "/global/homes/s/sferrett/.conda/envs/monsoon-sr/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 

KeyboardInterrupt



In [None]:
# Average each per-fold metric stored in `results` across the folds.
avgloss,avgr2,avgrmse = (np.mean([entry[key] for entry in results]) for key in ('loss','r2','rmse'))
print(
    'Average Cross-Validation Results:',
    f'  Average Loss: {avgloss:.4f}',
    f'  Average R2: {avgr2:.4f}',
    f'  Average RMSE: {avgrmse:.4f}',
    sep='\n')

In [None]:
# Plot training (dashed) and validation (solid) loss curves, one color per fold.
colors = ['red6','orange6','yellow6','green6','blue6','violet6']
fig,ax = pplt.subplots(nrows=1,ncols=1,refwidth=5,refheight=2)
# Epochs are plotted at positions 0..EPOCHS-1, so the x-range follows EPOCHS.
ax.format(suptitle='Training and Validation Losses',xlabel='Epoch',xlim=(0,max(EPOCHS-1,1)),xticks=1,ylabel='Loss')
# Iterate over the folds that were actually recorded: if training stopped
# early, the loss lists hold fewer than NSPLITS entries.
for fold,(foldtrainlosses,foldvallosses,color) in enumerate(zip(alltrainlosses,allvallosses,colors)):
    ax.plot(foldtrainlosses,color=color,linestyle='--',linewidth=1,label=f'Fold {fold+1} Training')
    ax.plot(foldvallosses,color=color,linewidth=1,label=f'Fold {fold+1} Validation')
ax.legend(loc='r',ncols=1)
pplt.show()

In [None]:
# `model` and `criterion` are the objects left behind by the cross-validation
# loop, i.e. the network trained during the last fold that ran.
testloss,testr2,testrmse,testtargets,testpreds = evaluate(
    model=model,dataloader=testloader,criterion=criterion,device=DEVICE)
print('Test Results:',f'  Loss: {testloss:.4f}, R2: {testr2:.4f}, RMSE: {testrmse:.4f}',sep='\n')

In [None]:
def denormalize(normtensor,mean,std):
    """Map standardized values back to the original scale (`value*std+mean`).

    Returns:
        A NumPy array built from `normtensor`, rescaled by `std` and shifted by `mean`.
    """
    rescaled = np.multiply(np.array(normtensor),std)
    return np.add(rescaled,mean)

# Undo the target normalization on the test targets and predictions using the stored 'ymean'/'ystd'.
ytrue,ypred = (denormalize(values,normparams['ymean'],normparams['ystd']) for values in (testtargets,testpreds))

In [11]:
# Left panel: predicted vs. actual scatter with a 1:1 reference line.
# Right panel: histograms of actual and predicted values on a log count axis.
fig,axs = pplt.subplots(nrows=1,ncols=2,refwidth=2,share=False)
# Raw strings keep the LaTeX backslashes literal instead of relying on
# Python passing the invalid `\m` escape through unchanged.
axs[0].format(title=r'Actual vs. Predicted $\mathit{B_L}$',xlabel=r'Actual $\mathit{B_L}$ (m/s$^2$)',ylabel=r'Predicted $\mathit{B_L}$ (m/s$^2$)')
axs[1].format(title=r'Actual vs. Predicted Histograms of $\mathit{B_L}$',xlabel=r'$\mathit{B_L}$ (m/s$^2$)',ylabel='Count',yscale='log',yformatter='log')
axs[0].scatter(ytrue,ypred,color='cyan6',marker='.',markersize=10,alpha=0.5)
# Shared range for the 1:1 line, computed with NumPy reductions rather than
# the builtin min/max, which would loop over the array in Python.
axismin = float(min(np.min(ytrue),np.min(ypred)))
axismax = float(max(np.max(ytrue),np.max(ypred)))
axs[0].plot([axismin,axismax],[axismin,axismax],'k--')
axs[0].text(0.05,0.95,f'R² = {r2_score(ytrue,ypred):.3f}',color='cyan9',transform=axs[0].transAxes,verticalalignment='top',horizontalalignment='left')
axs[1].hist(ytrue,bins=50,filled=True,facecolor='none',edgecolor='k',linewidth=1.5,label='Actual')
axs[1].hist(ypred,bins=50,filled=True,color='cyan6',alpha=0.5,label='Predicted')
axs[1].legend(loc='ul',ncols=1)
pplt.show()