In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/leap-atmospheric-physics-ai-climsim/sample_submission.csv
/kaggle/input/leap-atmospheric-physics-ai-climsim/test_old.csv
/kaggle/input/leap-atmospheric-physics-ai-climsim/train.csv
/kaggle/input/leap-atmospheric-physics-ai-climsim/test.csv
/kaggle/input/leap-atmospheric-physics-ai-climsim/sample_submission_old.csv


In [2]:
import gc
import os
import random
import time
import torch
import datetime
import numpy as np
import pandas as pd
import polars as pl
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset , DataLoader
from torchmetrics.regression import R2Score

In [3]:
DATA_PATH = "/kaggle/input/"
BATCH_SIZE = 12288
MIN_STD = 1e-6
SCHEDULER_PATIENCE = 3
SCHEDULER_FACTOR = 10**(-0.5)
EPOCHS = 50
PATIENCE = 6
PRINT_FREQ = 50

In [4]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [5]:
def seed_everything(seed_val = 1331):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

In [6]:
ts = time.time()

weights = pd.read_csv(DATA_PATH + "leap-atmospheric-physics-ai-climsim/sample_submission.csv", nrows=1)
del weights['sample_id']
weights = weights.T
weights = weights.to_dict()[0]
df_train = pl.read_csv(DATA_PATH + "leap-atmospheric-physics-ai-climsim/train.csv", n_rows = 2_500_000)


for target in weights:
    df_train = df_train.with_columns(pl.col(target).mul(weights[target]))

print("Time to read dataset:",format_time(time.time()-ts),flush=True)

FEAT_COLS = df_train.columns[1:557]
TARGET_COLS = df_train.columns[557:]

for col in FEAT_COLS:
    df_train = df_train.with_columns(pl.col(col).cast(pl.Float32))
for col in TARGET_COLS:
    df_train = df_train.with_columns(pl.col(col).cast(pl.Float32))

    
x_train = df_train.select(FEAT_COLS).to_numpy()
y_train = df_train.select(TARGET_COLS).to_numpy()

del df_train
gc.collect()

meanx = x_train.mean(axis=0)
stdx = np.maximum(x_train.std(axis=0),MIN_STD)
x_train = (x_train - meanx.reshape(1,-1)) / stdx.reshape(1,-1)

meany = y_train.mean(axis=0)
stdy = np.maximum(np.sqrt((y_train*y_train).mean(axis=0)),MIN_STD)
y_train = (y_train - meany.reshape(1,-1)) / stdy.reshape(1,-1)

print("Time after processing data:", format_time(time.time()-ts),flush = True)

Time to read dataset: 0:03:08
Time after processing data: 0:03:43


In [7]:
seed_everything()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [8]:
class NumpyDataset(Dataset):
    def __init__(self, x, y):
        assert x.shape[0] == y.shape[0], "Features and labels must have the same number of samples"
        self.x = x
        self.y = y

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, index):
        return torch.from_numpy(self.x[index]).float().to(device), torch.from_numpy(self.y[index]).float().to(device)

In [9]:
dataset = NumpyDataset(x_train, y_train)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
class FFNN(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
        super(FFNN, self).__init__()
        
        layers = []
        previous_size = input_size
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(previous_size, hidden_size))
            layers.append(nn.LayerNorm(hidden_size))
            layers.append(nn.PReLU())
            layers.append(nn.Dropout(p=0.1))
            previous_size = hidden_size
        
        layers.append(nn.Linear(previous_size, output_size))
        
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

In [11]:
input_size = x_train.shape[1]
output_size = y_train.shape[1]
hidden_size = input_size + output_size
model = FFNN(input_size, [3*hidden_size, 2*hidden_size, 2*hidden_size, 2*hidden_size, 3*hidden_size], output_size).to(device)
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=SCHEDULER_FACTOR, patience=SCHEDULER_PATIENCE)

print("Time after all preparations:", format_time(time.time()-ts), flush=True)

Time after all preparations: 0:03:45


In [12]:
best_val_loss = float('inf')
best_model_state = None
patience_count = 0
r2score = R2Score(num_outputs=len(TARGET_COLS)).to(device)
for epoch in range(EPOCHS):
    print("")
    model.train()
    total_loss = 0
    steps = 0
    for batch_idx, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        steps += 1

        if (batch_idx + 1) % PRINT_FREQ == 0:
            current_lr = optimizer.param_groups[0]["lr"]
            elapsed_time = format_time(time.time() - ts)
            print(f'  Epoch: {epoch+1}',\
                  f'  Batch: {batch_idx + 1}/{len(train_loader)}',\
                  f'  Train Loss: {total_loss / steps:.4f}',\
                  f'  LR: {current_lr:.1e}',\
                  f'  Time: {elapsed_time}', flush=True)
            total_loss = 0
            steps = 0
    

    model.eval()
    val_loss = 0
    y_true = torch.tensor([], device=device)
    all_outputs = torch.tensor([], device=device)
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            y_true = torch.cat((y_true, labels), 0)
            all_outputs = torch.cat((all_outputs, outputs), 0)
    r2=0
    r2_broken = []
    r2_broken_names = []
    for i in range(368):
        r2_i = r2score(all_outputs[:, i], y_true[:, i])
        if r2_i > 1e-6:
            r2 += r2_i
        else:
            r2_broken.append(i)
            r2_broken_names.append(FEAT_COLS[i])
    r2 /= 368

    avg_val_loss = val_loss / len(val_loader)
    print(f'\nEpoch: {epoch+1}  Val Loss: {avg_val_loss:.4f}  R2 score: {r2:.4f}')
    print(f'{len(r2_broken)} targets were excluded during evaluation of R2 score.')
    # print(r2_broken)
    # print(r2_broken_names, flush=True)
   
    scheduler.step(avg_val_loss)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model_state = model.state_dict()
        patience_count = 0
        print("Validation loss decreased, saving new best model and resetting patience counter.")
    else:
        patience_count += 1
        print(f"No improvement in validation loss for {patience_count} epochs.")
        
    if patience_count >= PATIENCE:
        print("Stopping early due to no improvement in validation loss.")
        break

del x_train, y_train
gc.collect()


  Epoch: 1   Batch: 50/184   Train Loss: 0.4890   LR: 1.0e-03   Time: 0:04:59
  Epoch: 1   Batch: 100/184   Train Loss: 0.3538   LR: 1.0e-03   Time: 0:06:14
  Epoch: 1   Batch: 150/184   Train Loss: 0.3435   LR: 1.0e-03   Time: 0:07:31

Epoch: 1  Val Loss: 0.3103  R2 score: 0.1139
221 targets were excluded during evaluation of R2 score.
Validation loss decreased, saving new best model and resetting patience counter.

  Epoch: 2   Batch: 50/184   Train Loss: 0.3139   LR: 1.0e-03   Time: 0:10:06
  Epoch: 2   Batch: 100/184   Train Loss: 0.2961   LR: 1.0e-03   Time: 0:11:22
  Epoch: 2   Batch: 150/184   Train Loss: 0.2845   LR: 1.0e-03   Time: 0:12:41

Epoch: 2  Val Loss: 0.2676  R2 score: 0.1589
201 targets were excluded during evaluation of R2 score.
Validation loss decreased, saving new best model and resetting patience counter.

  Epoch: 3   Batch: 50/184   Train Loss: 0.2669   LR: 1.0e-03   Time: 0:15:14
  Epoch: 3   Batch: 100/184   Train Loss: 0.2634   LR: 1.0e-03   Time: 0:16:32


0

In [13]:
model.load_state_dict(best_model_state)
model.eval()

df_test = pl.read_csv(DATA_PATH + "leap-atmospheric-physics-ai-climsim/test.csv")

for col in FEAT_COLS:
    df_test = df_test.with_columns(pl.col(col).cast(pl.Float32))

x_test = df_test.select(FEAT_COLS).to_numpy()

x_test = (x_test - meanx.reshape(1,-1)) / stdx.reshape(1,-1)

predt = np.zeros([x_test.shape[0], output_size], dtype=np.float32)

i1 = 0
for i in range(10000):
    i2 = np.minimum(i1 + BATCH_SIZE, x_test.shape[0])
    if i1 == i2:  # Break the loop if range does not change
        break

    # Convert the current slice of xt to a PyTorch tensor
    inputs = torch.from_numpy(x_test[i1:i2, :]).float().to(device)

    # No need to track gradients for inference
    with torch.no_grad():
        outputs = model(inputs)  # Get model predictions
        predt[i1:i2, :] = outputs.cpu().numpy()  # Store predictions in predt

    i1 = i2  # Update i1 to the end of the current batch

    if i2 >= x_test.shape[0]:
        break

for i in range(stdy.shape[0]):
    if stdy[i] < MIN_STD * 1.1:
        predt[:,i] = 0

predt = predt * stdy.reshape(1,-1) + meany.reshape(1,-1)

ss = pd.read_csv(DATA_PATH + "leap-atmospheric-physics-ai-climsim/sample_submission.csv")
ss.iloc[:,1:] = predt

del predt
gc.collect()

use_cols = []
for i in range(27):
    use_cols.append(f"ptend_q0002_{i}")

ss2 = pd.read_csv(DATA_PATH + "leap-atmospheric-physics-ai-climsim/sample_submission.csv")
df_test = df_test.to_pandas()
for col in use_cols:
    ss[col] = -df_test[col.replace("ptend", "state")]*ss2[col]/1200.

test_polars = pl.from_pandas(ss[["sample_id"]+TARGET_COLS])
test_polars.write_csv("submission.csv")

print("Total time:", format_time(time.time()-ts))


 -1.3907598e-05  3.8346374e-05]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ss.iloc[:,1:] = predt
 -1.9295094e-05  2.8388416e-05]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ss.iloc[:,1:] = predt
 -3.0173402e-05  6.0970531e-05]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ss.iloc[:,1:] = predt
 -5.6060402e-05  8.7055116e-05]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ss.iloc[:,1:] = predt
 -8.3859930e-05  1.0680124e-04]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ss.iloc[:,1:] = predt
 -1.02507838e-04  9.82248166e-05]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ss.iloc[:,1:] = predt
 -9.5299409e-05  6.1099898e-05]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ss.iloc[:,1:

Total time: 4:14:20


#### 