In [None]:
# Directory that receives one `<model name>.csv.gz` prediction file per model.
OUTDIR = 'output'

# Sub-directories of AUGMENTATION_DATADIR that are merged into the target
# frame; every entry also becomes a column suffix (e.g. `bpp_contrafold`).
AUGMENTATION_PACKAGES = [
    'vienna_2',
    'contrafold',
    'rnasoft',
]

# Root of the augmentation dataset: it holds `original/` plus one
# sub-directory per entry of AUGMENTATION_PACKAGES.
AUGMENTATION_DATADIR = '../input/stanford-covid-vaccine-new-sequences-augmentation'

# KF side

## Setup

In [None]:
# Number of sequences per inference batch (passed to the DataLoader as batch_size).
BATCHSIZE = 2
# Root directory of the trained models; each entry of KF_MODELS is a
# sub-directory of it that is searched for `fold=*` directories.
KF_MODELDIR = '../input/stanford-covid-vaccine-kf-models'
# Models to run. Each name is used both as the sub-directory under KF_MODELDIR
# and as the stem of the prediction file written to OUTDIR.
# NOTE(review): the names look like "<experiment id>_<architecture>_<options>";
# that reading is inferred from the strings only, confirm with the training repo.
KF_MODELS = [
    '2200_v2_sw_conv_gru_h512',
    '2201_v3_sw_bpp_conv_gru_h512',
    '2201_v3_sw_bpp_conv_gru_h512l3',
    '2302_conv2d_emb_gru_aug-vc',
    '2302_conv2d_emb_lstm_aug-vc',
    '2304_conv2d_emb_gru_aug-cr',
    '2304_conv2d_emb_gru_aug-cr_sig0.5',
    '2304_conv2d_emb_gru_aug-vcr',
    '2304_conv2d_emb_gru_aug-vcr-10f',
    '2304_conv2d_emb_lstm_aug-cr',
    '2304_conv2d_emb_lstm_aug-cr_sig0.5',
    '2304_conv2d_emb_lstm_aug-vcr',
    '2304_conv2d_emb_lstm_aug-vcr-10f',
    '2304_conv2d_emb_lstm_aug-vcr_signal0.5',
    '2306_conv2d_emb_gru_aug-vcr-lw',
    '2306_conv2d_emb_gru_aug-vcr-lw0.1',
    '2306_conv2d_emb_lstm_aug-vcr-lw',
    '2306_conv2d_emb_lstm_aug-vcr-lw0.1',
    '2307_conv2d_emb_gru_aug-vcr-lw-nosw',
    '2307_conv2d_emb_lstm_aug-vcr-lw-nosw',
    '2308_conv2d_emb_gru_aug-vcr-lw-light',
    '2308_conv2d_emb_lstm_aug-vcr-lw-light',
]

In [None]:
%%capture
# Install the notebook's extra dependencies; %%capture keeps the installer
# output out of the rendered notebook.
# NOTE(review): forgi is installed without a version pin, so the resolved
# version can differ between runs -- consider pinning it.
!pip install -q forgi
# Install every file shipped in the attached kf-packages input directory
# (presumably the wheels providing `covid_vaccine` and `nncomp` -- TODO confirm).
!pip install -q ../input/stanford-covid-vaccine-kf-packages/*

In [None]:
import gc
import os
import yaml
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm

import covid_vaccine.preprocessing as P
import nncomp.registry as R
from nncomp.datasets import SequenceDataset

# Target for `.to(DEVICE)`: CUDA device index 0 when a GPU is visible, otherwise the CPU.
DEVICE = 'cpu' if not torch.cuda.is_available() else 0

## Preprocess

In [None]:
# Columns copied from each augmentation package into the target frame.
# 'id' is the join key; the remaining columns receive a `_<package>` suffix
# because they collide with the columns already present in `target_df`.
AUG_COLUMNS = ['id', 'structure_ids', 'predicted_loop_type_ids', 'bpp']

preprocessor = P.CovidVaccineSequencePreprocessorV1(
    out_columns=['sequence_ids', 'structure_ids', 'predicted_loop_type_ids', 'mask']
)


def _load_bpps(subdir, ids):
    """Load `<AUGMENTATION_DATADIR>/<subdir>/bpps/<id>.npy` for every id, cast to float16."""
    bpp_dir = f"{AUGMENTATION_DATADIR}/{subdir}/bpps"
    return [
        np.load(f"{bpp_dir}/{seq_id}.npy").astype('float16')
        for seq_id in ids
    ]


# Base frame: the original dataset with its own bpp arrays, run through the preprocessor.
target_df = pd.read_json(f"{AUGMENTATION_DATADIR}/original/dataset.json", lines=True)
target_df['bpp'] = _load_bpps('original', target_df.id)
target_df = preprocessor(target_df)
sample_submission_df = pd.read_csv('../input/stanford-covid-vaccine/post_deadline_files/new_sequences_submission.csv')

# Attach each package's structure / loop-type / bpp columns, matched on `id`.
for pkg in AUGMENTATION_PACKAGES:
    pkg_dir = f"{AUGMENTATION_DATADIR}/{pkg}"
    datapath = Path(f"{pkg_dir}/dataset.json")
    # A package ships either a JSON-lines dataset or, failing that, a CSV one.
    if not datapath.exists():
        df = pd.read_csv(f"{pkg_dir}/dataset.csv")
    else:
        df = pd.read_json(f"{pkg_dir}/dataset.json", lines=True)

    df['bpp'] = _load_bpps(pkg, df.id)
    df = preprocessor(df)
    target_df = target_df.merge(df[AUG_COLUMNS], on='id', suffixes=['', f"_{pkg}"])

    # Release the per-package frame before loading the next one.
    del df
    gc.collect()

target_df.head()

## Inference

In [None]:
# Columns handed to SequenceDataset as `padding_columns`, each mapped to 0
# (presumably the fill value used when padding up to `maxlen` -- confirm
# against nncomp.datasets.SequenceDataset).
padding_columns = dict.fromkeys(
    ['mask', 'sequence_ids', 'structure_ids', 'predicted_loop_type_ids', 'bpp'],
    0,
)
# The augmented columns are derived from the configuration instead of being
# spelled out by hand, so they cannot drift from what the merge in the
# preprocessing cell produced: every AUG_COLUMNS entry except the join key
# 'id' exists once per package with a `_<package>` suffix.
padding_columns.update({
    f"{column}_{pkg}": 0
    for pkg in AUGMENTATION_PACKAGES
    for column in AUG_COLUMNS
    if column != 'id'
})

dataloader = torch.utils.data.DataLoader(
    SequenceDataset(
        target_df.reset_index(drop=True),
        padding_columns=padding_columns,
        non_padding_columns=[
            'id',
            'seq_length',
            'seq_scored',
        ],
        maxlen=target_df.seq_length.max(),
    ),
    # Inference only: keep every sample, in dataset order.
    drop_last=False, shuffle=False,
    batch_size=BATCHSIZE,
    # os.cpu_count() may return None, which DataLoader does not accept;
    # fall back to loading in the main process.
    num_workers=os.cpu_count() or 0,
)

In [None]:
# Run every model in KF_MODELS over the dataloader, average its fold
# predictions per `id_seqpos`, and write one `<name>.csv.gz` file to OUTDIR.
Path(OUTDIR).mkdir(exist_ok=True, parents=True)

for name in tqdm(KF_MODELS):
    print(name)
    modeldir = Path(KF_MODELDIR) / name
    # One prediction frame per batch, across all folds of this model. They are
    # concatenated once after the loops: DataFrame.append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every call.
    batch_outputs = []
    # Sorted so the folds are visited in the same order on every run.
    for folddir in tqdm(sorted(modeldir.glob('fold=*'))):
        # Rebuild the network from the model section of the fold's YAML config.
        with open(list(folddir.glob('*.yml'))[0]) as f:
            config = yaml.safe_load(f)['params']['model']
        # Drop any `pretrained_model` entry (presumably not needed because the
        # weights are restored from the checkpoint right below -- confirm).
        config['params'].pop('pretrained_model', None)
        model = R.ModelRegistry.get_instance(
            config['class'],
            **config['params']
        )
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        ckpt = torch.load(list(folddir.glob('*.pth'))[0], map_location='cpu')
        model.load_state_dict(ckpt['model_state_dict'])
        model.eval()
        model.to(DEVICE)

        for batch in tqdm(dataloader):
            # Largest per-row sum of `mask` in this batch; tensors are cut to it.
            maxlen = int(batch['mask'].sum(dim=1).max())
            for key, value in batch.items():
                if torch.is_tensor(value):
                    if len(value.shape) == 1:
                        # NOTE(review): this slices the first (and only) axis of
                        # 1-D tensors; it is a no-op while that axis is not
                        # longer than maxlen -- confirm this is intended.
                        value = value[:maxlen]
                    elif len(value.shape) == 2:
                        value = value[range(len(value)), :maxlen]
                    elif len(value.shape) == 3:
                        value = value[range(len(value)), :maxlen, :maxlen]
                    else:
                        raise NotImplementedError()
                    batch[key] = value.to(DEVICE)
            with torch.no_grad():
                ys = model(**batch)
            # One column per model output (the text after the last 'y_' in its
            # key), one row per "<sequence id>_<position>".
            output = pd.DataFrame({
                key.split('y_')[-1]: {
                    f"{id}_{i}": float(y_sample)
                    for id, y_sequence in zip(batch['id'], y_batch)
                    for i, y_sample in enumerate(y_sequence)
                }
                for key, y_batch in ys.items()
            })
            output.index.name = 'id_seqpos'
            batch_outputs.append(output)

        # Move the model off the device and drop references before the next fold.
        model.to('cpu')
        torch.cuda.empty_cache()
        del model, ckpt
        gc.collect()

    if not batch_outputs:
        raise FileNotFoundError(f"No predictions produced for {name}: no fold=* directories under {modeldir}")

    # Average over folds, then order rows exactly like the sample submission.
    outputs = pd.concat(batch_outputs).groupby('id_seqpos').mean()
    outputs = outputs.loc[sample_submission_df.id_seqpos]
    outputs.to_csv(f"{OUTDIR}/{name}.csv.gz")

In [None]:
!ls -l $OUTDIR
pd.read_csv(list(Path(OUTDIR).iterdir())[0])