# Push the model's prediction to Kaggle

### Load the model and the data


In [1]:
import pickle
import numpy as np
import pandas as pd
from lightning.pytorch import Trainer
import os, sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from dmsensei.core import DataModule

# Path of the pickled model that the "Run the prediction" cell passes to
# `trainer.predict`.
MODEL_PATH = 'model.pkl'

# Bind `model` so the later `trainer.predict(model, ...)` call does not raise a
# NameError on a fresh kernel. When the file is absent the load is skipped and
# `model` is None, so the data-inspection cells stay usable without a model.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a model.pkl that you produced yourself.
if os.path.exists(MODEL_PATH):
    with open(MODEL_PATH, 'rb') as model_file:
        model = pickle.load(model_file)
else:
    model = None
    print('{} not found: `model` is None, the prediction cell will not work.'.format(MODEL_PATH))

trainer = Trainer()

import torch

# Seed torch and numpy.
torch.manual_seed(0)
np.random.seed(0)

# Datamodule used by the inspection cells below.
dm = DataModule(
    name=["ribonanza"],
    force_download=False,
    batch_size=256,
    num_workers=1,
    train_split=40,
    valid_split=40,
    predict_split=1.,
    overfit_mode=False,
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/yvesmartin/.pyenv/versions/3.11.6/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


Using local data for: ribonanza


Wrangling data for ribonanza: 100%|██████████| 49001/49001 [00:03<00:00, 14547.50it/s]


Using local data for: CT_files_pdbee


Wrangling data for CT_files_pdbee: 100%|██████████| 248/248 [00:00<00:00, 31810.98it/s]


Using local data for: sarah_supermodel


Wrangling data for sarah_supermodel: 100%|██████████| 107/107 [00:00<00:00, 15941.69it/s]


Using local data for: utr


Wrangling data for utr: 100%|██████████| 1234/1234 [00:00<00:00, 15615.16it/s]


Using local data for: SARS2


Wrangling data for SARS2: 100%|██████████| 38/38 [00:00<00:00, 12206.75it/s]


Using local data for: pri-miRNA


Wrangling data for pri-miRNA: 100%|██████████| 503/503 [00:00<00:00, 15342.41it/s]


In [8]:
# Ask the datamodule for one 'valid' index per data type, then walk the
# validation batches and print, for each data type found in a batch, its
# position in the batch metadata and the matching entry of the data 'index'.
indexes = dm.find_one_index_per_data_type('valid')
print(indexes)
for data, metadata in dm.val_dataloader():
    batch_indexes = metadata['index']
    print(batch_indexes)
    for data_type in ('dms', 'shape'):
        wanted = indexes[data_type]
        if wanted not in set(batch_indexes):
            continue
        idx = batch_indexes.index(wanted)
        batch_idx = data[data_type]['index'][idx].item()
        print(idx, batch_idx)


{'dms': 27812, 'shape': 38924}
[27812, 38924, 40284, 47494, 14670, 4930, 6839, 16248, 48502, 5616, 37044, 26370, 17203, 4398, 7441, 40953, 32119, 33171, 21650, 19924, 43503, 24507, 1253, 23325, 6981, 48085, 38134, 8987, 11119, 10749, 28044, 6376, 24941, 33233, 20810, 45194, 28795, 40005, 11948, 3725]
0 0
1 2


In [None]:
# Rebuild the first prediction batch into a list of per-datapoint dicts
# ("lines"): metadata fields, then the values of each data type, then a score.

from dmsensei.core.metrics import metric_factory
from torch import tensor

# Name of the metric (key into `metric_factory`) used for each data type.
REFERENCE_METRIC = {
    'structure': 'f1',
    'dms': 'mae',
    'shape': 'mae'
}

# Only the first batch is inspected. The name `a` is kept because later cells
# read it directly.
a = next(iter(dm.predict_dataloader()))
data, metadata = a

# One line per entry of the batch metadata, seeded with that entry's fields.
lines = [
    {key: values[idx] for key, values in metadata.items()}
    for idx in range(len(metadata['index']))
]

# Attach the values of every data type to the line they belong to; the entries
# of vals['index'] are used as positions in `lines`.
for data_type, vals in data.items():
    name = 'sequence' if data_type == 'sequence' else 'true_{}'.format(data_type)
    for k, v in zip(vals['index'], vals['values'].tolist()):
        line = lines[k.item()]
        line[name] = v
        # Placeholder: no model output is used in this cell, so the "pred"
        # entry is a copy of the "true" one (for 'sequence' the replace is a
        # no-op and the same key is simply written again).
        # TODO: store the real model predictions here once they are available.
        line[name.replace('true', 'pred')] = v

# Score every line that carries both a true and a predicted signal.
for data_type in ['dms', 'shape']:
    true_key = 'true_{}'.format(data_type)
    pred_key = 'pred_{}'.format(data_type)
    for line in lines:
        if true_key not in line or pred_key not in line:
            continue
        line['score_{}'.format(data_type)] = metric_factory[REFERENCE_METRIC[data_type]](
            pred=tensor(line[pred_key]),
            true=tensor(line[true_key]),
            batch=False,
        )
    
        


In [None]:
# Show the rebuilt `lines` as a table, one row per entry of the list.
pd.DataFrame(lines)

In [None]:
# Tabular variant of the rebuild: start from the batch metadata and add one
# column per data type, aligned on the row index.
data, metadata = a
df_lines = pd.DataFrame(metadata)
for data_type, payload in data.items():
    # Values of this data type, labelled by the entries of its 'index'.
    signal = pd.Series(payload['values'].tolist(), index=payload['index'].tolist())
    signal_column = pd.DataFrame(signal, columns=[data_type])
    # Outer merge on the index keeps the rows present on either side.
    df_lines = df_lines.merge(signal_column, how='outer', left_index=True, right_index=True)
df_lines

In [None]:


import json

# Write the rebuilt lines to t.json. The context manager closes the file, so
# the content is flushed to disk as soon as the cell finishes.
with open('t.json', 'w') as json_file:
    json.dump(lines, json_file)

In [None]:
# Display the `metadata` object currently bound in the kernel.
metadata

In [None]:
# Tabulate the second element of the batch `a` (the part other cells unpack as
# `metadata`).
# NOTE(review): this rebinds `metadata` from the raw batch metadata to a
# DataFrame; cells that expect the raw object must unpack `a` again first.
metadata = pd.DataFrame(a[1])
metadata

In [None]:
# First entry of the 'dms' values held in the data part of batch `a`, after
# conversion to plain Python objects.
a[0]['dms']['values'].tolist()[0]

In [None]:
# 'dms' values of batch `a` as a Series labelled by the matching 'dms' index
# entries.
pd.Series(a[0]['dms']['values'].tolist(), index=a[0]['dms']['index'].tolist())

### Run the prediction

In [None]:
# Settings of the datamodule that serves the "ribo-test" sequences.
ribo_test_dm_kwargs = dict(
    name=["ribo-test"],
    data='sequence',
    force_download=False,
    batch_size=256,
    num_workers=1,
    train_split=0,
    valid_split=0,
    predict_split=1.,
    overfit_mode=False,
)

# `model` has to be bound by an earlier cell before this one runs.
prediction = trainer.predict(model, datamodule=DataModule(**ribo_test_dm_kwargs))

### Format the prediction and save it locally

In [None]:
import json

def format_to_ribonanza(prediction, data_path='/root/DMSensei/scripts/data/input_files/ribo-test/data.json'):
    """Turn batched predictions into a Ribonanza submission table.

    Parameters
    ----------
    prediction : iterable of batches
        Each batch is an iterable of per-sequence arrays with two columns
        (DMS and 2A3 reactivity), possibly padded beyond the sequence length.
    data_path : str, optional
        JSON file describing the predicted sequences. Each value must have a
        'sequence' entry, and the file order must match the prediction order.
        Defaults to the location used on the original machine; pass your own
        path instead of editing the function.

    Returns
    -------
    pandas.DataFrame
        Columns 'id', 'reactivity_DMS_MaP' and 'reactivity_2A3_MaP', one row
        per nucleotide, with 'id' counting rows from 0.

    Raises
    ------
    ValueError
        If the number of predicted sequences differs from the number of
        sequences in `data_path`.
    """
    # Load the sequences; the context manager closes the file handle.
    with open(data_path) as data_file:
        data = json.load(data_file)
    lengths = [len(d['sequence']) for d in data.values()]

    # Reformat the batches into individual sequences.
    arr = [p for batch in prediction for p in batch]

    # zip() would silently drop the surplus and yield a misaligned submission.
    if len(arr) != len(lengths):
        raise ValueError(
            'Got {} predictions for {} sequences in {}'.format(len(arr), len(lengths), data_path)
        )

    # Remove the padding by trimming each prediction to its sequence length.
    arr = [p[:n] for p, n in zip(arr, lengths)]

    # Stack into a single dataframe and expose the row number as `id`.
    stacked = np.vstack(arr)
    return pd.DataFrame(stacked, columns=["reactivity_DMS_MaP", "reactivity_2A3_MaP"]).reset_index().rename(columns={"index": "id"})

# Build the submission table and write it to submission.csv in the current
# working directory, without the DataFrame index.
submission = format_to_ribonanza(prediction)
submission.to_csv('submission.csv', index=False)

### Push the prediction to Kaggle

Note: set up your Kaggle authentication first:
1. download your Kaggle API keys `kaggle.json` here: https://www.kaggle.com/settings/account
2. save it to `~/.kaggle/kaggle.json`:

    ```bash
    mv ~/Downloads/kaggle.json ~/.kaggle/kaggle.json
    ```
3. push your results to Kaggle using the following line

In [None]:
import subprocess

# Submit submission.csv to the competition through the Kaggle CLI.
# check=True raises CalledProcessError when the command exits with a non-zero
# status, so a failed submission cannot go unnoticed.
subprocess.run(
    [
        'kaggle', 'competitions', 'submit',
        '-c', 'stanford-ribonanza-rna-folding',
        '-f', 'submission.csv',
        '-m', 'test commit',
    ],
    check=True,
)