Based on https://www.kaggle.com/t88take/openvaccine-simple-lgb-baseline/notebook and
https://www.kaggle.com/suresk/fastai2-starter?scriptVersionId=42023177

In [None]:
# Remove any existing fastai install, then install fastai 2.0.8 together with
# the pinned dataclasses / torch / torchvision / fastcore versions from the
# local wheel files under /kaggle/input/fast-v2-offline (no package index is
# contacted). The installs are ordered so each wheel's dependencies are
# already in place.

!pip uninstall fastai -y
!pip install /kaggle/input/fast-v2-offline/dataclasses-0.6-py3-none-any.whl
!pip install /kaggle/input/fast-v2-offline/torch-1.6.0-cp37-cp37m-manylinux1_x86_64.whl
!pip install /kaggle/input/fast-v2-offline/torchvision-0.7.0-cp37-cp37m-manylinux1_x86_64.whl
!pip install /kaggle/input/fast-v2-offline/fastcore-1.0.1-py3-none-any.whl
!pip install /kaggle/input/fast-v2-offline/fastai-2.0.8-py3-none-any.whl

from fastai.tabular.all import *

import numpy as np
import pandas as pd

In [None]:
# Load the competition files from a single data root.
# All three files live in the same dataset directory, so they are addressed
# through one absolute path instead of a mix of cwd-relative ('../input') and
# absolute ('/kaggle/input') prefixes.
DATA_DIR = '/kaggle/input/stanford-covid-vaccine'

# train/test are JSON-lines files: one record per line.
train = pd.read_json(f'{DATA_DIR}/train.json', lines=True)
test = pd.read_json(f'{DATA_DIR}/test.json', lines=True)

# Template whose `id_seqpos` rows and target columns define the submission layout.
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Expand every training molecule into one row per scored position.
#
# Each row holds:
#   * `id` and `id_seqpos` (id + '_' + position),
#   * the character at that position in `sequence`, `structure` and
#     `predicted_loop_type`,
#   * the five target values and their error estimates at that position,
#   * the same three characters at offsets 1..5 before (`b{k}_<col>`) and
#     after (`a{k}_<col>`) the position, or -1 where the offset falls outside
#     the molecule.
n_scored = 68  # number of leading positions for which target values are read
shifts = [1, 2, 3, 4, 5]
shift_cols = ['sequence', 'structure', 'predicted_loop_type']
value_cols = ['reactivity', 'reactivity_error',
              'deg_Mg_pH10', 'deg_error_Mg_pH10',
              'deg_pH10', 'deg_error_pH10',
              'deg_Mg_50C', 'deg_error_Mg_50C',
              'deg_50C', 'deg_error_50C']

train_rows = []
# Single pass over the molecules. drop_duplicates keeps the first record per
# id, in order of first appearance -- the same record the per-id boolean
# filter used to pick -- without rescanning the whole frame for every id.
for mol in train.drop_duplicates(subset='id').to_dict('records'):
    mol_id = mol['id']
    sample_seq_length = mol['seq_length']

    for i in range(n_scored):
        sample_dict = {'id': mol_id, 'id_seqpos': mol_id + '_' + str(i)}

        # Per-position characters first, then targets/errors, so the column
        # order of the resulting frame is unchanged.
        for col in shift_cols:
            sample_dict[col] = mol[col][i]
        for col in value_cols:
            sample_dict[col] = mol[col][i]

        # Neighbourhood context. The window is bounded by the full sequence
        # length, not by n_scored, so "after" neighbours beyond the scored
        # region are still used.
        for shift in shifts:
            for col in shift_cols:
                before, after = i - shift, i + shift
                sample_dict['b' + str(shift) + '_' + col] = (
                    mol[col][before] if before >= 0 else -1)
                sample_dict['a' + str(shift) + '_' + col] = (
                    mol[col][after] if after <= sample_seq_length - 1 else -1)

        train_rows.append(sample_dict)

train_data = pd.DataFrame(train_rows)
train_data.head()

In [None]:
# Expand every test molecule into one row per sequence position.
#
# Each row holds `id`, `id_seqpos` (id + '_' + position), the character at
# that position in `sequence`, `structure` and `predicted_loop_type`, and the
# same three characters at offsets 1..5 before (`b{k}_<col>`) and after
# (`a{k}_<col>`) the position, or -1 where the offset falls outside the
# molecule. Unlike the training expansion, all `seq_length` positions are
# emitted and there are no target columns.
shifts = [1, 2, 3, 4, 5]
shift_cols = ['sequence', 'structure', 'predicted_loop_type']

test_rows = []
# Single pass over the molecules. drop_duplicates keeps the first record per
# id, in order of first appearance -- the same record the per-id boolean
# filter used to pick -- without rescanning the whole frame for every id.
for mol in test.drop_duplicates(subset='id').to_dict('records'):
    mol_id = mol['id']
    sample_seq_length = mol['seq_length']

    for i in range(sample_seq_length):
        sample_dict = {'id': mol_id, 'id_seqpos': mol_id + '_' + str(i)}
        for col in shift_cols:
            sample_dict[col] = mol[col][i]

        # Neighbourhood context, in the same column order as before:
        # for each shift, for each column, "before" then "after".
        for shift in shifts:
            for col in shift_cols:
                before, after = i - shift, i + shift
                sample_dict['b' + str(shift) + '_' + col] = (
                    mol[col][before] if before >= 0 else -1)
                sample_dict['a' + str(shift) + '_' + col] = (
                    mol[col][after] if after <= sample_seq_length - 1 else -1)

        test_rows.append(sample_dict)

test_data = pd.DataFrame(test_rows)
test_data.head()

In [None]:
# Label-encode the character features.
# Every column whose name contains one of the base feature names (the column
# itself plus its b*/a* neighbour columns) has its characters replaced by the
# integer codes below. Values that are not keys of the map -- such as the -1
# out-of-range marker -- are left untouched by `replace`.
sequence_encmap = dict(zip('AGCU', range(4)))
structure_encmap = dict(zip('.()', range(3)))
looptype_encmap = dict(zip('SEHIXMB', range(7)))

enc_targets = ['sequence', 'structure', 'predicted_loop_type']
enc_maps = [sequence_encmap, structure_encmap, looptype_encmap]

for base_name, code_map in zip(enc_targets, enc_maps):
    # Column names are taken from the training frame and applied to both frames.
    matching_cols = [name for name in train_data.columns if base_name in name]
    for col_name in matching_cols:
        train_data[col_name] = train_data[col_name].replace(code_map)
        test_data[col_name] = test_data[col_name].replace(code_map)

In [None]:
# Identifier columns are excluded from the model inputs; every remaining
# test-frame column (in its existing order) is a feature.
not_use_cols = ['id', 'id_seqpos']
features = test_data.drop(columns=not_use_cols, errors='ignore').columns.tolist()

# The five columns the model is trained to predict.
targets = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]

In [None]:
# All features are treated as categorical; there are no continuous inputs.
cat_names, cont_names = features, []

In [None]:
# cat_names = ['cp_type', 'cp_time', 'cp_dose']
# cont_names = [c for c in train_x.columns if c not in cat_names and c != 'sig_id']
# y_names = [c for c in train_y.columns if c != 'sig_id']

In [None]:
# Dependent variables: the same five columns already listed in `targets`.
y_names = list(targets)

In [None]:
# Hold out 20% of the *molecules* (not 20% of the rows) for validation.
# Rows of one molecule are neighbouring positions that share context
# features, so a row-level random split puts near-duplicates of validation
# rows into the training set and makes the validation loss over-optimistic.
# The split is seeded so that a re-run reproduces the same partition.
valid_pct = 0.2
split_seed = 42

mol_ids = train_data['id'].unique()
n_valid_mols = int(valid_pct * len(mol_ids))
valid_ids = np.random.RandomState(split_seed).permutation(mol_ids)[:n_valid_mols]
is_valid = train_data['id'].isin(valid_ids).to_numpy()

# (train indices, valid indices) as plain lists of row positions, the same
# two-element structure a fastai splitter returns.
splits = (np.flatnonzero(~is_valid).tolist(), np.flatnonzero(is_valid).tolist())

procs = [Categorify, Normalize]
tab_pan = TabularPandas(train_data, procs=procs, cat_names=cat_names,
                        cont_names=cont_names, y_names=y_names, splits=splits)

In [None]:
# Build the DataLoaders from the processed tabular data.
batch_size = 4096
dls = tab_pan.dataloaders(bs=batch_size)

In [None]:
# Tabular model with four hidden layers, trained with a flattened MSE loss.
hidden_sizes = [1024, 512, 512, 256]
learn = tabular_learner(dls, layers=hidden_sizes, loss_func=MSELossFlat())

In [None]:
# Run the learning-rate finder to help choose the learning rate used for training.
learn.lr_find()

In [None]:
# One-cycle training: 10 epochs with learning rates spanning
# lr / 2.6**4 up to lr.
lr = 3e-3
n_epochs = 10
lr_range = slice(lr / 2.6 ** 4, lr)
learn.fit_one_cycle(n_epochs, lr_range)

In [None]:
# lr = 1e-3
# learn.fit_one_cycle(5, slice(lr/(2.6**4),lr))
# learn.unfreeze()

In [None]:
# lr = 1e-4
# learn.fit_one_cycle(10, slice(lr/(2.6**4),lr))

In [None]:

# learn.fit_one_cycle(20, slice(1e-5, 1e-4))

In [None]:
# learn.fit_one_cycle(25, slice(1e-5, 1e-4))

In [None]:
# learn.fit_one_cycle(30, slice(5e-4, 5e-3))

In [None]:
# Build a test DataLoader from the expanded test frame using the learner's
# existing DataLoaders, then run inference on it.
test_dl = learn.dls.test_dl(test_data)
# `sub` is the raw get_preds result; the predictions are read from sub[0] below.
sub = learn.get_preds(dl=test_dl)

In [None]:
# Sanity check: shape of the prediction tensor (first element of the get_preds result).
sub[0].shape

In [None]:
# Preview the sample-submission frame before the predictions are written into it.
submission.head()

In [None]:
# Write the predictions into the submission frame, aligned by `id_seqpos`.
# A purely positional assignment silently depends on the test rows and the
# sample file being in the same order, and on the sample file's target
# columns matching the order of `y_names`. Merging on the key with named
# columns removes both assumptions and fails loudly on any mismatch.
pred_frame = pd.DataFrame(sub[0].detach().cpu().numpy(), columns=y_names)
pred_frame.insert(0, 'id_seqpos', test_data['id_seqpos'].to_numpy())

submission = (
    submission[['id_seqpos']]
    .merge(pred_frame, on='id_seqpos', how='left', validate='one_to_one')
    [list(submission.columns)]  # keep the sample file's column order
)

# Every submission row must have received a prediction.
assert not submission[y_names].isna().any().any(), \
    'some id_seqpos rows in the sample submission have no prediction'

In [None]:
# Display the filled-in submission frame.
submission

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)