In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Directory the competition data files are read from.
PATH = '../input/stanford-covid-vaccine'

# train/test are JSON-lines files; their 'index' column is dropped after loading.
train = pd.read_json(f'{PATH}/train.json', lines=True)
train = train.drop(columns=['index'])
test = pd.read_json(f'{PATH}/test.json', lines=True)
test = test.drop(columns=['index'])
submission = pd.read_csv(f'{PATH}/sample_submission.csv')

## Variables

#### Sequence Variables

* `sequence` - provides the nucleotide sequence
* `structure` - provides the base-pairing data, where `(` and `)` mark paired nucleotides at their respective indices while `.` marks an unpaired nucleotide
* `predicted_loop_type` - describes the structural context of each nucleotide in the sequence
  * S: paired "Stem" 
  * M: Multiloop 
  * I: Internal loop 
  * B: Bulge 
  * H: Hairpin loop 
  * E: dangling End 
  * X: eXternal loop
  
#### Evaluation

* The model will be predicting `reactivity`, `deg_Mg_pH10`, `deg_pH10`, `deg_Mg_50C`, and `deg_50C` for each nucleotide position in the mRNA
* However, the model will only be evaluated on the first `seq_scored` nucleotides. The competition organizers use a next-generation sequencer that provides measurements for all samples in a single reaction, but "padding" nucleotides are used for demultiplexing (https://www.kaggle.com/c/stanford-covid-vaccine/discussion/181991)
* The *mean column-wise root mean squared error (MCRMSE)* is used

## Ideas

* From a structural perspective, it seems that each nucleotide's reactivity is dependent on its place in the overall structure
* Perhaps one of the things to ask is what is the overall stability of the molecule itself?
  * I would hypothesize that a more stable molecule is less likely to have individual nucleotides that are more reactive
* However we're tasked with finding the local stability as well
  * Likely this will be dependent on the following:
    * Nucleotide type - G/C pairs tend to be more stable than A/U pairs (RNA uses U in place of T) because they form three hydrogen bonds rather than two
    * Surrounding structure - anticipate that change points in surrounding structures will correlate to weaknesses

In [None]:
def get_structure_mean_value(row, col):
    """Average the values of ``row[col]`` separately for each predicted loop type.

    Positions are paired with ``zip``, so when ``row[col]`` is shorter than
    ``row['predicted_loop_type']`` only the leading positions that actually
    have a value contribute.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``'predicted_loop_type'`` (one loop-type code per position) and
            ``col`` (one numeric value per position).
        col: Name of the per-position value column to average.

    Returns:
        Tuple of seven means ordered ``(S, M, I, B, H, E, X)``. A loop type
        with no paired positions yields ``nan``.

    Raises:
        KeyError: If a loop-type code other than S/M/I/B/H/E/X is encountered.
    """
    loop_codes = ('S', 'M', 'I', 'B', 'H', 'E', 'X')
    r_d = {code: [] for code in loop_codes}
    for p, r in zip(row['predicted_loop_type'], row[col]):
        r_d[p].append(r)

    # np.mean([]) would also give nan, but it emits a "Mean of empty slice"
    # RuntimeWarning for every absent loop type, so return nan explicitly.
    return tuple(np.mean(r_d[code]) if r_d[code] else np.nan for code in loop_codes)

In [None]:
# Names of the per-position target columns.
r_vals = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]
# Names of the companion error columns, listed in the same order as r_vals.
e_vals = [
    'reactivity_error',
    'deg_error_Mg_pH10',
    'deg_error_pH10',
    'deg_error_Mg_50C',
    'deg_error_50C',
]

In [None]:
# For every target, add seven columns (S_/M_/I_/B_/H_/E_/X_ + target name)
# holding each training row's mean value per predicted loop type.
for col in r_vals:
    per_row_means = train.apply(lambda row: get_structure_mean_value(row, col), axis=1)
    # Transpose the per-row 7-tuples into seven per-column sequences.
    s_mean, m_mean, i_mean, b_mean, h_mean, e_mean, x_mean = zip(*per_row_means)
    train[f'S_{col}'] = s_mean
    train[f'M_{col}'] = m_mean
    train[f'I_{col}'] = i_mean
    train[f'B_{col}'] = b_mean
    train[f'H_{col}'] = h_mean
    train[f'E_{col}'] = e_mean
    train[f'X_{col}'] = x_mean

## EDA

In [None]:
def plot_loop_type_values(df, col, xlim):
    """Draw KDE curves of the seven per-loop-type columns for one target.

    Args:
        df: Frame containing the columns ``S_{col}``, ``M_{col}``, ``I_{col}``,
            ``B_{col}``, ``H_{col}``, ``E_{col}`` and ``X_{col}``.
        col: Target name; used as the column suffix and as the plot title.
        xlim: x-axis limits forwarded to ``plot.kde``.
    """
    loop_columns = [f'{code}_{col}' for code in ('S', 'M', 'I', 'B', 'H', 'E', 'X')]
    per_loop = df[loop_columns]
    per_loop.plot.kde(title=col, xlim=xlim)

Here we can see the reactivity by the Predicted Loop Type. Makes sense to think that the stems should be less reactive while the dangling ends are the most reactive.

In [None]:
# One density figure per target, all drawn over the same x-range.
for col in r_vals:
    plot_loop_type_values(df=train, col=col, xlim=[-2, 3])

## Dumb Model

Let's make a dumb model that only uses the averages of the overall `predicted_loop_type` values per predicted column.

In [None]:
# The seven single-letter predicted-loop-type codes.
loop_type = list('SMIBHEX')

In [None]:
# Restrict to rows with SN_filter == 1 before averaging.
sn_train = train[train['SN_filter'] == 1]

# all_mean_loop_vals[target][loop code] = NaN-ignoring mean of that
# '<loop>_<target>' column over the filtered rows.
all_mean_loop_vals = {}
for col in r_vals:
    all_mean_loop_vals[col] = {
        loop: np.nanmean(sn_train[f'{loop}_{col}'].values)
        for loop in loop_type
    }

## Dumb Model CV

In [None]:
# Keep only the rows that pass the signal-to-noise filter (SN_filter == 1).
cv_train = train.loc[train['SN_filter'] == 1]

In [None]:
# Narrow the frame to the identifier, the loop-type string and the target columns.
cv_train = cv_train[['id', 'predicted_loop_type', *r_vals]]

In [None]:
# Ground truth: for each target, stack the per-row value lists into one array.
cv_out = {
    col: np.array([np.array(seq_vals) for seq_vals in cv_train[col].values])
    for col in r_vals
}

In [None]:
# Get the predicted values according to the dumb model: every position is
# predicted as the mean value of its predicted loop type for that target.
cv_preds = {}
for col in r_vals:
    loop_means = all_mean_loop_vals[col]
    data = []
    for loop, target in zip(cv_train['predicted_loop_type'], cv_train[col]):
        # Predict exactly as many positions as this row has measured values,
        # instead of a hard-coded 68, so the result lines up with cv_out.
        n_scored = len(target)
        data.append(np.array([loop_means[nt] for nt in loop[:n_scored]], dtype=float))
    cv_preds[col] = np.array(data)

In [None]:
def mcrmse(y_grd, y_hat, cols=None):
    """Mean column-wise root mean squared error (MCRMSE).

    Args:
        y_grd: Mapping from column name to ground-truth values (array-like).
        y_hat: Mapping from column name to predicted values; each entry must
            be broadcast-compatible with the matching entry of ``y_grd``.
        cols: Column names to score. Defaults to the five target columns
            ``reactivity``, ``deg_Mg_pH10``, ``deg_pH10``, ``deg_Mg_50C`` and
            ``deg_50C``.

    Returns:
        The mean, over ``cols``, of each column's RMSE.

    Raises:
        KeyError: If a scored column is missing from ``y_grd`` or ``y_hat``.
    """
    if cols is None:
        cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
    cv_score = []
    for col in cols:
        # asarray lets plain nested lists be scored as well as ndarrays.
        diff = np.asarray(y_grd[col]) - np.asarray(y_hat[col])
        cv_score.append(np.sqrt(np.mean(np.square(diff))))
    return np.mean(cv_score)

## Evaluation

Interestingly, this is scoring better than some of the other XGBoost and LightGBM models out there!

In [None]:
# Score the loop-type-mean predictions against the measured values.
mcrmse(y_grd=cv_out, y_hat=cv_preds)

In [None]:
# Build one record per (test sequence, position): the row id is
# '<sequence id>_<position>' and every target gets the mean value of the
# position's predicted loop type.
all_rows = []
for seq_id, loop_types in zip(test['id'], test['predicted_loop_type']):
    for i, loop in enumerate(loop_types):
        row = {'id_seqpos': f'{seq_id}_{i}'}
        row.update({col: all_mean_loop_vals[col][loop] for col in r_vals})
        all_rows.append(row)

In [None]:
# Turn the list of per-position records into a single frame.
sub = pd.DataFrame(data=all_rows)

In [None]:
# Write the predictions to disk without the frame's index column.
sub.to_csv(path_or_buf='submission.csv', index=False)