In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2

from pathlib import Path

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the competition data and of the per-sample BPP matrix files.
DATA_DIR = Path("../input/stanford-covid-vaccine/")
BPPS_DIR = DATA_DIR / "bpps"

# Default figure size reused by the plotting cells.
figsize = (15, 6)

# Both files are parsed as line-delimited JSON (one record per line).
train = pd.read_json(DATA_DIR / "train.json", lines=True)
test = pd.read_json(DATA_DIR / "test.json", lines=True)

# Index every .npy file in BPPS_DIR by its file name minus the
# 4-character ".npy" extension.
bppm_paths = list(BPPS_DIR.glob("*.npy"))
bppm_dict = {npy_path.name[:-4]: npy_path for npy_path in bppm_paths}

In [None]:
# Boolean mask selecting the rows whose SN_filter flag equals 1.
sn_filter_mask = train['SN_filter'].eq(1)

# Histogram of signal_to_noise restricted to the masked rows.
train.loc[sn_filter_mask, 'signal_to_noise'].hist()

As given in the [data](https://www.kaggle.com/c/stanford-covid-vaccine/data) section:

> \*_error_\* - An array of floating point numbers, should have the same length as the corresponding reactivity or deg_* columns, calculated errors in experimental values obtained in reactivity and deg_* columns.

In the [forums](https://www.kaggle.com/c/stanford-covid-vaccine/data), one of the [hosts](https://www.kaggle.com/dosoon) gave some clarifications about what exactly these values are:

> [ the error columns have the same units as the reactivity or deg* columns--they are positive values describing the possible measurement error associated with the value at the same index in the corresponding deg* column. Note that I don't mean that they should be purely added, but that they are an indicator of confidence in these measurements. They are based on Poisson statistics (more details of how these values are calculated here: https://daslab.stanford.edu/site_data/pub_pdf/2014_Seetin_MIMB.pdf)

> Large errors can signify multiple things, but most likely that at that given position, there probably were not enough sequencing reads to give a very confident measurement. ]


Let's have a look at those columns!

Since these values are of the same length as `seq_scored`, i.e., 68 in training_set & public_test and 91 in private_test, it's better to take a mean to keep things manageable.

In [None]:
# Collapse each per-row error array into a single mean value, stored in a
# new column named "<error column>_mean".
error_columns = ['reactivity_error', 'deg_error_Mg_50C', 'deg_error_Mg_pH10']
for error_column in error_columns:
    train[error_column + '_mean'] = train[error_column].apply(np.mean)

# Plot the three mean-error columns against the row index.
mean_error_columns = [error_column + '_mean' for error_column in error_columns]
train[mean_error_columns].plot(figsize=figsize)

Quite a lot of spikes and very large ones too! Probably that's why they gave a `SN_filter`. 

It might be interesting to see how many do not pass the hosts' QA.

In [None]:
# Columns holding the per-row mean of each error array.
mean_error_columns = [
    'reactivity_error_mean',
    'deg_error_Mg_50C_mean',
    'deg_error_Mg_pH10_mean',
]

# Rows outside the SN_filter mask.
failed_qa_errors = train.loc[~sn_filter_mask, mean_error_columns]
failed_qa_errors.plot(figsize=figsize, title='Those that failed QA')

# Rows inside the SN_filter mask, with the y-axis pinned to 0..175000.
passed_qa_errors = train.loc[sn_filter_mask, mean_error_columns]
passed_qa_errors.plot(figsize=figsize, ylim=(0, 175000), title='Those that passed QA')

So the readings that passed the QA have very low errors. What a finding! :D 

Let's see how things look where `reactivity` error is max.

In [None]:
# Positional index of the row with the largest mean reactivity error.
reactivity_error_means = train['reactivity_error_mean']
idx = reactivity_error_means.argmax()
print(idx)

# argmax is positional, so the row is fetched with iloc.
row = train.iloc[idx]

The Base-Pair Probability says that this particular molecule has a high tendency to fold? 
I am not very sure about this, probably a domain expert could better tell us.

In [None]:
# Load the BPP matrix file registered under the selected row's id.
bppm = np.load(bppm_dict[row['id']])

# Draw the matrix as a heatmap with an accompanying colour scale.
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.imshow(bppm, cmap='jet')
fig.colorbar(heatmap, ax=ax)

# Cell output: mean over all entries of the matrix.
bppm.mean()

Now let's see what the actual values look like.

In [None]:
# Plot the selected row's measured values and their reported errors in two
# separate figures. Every line is labelled with the column it comes from and
# a legend is drawn, so the three series in each figure can be told apart.
plt.figure(figsize=figsize)
plt.title('Readings')
for column in ('reactivity', 'deg_Mg_50C', 'deg_Mg_pH10'):
    plt.plot(row[column], label=column)
plt.legend()

plt.figure(figsize=figsize)
plt.title('Errors')
for column in ('reactivity_error', 'deg_error_Mg_50C', 'deg_error_Mg_pH10'):
    plt.plot(row[column], label=column)
plt.legend()

Let's see the row which has the minimum error

In [None]:
# Positional index of the row with the smallest mean reactivity error.
idx = train['reactivity_error_mean'].argmin()
print(idx)
# argmin is positional, so the row is fetched with iloc.
row = train.iloc[idx, :]

# Plot that row's measured values and reported errors in two figures. Every
# line is labelled with the column it comes from and a legend is drawn, so
# the three series in each figure can be told apart.
plt.figure(figsize=figsize)
plt.title('Readings')
for column in ('reactivity', 'deg_Mg_50C', 'deg_Mg_pH10'):
    plt.plot(row[column], label=column)
plt.legend()

plt.figure(figsize=figsize)
plt.title('Errors')
for column in ('reactivity_error', 'deg_error_Mg_50C', 'deg_error_Mg_pH10'):
    plt.plot(row[column], label=column)
plt.legend()

So there's practically no reading where there's a lot of error!


How do you kill what cannot die? How do you predict what has no inputs?

Finally, let's do a quick check to see if BPPs are also affected by it.

In [None]:
%%time

train['bppm_mean'] = train['id'].apply(lambda x: np.load(bppm_dict[x]).mean())
train[sn_filter_mask]['bppm_mean'].plot(figsize=figsize, label='Passed QA')
train[~sn_filter_mask]['bppm_mean'].plot(figsize=figsize, label='Failed QA')
plt.legend()

In [None]:
# Average bppm_mean for the rows inside the SN_filter mask, then for the rows
# outside it; the tuple is the cell's displayed output.
(
    train.loc[sn_filter_mask, 'bppm_mean'].mean(),
    train.loc[~sn_filter_mask, 'bppm_mean'].mean(),
)

Though not always true, the BPPs are a bit on the lower side for those readings that didn't pass QA.