# Comparison between RNA-covid stability predictions

### Let's recall some stats for the training set

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

In [None]:
# Read the competition test set; lines=True parses one JSON record per line.
test_competition = pd.read_json(
    path_or_buf='../input/stanford-covid-vaccine/test.json',
    lines=True,
)
# Preview the first five records.
test_competition.head(n=5)

In [None]:
# Read the competition training set; lines=True parses one JSON record per line.
train_competition = pd.read_json(
    path_or_buf='../input/stanford-covid-vaccine/train.json',
    lines=True,
)
# Preview the first five records.
train_competition.head(n=5)

### distribution of scores

In [None]:
# Each row of the 'reactivity' column holds a list of values; flatten all of
# them into one Series so the overall distribution can be inspected.
reactivity = pd.Series(
    [value
     for per_sequence in train_competition['reactivity']
     for value in per_sequence]
)

# Histogram of every reactivity value, zoomed to the [-0.2, 5] range.
reactivity.plot.hist(bins=500)
plt.xlim(-0.2, 5)
plt.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) rendered as a table.
reactivity_frame = reactivity.to_frame()
reactivity_frame.describe()

In [None]:
import pandas as pd

# Load the 2nd-place predictions, index them by 'id_seqpos', and prefix every
# column with 'P2_' so they stay distinguishable once combined with the others.
second_place = (
    pd.read_csv('../input/covid-result-of-233-sequences/2nd-place-233-seq.csv')
    .set_index('id_seqpos')
    .add_prefix('P2_')
)
# Show the five rows with the largest predicted reactivity (ascending order).
second_place.sort_values(by='P2_reactivity').iloc[-5:]

In [None]:
# Load the 1st-place predictions, index them by 'id_seqpos', and prefix every
# column with 'P1_' so they stay distinguishable once combined with the others.
first_place = (
    pd.read_csv('../input/ov-inference-233-new-seq/submission.csv')
    .set_index('id_seqpos')
    .add_prefix('P1_')
)
# Show the five rows with the largest predicted reactivity (ascending order).
first_place.sort_values(by='P1_reactivity').iloc[-5:]

In [None]:
# Load the 3rd-place predictions, index them by 'id_seqpos', and prefix every
# column with 'P3_' so they stay distinguishable once combined with the others.
third_place = (
    pd.read_csv('../input/inference-new-233-sequences/submission.csv')
    .set_index('id_seqpos')
    .add_prefix('P3_')
)
# Show the five rows with the largest predicted reactivity (ascending order).
third_place.sort_values(by='P3_reactivity').iloc[-5:]

## Compare 1st, 2nd and 3rd place models on the new (233 sequences) dataset

In [None]:
# Put the three models' reactivity predictions side by side, aligned on the
# shared 'id_seqpos' index.
reactivity_columns = [
    first_place['P1_reactivity'],
    second_place['P2_reactivity'],
    third_place['P3_reactivity'],
]
merge = pd.concat(reactivity_columns, axis='columns')

# Normalised step histograms of the three prediction sets on one figure.
merge.plot.hist(histtype='step', bins=50, figsize=(12, 4), density=True)

# Overlay the training-set reactivity, restricted to the open interval
# (-0.1, 5), on the same axes for comparison.
in_range = (reactivity > -0.1) & (reactivity < 5)
reactivity[in_range].plot.hist(
    histtype='step', bins=50, density=True, label='Competition train')
plt.xlim(-0.2, 2)
plt.legend()
plt.show()

In [None]:
import seaborn as sns

# Pairwise scatter plots between the three models' predictions, with a
# 50-bin histogram of each model on the diagonal.
sns.pairplot(merge, height=3, markers='.', diag_kws=dict(bins=50))
plt.show()

In [None]:
# Annotated correlation matrix between the three models' predictions.
correlations = merge.corr()
sns.heatmap(correlations, annot=True)
plt.yticks(rotation=360)
plt.show()

## Let's look at the first 68 bases
These are the positions that were trained on in the original dataset.

In [None]:
# The index labels end in '_<position>'; extract that trailing integer as the
# base position of each row.
merge['base'] = [int(label.rsplit('_', 1)[-1]) for label in merge.index]

# Only positions below 68 were trained on.
trained = merge.loc[merge['base'] < 68]
# iloc[:, :-1] drops the last column (the helper 'base' column just added).
sns.pairplot(trained.iloc[:, :-1], height=3, markers='.', diag_kws=dict(bins=50))
plt.show()

In [None]:
# Correlation between the models restricted to positions below 68;
# iloc[:, :-1] leaves the last column out of the correlation.
trained = merge.loc[merge['base'] < 68]
trained_correlations = trained.iloc[:, :-1].corr()
sns.heatmap(trained_correlations, annot=True)
plt.yticks(rotation=360)
plt.show()

### sampling a smaller number of positions after 68

In [None]:
# Positions below 68 were trained on, so the untrained positions start AT 68.
# (The previous `> 68` filter silently dropped position 68.)
untrained = merge[merge['base'] >= 68]

# Draw as many untrained rows as there are trained rows so both groups are
# compared on equal sample sizes; cap at the rows available so that
# DataFrame.sample cannot raise when fewer untrained rows exist.
sample_size = min(int((merge['base'] < 68).sum()), len(untrained))

# A fixed seed makes the figure reproducible across reruns.
untrained_sample = untrained.sample(n=sample_size, random_state=0)

# iloc[:, :-1] drops the last column (the helper 'base' column when the
# notebook cells are run in order).
sns.pairplot(untrained_sample.iloc[:, :-1], markers='.', diag_kws={'bins': 50})
plt.show()

In [None]:
# Positions below 68 were trained on, so the untrained positions start AT 68.
# (The previous `> 68` filter silently dropped position 68.)
untrained = merge[merge['base'] >= 68]

# Draw as many untrained rows as there are trained rows so both groups are
# compared on equal sample sizes; cap at the rows available so that
# DataFrame.sample cannot raise when fewer untrained rows exist.
sample_size = min(int((merge['base'] < 68).sum()), len(untrained))

# A fixed seed makes the correlation values reproducible across reruns.
untrained_sample = untrained.sample(n=sample_size, random_state=0)

# iloc[:, :-1] drops the last column (the helper 'base' column when the
# notebook cells are run in order) before computing correlations.
sns.heatmap(untrained_sample.iloc[:, :-1].corr(), annot=True)
plt.yticks(rotation=360)
plt.show()

In [None]:
def _p1_label(value):
    """Return 'High' when value is greater than 1.2, otherwise 'Low'."""
    return 'High' if value > 1.2 else 'Low'


# Label every row by its 1st-place predicted reactivity.
merge['P1_filter'] = [_p1_label(value) for value in merge['P1_reactivity']]

In [None]:
# Positions below 68 were trained on, so the untrained positions start AT 68.
# (The previous `> 68` filter silently dropped position 68.)
temp = merge[merge['base'] >= 68]

# Pairwise comparison of the three models on the untrained positions, with
# points coloured by the 'P1_filter' label ('High' / 'Low').
plot_columns = ['P1_reactivity', 'P2_reactivity', 'P3_reactivity', 'P1_filter']
sns.pairplot(temp[plot_columns], markers='.', hue='P1_filter')
plt.show()