In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from collections import OrderedDict

In [2]:
# Directory that holds the dataset files; a relative path, so it is resolved
# against the notebook's current working directory.
data_path = Path('data')

# List the directory contents through the shell (IPython `!` syntax, not plain Python).
!ls $data_path

aan0693_Code-for-fitting-EC50-and-unfolded-state-model.bundle
aan0693_designed-PDB-files
aan0693_designed-PDB-files.gz
aan0693_SI_datasets
aan0693_SI_datasets.tar.gz
stability_test.csv
stability_train.csv
stability_valid.csv


In [49]:
def open_sets(base_path):
    """Load every ``*.csv`` file directly inside ``base_path`` as a DataFrame.

    Each file is keyed by the second ``_``-separated token of its file name
    without the extension, e.g. ``stability_train.csv`` -> ``'train'``.
    If two files map to the same key, the one visited last wins.

    Args:
        base_path: directory to scan; must provide ``glob`` (e.g. ``pathlib.Path``).

    Returns:
        dict mapping the extracted key to the DataFrame read from that file.

    Raises:
        IndexError: if a CSV file name contains no ``_``.
    """
    return {
        csv_file.stem.split('_')[1]: pd.read_csv(csv_file)
        for csv_file in base_path.glob('*.csv')
    }

# Read all CSV files found under data_path into a dict of DataFrames.
sets = open_sets(data_path)
# Display the DataFrame stored under the 'train' key.
sets['train']

Unnamed: 0,sequence,consensus_stability_score
0,GSSQETIEVEDEEEARRVAKELRKKGYEVKDERRGNKWHVHRT,0.37
1,TLDEARELVERAKKEGTGMDVNGQRFEDWREAERWVREQEKNK,0.62
2,TELKKKLEEALKKGEEVRVKFNGIEIRNTSEDAARKAVELLEK,-0.03
3,GSSQETIEVEDEEEARRVAKELRKTGYEVKIERRGNKWHVHRT,1.41
4,TTIHVGDLTLKYDNPKKAYEIAKKLAKKYNLQVTIKNGKITVT,1.11
...,...,...
7705,GSSKTQYEYDTKEEHQKAYEKFKKQGIPVTITQKNGKWFVQVE,0.80
7706,TIDEIIKALEQAVKDNKPIQVGNYTVTSADEAEKLAKKLKKPY,0.82
7707,TQDEIIKALEQAVKDNKPIQVGNYTVTSADEAEKLAKKLKKEY,0.66
7708,TTIKVNGQEYTVPLSPEQAAKAAKKRWPDYEVQIHGNTVWVTR,1.05


## Data Duplication

In [4]:
# True when at least one training sequence appears more than once.
not sets['train']['sequence'].is_unique

False

## Alignments

In [19]:
# NOTE(review): this import sits mid-notebook; a fresh-kernel run only discovers
# a missing Biopython install once execution reaches this cell.
# NOTE(review): Biopython documents Bio.pairwise2 as deprecated in favour of
# Bio.Align.PairwiseAligner - confirm against the installed version.
from Bio import pairwise2

# Global alignment of the first two training sequences (index labels 0 and 1).
# Per the Biopython docs, the `xx` variant scores 1 per match with no mismatch
# or gap penalties.
alignment = pairwise2.align.globalxx(sets['train']['sequence'][0], sets['train']['sequence'][1])
# Only the first alignment in the returned list is printed.
print(pairwise2.format_alignment(*alignment[0]))

GSSQETIEVE-DEEEARRVAKEL--R-KKGYEVK----D------E--R---RGNKWHVHRT------
     |     |  || |   ||  | ||  |      |      |  |   |   | | |       
-----T----LD--EA-R---ELVERAKK--E--GTGMDVNGQRFEDWREAER---W-V-R-EQEKNK
  Score=18



## Score per metric

* More unique values => more stability?
* Does sequence length matter?


In [5]:
def append_to_key(d, k, v):
    """Append ``v`` to the list stored under ``k`` in ``d``.

    A new single-element list is created when ``k`` is not yet present.
    ``d`` is modified in place and nothing is returned.

    Args:
        d: dictionary whose values are lists.
        k: key to append under.
        v: value to append.
    """
    d.setdefault(k, []).append(v)
        
def compute_mean_scores(d):
    """Summarise each list of scores in ``d`` as ``(mean, count)``.

    Args:
        d: dictionary mapping a key to a list of numeric scores.

    Returns:
        OrderedDict mapping each key of ``d`` to a tuple of
        (``np.mean`` of its scores, number of scores), with entries
        inserted in sorted order of the ``(key, summary)`` items.
    """
    summary = {key: (np.mean(d[key]), len(d[key])) for key in d}

    ordered = OrderedDict()
    for key, stats in sorted(summary.items()):
        ordered[key] = stats
    return ordered

In [7]:
# Group the training stability scores two ways:
#   score_per_length     : sequence length              -> list of scores
#   score_per_uniqueness : number of distinct residues  -> list of scores
score_per_length = dict()
score_per_uniqueness = dict()

# Every distinct amino-acid letter seen across the training sequences.
unique_aminos = set()

# Columns are selected by name instead of unpacking iterrows() positionally,
# so the loop does not depend on the frame having exactly two columns.
for seq, score in zip(sets['train']['sequence'],
                      sets['train']['consensus_stability_score']):
    # score_per_length
    size = len(seq)
    append_to_key(score_per_length, size, score)

    # score_per_uniqueness
    uniques = set(seq)
    uniq_size = len(uniques)
    append_to_key(score_per_uniqueness, uniq_size, score)

    # count unique aminoacids: the previous loop re-added each member of
    # `uniques` to `uniques` itself, which never changed anything. Accumulate
    # into the dataset-wide set instead; len(unique_aminos) is the count.
    unique_aminos.update(uniques)


In [8]:
# For each sequence length: (mean stability score, number of sequences).
compute_mean_scores(score_per_length)

OrderedDict([(43, (1.0939332224993072, 7218)),
             (46, (0.5659959349593496, 492))])

In [9]:
# For each count of distinct characters in a sequence: (mean stability score, number of sequences).
compute_mean_scores(score_per_uniqueness)

OrderedDict([(11, (1.1877272727272727, 22)),
             (12, (1.1719895287958115, 382)),
             (13, (1.1024068598462446, 1691)),
             (14, (1.04685173089484, 1531)),
             (15, (1.1661388550548113, 1642)),
             (16, (1.1496199999999999, 1000)),
             (17, (0.7856092436974791, 476)),
             (18, (0.8948214285714284, 168)),
             (19, (0.8083333333333332, 798))])

## By Score

In [41]:
# Training rows ordered by ascending stability score. reset_index() renumbers
# the rows 0..n-1 and keeps the previous index as an 'index' column.
sorted_train = (
    sets['train']
    .sort_values(by='consensus_stability_score')
    .reset_index()
)

In [48]:
# Align the two lowest-scoring training sequences (rows 0 and 1 of the
# ascending sort by consensus_stability_score).
alignment = pairwise2.align.globalxx(sorted_train['sequence'][0], sorted_train['sequence'][1])
# Only the first alignment in the returned list is printed.
print(pairwise2.format_alignment(*alignment[0]))

# Show the 20 lowest-scoring rows.
sorted_train.head(20)

RKWEE--IA-----ER----LREEFNIN-PEE-AREAVEKAGGNEEEAR--R-P---VK-------K--RL
       |      |     |        | | |  |  ||      |   | |   |        |  | 
-----TTI-KVNGQE-YTVPL-------SP-EQA--A--KA------A-KKRWPDYEV-QIHGNTYKVTR-
  Score=15



Unnamed: 0,index,sequence,consensus_stability_score
0,5537,RKWEEIAERLREEFNINPEEAREAVEKAGGNEEEARRPVKKRL,-0.26
1,122,TTIKVNGQEYTVPLSPEQAAKAAKKRWPDYEVQIHGNTYKVTR,-0.13
2,7529,TELKKKREEALKKGEEVRVKFNGIEIRITSEDAARKAVELLEK,-0.12
3,2520,GMADEEKLPPGWESRMSRSSGRVYYFNHITNASQWERPSGGSS,-0.11
4,2471,GSSGSLSDEDFKAVFGMTRSAFANLPLWKQQNPKKEKGLFGSS,-0.1
5,6493,GSSGSLSDEDFKAVGGMTRSAFANLPLWKQQNLKKEKGLFGSS,-0.1
6,4036,GSSGSLSDEDFKAVDGMTRSAFANLPLWKQQNLKKEKGLFGSS,-0.1
7,643,TELKKKLEEALKKGEEVRVKFNGIEIRDTSEDAARKAVELLEK,-0.1
8,2442,GSSGSLSDEDFKAVEGMTRSAFANLPLWKQQNLKKEKGLFGSS,-0.09
9,5356,GSSGSLSDEDFKAVFPMTRSAFANLPLWKQQNLKKEKGLFGSS,-0.09


In [47]:
# Align the two highest-scoring training sequences: the last row first, then
# the second-to-last row of the ascending sort.
alignment = pairwise2.align.globalxx(*sorted_train['sequence'].iloc[[-1, -2]])

# Only the first alignment in the returned list is printed.
print(pairwise2.format_alignment(*alignment[0]))

# Show the 20 highest-scoring rows.
sorted_train.tail(20)

GSSET-RYRFTDEEE-ARRAAKEWARRGYQVHVTQNGTYWEVEVR
||| | |||||| || |||||||||||||||||||||||||||||
GSS-TTRYRFTD-EEKARRAAKEWARRGYQVHVTQNGTYWEVEVR
  Score=41



Unnamed: 0,index,sequence,consensus_stability_score
7690,1367,ERRKIEEIAKKLYQSGNPEAARRFLRKRGISEEEIERILQKAG,1.86
7691,7209,ERRKIEEIAWKLYQSGNPEAARRFLRKAGISEEEIERILQKAG,1.87
7692,3917,GSSKTQYEYDTKEEAQKAYFKFKKQGIPVTITQKNGKWFVQVE,1.87
7693,401,GSSKTQYEYDTKEKAQKAYEKFKKQGIPVTITQKNGKWFVQVE,1.88
7694,2842,ERRKIEEIAFKLYQSGNPEAARRFLRKAGISEEEIERILQKAG,1.88
7695,2199,ERRKIEEIAYKLYQSGNPEAARRFLRKAGISEEEIERILQKAG,1.88
7696,6055,GSSKTQYEYDTKEEAQKAYEKFKKQGIMVTITQKNGKWFVQVE,1.88
7697,3094,GSSTTRYRFTDEEEARRYAKEWARRGYQVHVTQNGTYWEVEVR,1.9
7698,3512,GSSTTRYRFTDEEEARRWAKEWARRGYQVHVTQNGTYWEVEVR,1.9
7699,5293,GSSQETIEVEDEEEARRVAKELRKKGYQVKIERRGNKWHVHRT,1.93


* Diversity Maximizing: This is a greedy strategy which starts from the reference and adds the sequence with the highest average Hamming distance to the current set of sequences.

* Diversity Minimizing: This strategy is equivalent to the Diversity Maximizing strategy, but adds the sequence with the lowest average Hamming distance. It is used to explore the effects of diversity on model performance.

* HHFilter: This strategy applies hhfilter (Steinegger et al., 2019) with the `-diff M` parameter, which returns M or more sequences that maximize diversity (the result is usually close to M). If more than M sequences are returned, we apply the Diversity Maximizing strategy on top of the output.