In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from collections import OrderedDict

In [2]:
# Directory that holds the stability dataset files (CSV, HDF5 and pickled
# embeddings, as shown by the listing below). Relative to the notebook's cwd.
data_path = Path('data')

!ls $data_path

stability_embeddings_test.pkl	stability_test.csv   stability_train.h5
stability_embeddings_train.pkl	stability_test.h5
stability_embeddings_val.pkl	stability_train.csv


In [3]:
def open_sets(base_path):
    """Load every ``*.csv`` file found directly under ``base_path``.

    The dictionary key is the second ``_``-separated token of the file stem
    (``stability_train.csv`` -> ``"train"``); the value is the DataFrame
    returned by ``pd.read_csv`` for that file. When two files map to the same
    key, the one yielded last by ``glob`` wins.

    Parameters
    ----------
    base_path : pathlib.Path
        Directory to scan for CSV files.

    Returns
    -------
    dict
        Mapping ``kind -> DataFrame``.
    """
    return {
        csv_file.stem.split('_')[1]: pd.read_csv(csv_file)
        for csv_file in base_path.glob('*.csv')
    }

# Load all CSV splits found in data_path, keyed by split name, and display the
# training frame.
sets = open_sets(data_path)
sets['train']

Unnamed: 0,sequence,consensus_stability_score
0,GSSQETIEVEDEEEARRVAKELRKKGYEVKDERRGNKWHVHRT,0.37
1,TLDEARELVERAKKEGTGMDVNGQRFEDWREAERWVREQEKNK,0.62
2,TELKKKLEEALKKGEEVRVKFNGIEIRNTSEDAARKAVELLEK,-0.03
3,GSSQETIEVEDEEEARRVAKELRKTGYEVKIERRGNKWHVHRT,1.41
4,TTIHVGDLTLKYDNPKKAYEIAKKLAKKYNLQVTIKNGKITVT,1.11
...,...,...
7705,GSSKTQYEYDTKEEHQKAYEKFKKQGIPVTITQKNGKWFVQVE,0.80
7706,TIDEIIKALEQAVKDNKPIQVGNYTVTSADEAEKLAKKLKKPY,0.82
7707,TQDEIIKALEQAVKDNKPIQVGNYTVTSADEAEKLAKKLKKEY,0.66
7708,TTIKVNGQEYTVPLSPEQAAKAAKKRWPDYEVQIHGNTVWVTR,1.05


## Data Duplication

In [4]:
# True would mean at least one training sequence appears more than once.
len(sets['train']['sequence'].unique()) != len(sets['train'])

False

## Alignments

In [5]:
from Bio import pairwise2

# Global alignment of the first two training sequences; only the first
# alignment returned by globalxx is printed.
alignment = pairwise2.align.globalxx(sets['train']['sequence'][0], sets['train']['sequence'][1])
print(pairwise2.format_alignment(*alignment[0]))

GSSQETIEVE-DEEEARRVAKEL--R-KKGYEVK----D------E--R---RGNKWHVHRT------
     |     |  || |   ||  | ||  |      |      |  |   |   | | |       
-----T----LD--EA-R---ELVERAKK--E--GTGMDVNGQRFEDWREAER---W-V-R-EQEKNK
  Score=18



## Score per metrics

* More unique values => more stability?
* Does sequence length matter?


In [6]:
def append_to_key(d, k, v):
    """Append ``v`` to the list stored at ``d[k]``, creating the list if needed.

    The dictionary is modified in place; nothing is returned.
    """
    d.setdefault(k, []).append(v)
        
def compute_mean_scores(d):
    """Summarise a ``key -> list of scores`` mapping.

    Returns an ``OrderedDict`` sorted by key whose values are
    ``(mean of the scores, number of scores)`` tuples.
    """
    summary = {
        key: (np.mean(scores), len(scores))
        for key, scores in d.items()
    }
    return OrderedDict(sorted(summary.items()))

In [7]:
# Group the stability scores of the training set by
#   * sequence length                      -> score_per_length
#   * number of distinct residues present  -> score_per_uniqueness
score_per_length = dict()
score_per_uniqueness = dict()

# Iterate over the two named columns instead of unpacking whole rows, so the
# cell keeps working when extra columns are added to the frame.
for seq, score in zip(sets['train']['sequence'], sets['train']['consensus_stability_score']):
    append_to_key(score_per_length, len(seq), score)
    append_to_key(score_per_uniqueness, len(set(seq)), score)


In [8]:
# (mean score, number of sequences) for each sequence length.
compute_mean_scores(score_per_length)

OrderedDict([(43, (1.0939332224993072, 7218)),
             (46, (0.5659959349593496, 492))])

In [9]:
# (mean score, number of sequences) for each count of distinct residues.
compute_mean_scores(score_per_uniqueness)

OrderedDict([(11, (1.1877272727272727, 22)),
             (12, (1.1719895287958115, 382)),
             (13, (1.1024068598462446, 1691)),
             (14, (1.04685173089484, 1531)),
             (15, (1.1661388550548113, 1642)),
             (16, (1.1496199999999999, 1000)),
             (17, (0.7856092436974791, 476)),
             (18, (0.8948214285714284, 168)),
             (19, (0.8083333333333332, 798))])

## By Score

In [10]:
# Training set ordered by ascending stability score. reset_index() keeps the
# original row labels in an "index" column and renumbers rows from 0.
sorted_train = sets['train'].sort_values(by='consensus_stability_score').reset_index()

In [11]:
# Align the two lowest-scoring sequences (rows 0 and 1 after the ascending sort).
alignment = pairwise2.align.globalxx(sorted_train['sequence'][0], sorted_train['sequence'][1])
print(pairwise2.format_alignment(*alignment[0]))

sorted_train.head(20)

RKWEE--IA-----ER----LREEFNIN-PEE-AREAVEKAGGNEEEAR--R-P---VK-------K--RL
       |      |     |        | | |  |  ||      |   | |   |        |  | 
-----TTI-KVNGQE-YTVPL-------SP-EQA--A--KA------A-KKRWPDYEV-QIHGNTYKVTR-
  Score=15



Unnamed: 0,index,sequence,consensus_stability_score
0,5537,RKWEEIAERLREEFNINPEEAREAVEKAGGNEEEARRPVKKRL,-0.26
1,122,TTIKVNGQEYTVPLSPEQAAKAAKKRWPDYEVQIHGNTYKVTR,-0.13
2,7529,TELKKKREEALKKGEEVRVKFNGIEIRITSEDAARKAVELLEK,-0.12
3,2520,GMADEEKLPPGWESRMSRSSGRVYYFNHITNASQWERPSGGSS,-0.11
4,2471,GSSGSLSDEDFKAVFGMTRSAFANLPLWKQQNPKKEKGLFGSS,-0.1
5,6493,GSSGSLSDEDFKAVGGMTRSAFANLPLWKQQNLKKEKGLFGSS,-0.1
6,4036,GSSGSLSDEDFKAVDGMTRSAFANLPLWKQQNLKKEKGLFGSS,-0.1
7,643,TELKKKLEEALKKGEEVRVKFNGIEIRDTSEDAARKAVELLEK,-0.1
8,2442,GSSGSLSDEDFKAVEGMTRSAFANLPLWKQQNLKKEKGLFGSS,-0.09
9,5356,GSSGSLSDEDFKAVFPMTRSAFANLPLWKQQNLKKEKGLFGSS,-0.09


In [12]:
# Align the two highest-scoring sequences (the last two rows of the ascending sort).
alignment = pairwise2.align.globalxx(
    sorted_train['sequence'][len(sorted_train)-1], 
    sorted_train['sequence'][len(sorted_train)-2]
)

print(pairwise2.format_alignment(*alignment[0]))

sorted_train.tail(20)

GSSET-RYRFTDEEE-ARRAAKEWARRGYQVHVTQNGTYWEVEVR
||| | |||||| || |||||||||||||||||||||||||||||
GSS-TTRYRFTD-EEKARRAAKEWARRGYQVHVTQNGTYWEVEVR
  Score=41



Unnamed: 0,index,sequence,consensus_stability_score
7690,1367,ERRKIEEIAKKLYQSGNPEAARRFLRKRGISEEEIERILQKAG,1.86
7691,7209,ERRKIEEIAWKLYQSGNPEAARRFLRKAGISEEEIERILQKAG,1.87
7692,3917,GSSKTQYEYDTKEEAQKAYFKFKKQGIPVTITQKNGKWFVQVE,1.87
7693,401,GSSKTQYEYDTKEKAQKAYEKFKKQGIPVTITQKNGKWFVQVE,1.88
7694,2842,ERRKIEEIAFKLYQSGNPEAARRFLRKAGISEEEIERILQKAG,1.88
7695,2199,ERRKIEEIAYKLYQSGNPEAARRFLRKAGISEEEIERILQKAG,1.88
7696,6055,GSSKTQYEYDTKEEAQKAYEKFKKQGIMVTITQKNGKWFVQVE,1.88
7697,3094,GSSTTRYRFTDEEEARRYAKEWARRGYQVHVTQNGTYWEVEVR,1.9
7698,3512,GSSTTRYRFTDEEEARRWAKEWARRGYQVHVTQNGTYWEVEVR,1.9
7699,5293,GSSQETIEVEDEEEARRVAKELRKKGYQVKIERRGNKWHVHRT,1.93


* Diversity Maximizing: This is a greedy strategy which starts from the reference and adds the sequence with highest average hamming distance to current set of sequences.

* Diversity Minimizing: This strategy is equivalent to the Diversity Maximizing strategy, but adds the sequence with lowest average hamming distance. It is used to explore the effects of diversity on model performance.

* HHFilter: This strategy applies hhfilter (Steinegger et al., 2019) with the -diff M parameter, which returns M or more sequences that maximize diversity (the result is usually close to M). If more than M sequences are returned we apply the Diversity Maximizing strategy on top of the output.

In [13]:
# Score of the first alignment computed in the previous alignment cell.
alignment[0].score

41.0

In [152]:
from scipy.spatial import distance

import random
import numpy as np
from numba import jit

from tqdm.autonotebook import tqdm

def compute_diversity(seq1, seq2, method = "hamming"):
    """Compare two sequences with the requested pairwise measure.

    Parameters
    ----------
    seq1, seq2 : str
        Sequences to compare.
    method : {"hamming", "alignment"}
        * ``"hamming"``: fraction of positions at which the sequences differ
          (``scipy.spatial.distance.hamming``). Sequences of different length
          cannot be compared position-wise, so ``np.nan`` is returned.
        * ``"alignment"``: score of the first ``pairwise2.align.globalxx``
          alignment divided by the length of the longer sequence.
          NOTE(review): this value grows as the sequences get more alike,
          whereas the hamming value grows as they differ; confirm which
          direction callers expect before mixing the two.

    Returns
    -------
    float
        The measure described above.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == "alignment":
        alignments = pairwise2.align.globalxx(seq1, seq2)
        return alignments[0].score / max(len(seq1), len(seq2))

    if method == "hamming":
        if len(seq1) != len(seq2):
            return np.nan
        return distance.hamming(list(seq1), list(seq2))

    raise ValueError(
        f"Unknown diversity method {method!r}; expected 'hamming' or 'alignment'"
    )


def dataset_diversity(sequences, method = "hamming", reduce = "mean", verbose = True):
    """Compute the pairwise diversity of every sequence against all the others.

    Parameters
    ----------
    sequences : array-like of str
        Sequences to compare (list, NumPy array or pandas Series).
    method : str
        Pairwise measure, forwarded to ``compute_diversity``.
    reduce : str
        Only ``"mean"`` is accepted. The value is validated but no reduction
        is applied here: callers reduce the returned matrix themselves
        (e.g. ``np.nanmean(result, 1)`` per sequence or ``np.nanmean(result)``
        overall).
    verbose : bool
        Show a tqdm progress bar with one tick per sequence.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, n - 1)``; row ``i`` holds the diversity of
        sequence ``i`` against every other sequence, in input order. An empty
        array is returned when fewer than two sequences are given.

    Raises
    ------
    NotImplementedError
        If ``reduce`` is anything other than ``"mean"``.
    """
    if reduce != "mean":
        raise NotImplementedError

    sequences = np.asarray(sequences)
    pbar = tqdm(total = len(sequences), miniters = 1, smoothing = 1) if verbose else None

    divs = []
    try:
        for seq_idx, seq in enumerate(sequences):
            other_sequences = np.delete(sequences, seq_idx)

            # Nothing to compare against (single-sequence input).
            if len(other_sequences) == 0:
                continue

            divs.append(
                np.array([compute_diversity(seq, other, method) for other in other_sequences])
            )

            if pbar is not None:
                pbar.update(1)
                pbar.refresh()
    finally:
        # Close the bar even if a comparison raises.
        if pbar is not None:
            pbar.close()

    return np.array(divs)

def sample_by_diversity(sequences, size, min_score = 0.7, args = {"verbose" : False}):
    """Randomly draw ``size`` distinct sequences whose mean diversity is >= ``min_score``.

    Candidates are drawn uniformly at random. A candidate is kept when the
    sample holds fewer than two sequences, or when adding it keeps the mean
    pairwise diversity (``dataset_diversity``) at or above ``min_score``.
    Accepted candidates are removed from the pool; rejected ones stay and may
    be drawn again later.

    Parameters
    ----------
    sequences : array-like of str
        Candidate pool. The caller's object is not modified.
    size : int
        Number of sequences to sample.
    min_score : float
        Minimum mean diversity the sample must keep.
    args : dict
        Extra keyword arguments for ``dataset_diversity``. Only read, never
        mutated, so the shared default is safe.

    Returns
    -------
    set
        The sampled sequences.

    Raises
    ------
    ValueError
        If the pool runs out, if every remaining candidate has been rejected
        for the current sample, or if the final sample fails the diversity
        check.
    """
    check_diversity = lambda ds : np.nanmean(dataset_diversity(np.array(list(ds)), **args)) >= min_score

    sequences = np.asarray(sequences)
    # size // 10 is 0 for size < 10; clamp to avoid a modulo-by-zero.
    report_every = max(size // 10, 1)

    sampled_seqs = set()
    # Candidates rejected since the last acceptance. The check is deterministic
    # for a fixed sample, so once every remaining candidate is in here no
    # further progress is possible.
    rejected = set()

    while len(sampled_seqs) < size:
        if len(sequences) == 0 or all(seq in rejected for seq in sequences):
            raise ValueError(
                f"Could not sample {size} sequences with mean diversity >= {min_score}: "
                f"no acceptable candidate left after {len(sampled_seqs)} picks"
            )

        idx_sample = random.sample(range(len(sequences)), 1)
        candidate = sequences[idx_sample][0]

        is_empty = len(sampled_seqs) == 0 or len(sampled_seqs) == 1

        if is_empty or check_diversity(sampled_seqs.union({candidate})):
            sampled_seqs.add(candidate)
            sequences = np.delete(sequences, idx_sample)
            rejected.clear()
            if len(sampled_seqs) % report_every == 0:
                print(len(sampled_seqs))
        else:
            rejected.add(candidate)

    # check diversity
    if not check_diversity(sampled_seqs):
        raise ValueError("Sampled diversity error")

    return sampled_seqs

def select_by_diversity(sequences, min_size, min_score = 0.7, args = {"verbose" : False}):
    """Greedily pick sequences, in input order, that keep mean diversity >= ``min_score``.

    Candidates are visited in the order given. A candidate is accepted when
    fewer than two sequences have been chosen, or when the chosen set plus the
    candidate has a mean pairwise diversity (``dataset_diversity``) of at
    least ``min_score``; otherwise it is discarded.

    The walk stops when any of these holds:
      * more than ``min_size`` sequences have been chosen;
      * the selection is within 20% of ``min_size``
        (``len(chosen) * 1.2 >= min_size``) and passes the diversity check;
      * more than ``max_iters`` (10) candidates in a row were rejected;
      * the candidate pool is exhausted.

    Parameters
    ----------
    sequences : array-like of str
        Candidate pool, already in priority order. Not modified.
    min_size : int
        Target number of sequences.
    min_score : float
        Minimum mean diversity required.
    args : dict
        Extra keyword arguments for ``dataset_diversity``. Only read, never
        mutated, so the shared default is safe.

    Returns
    -------
    list
        The chosen sequences, in selection order.
    """
    check_diversity = lambda ds : np.nanmean(dataset_diversity(np.array(list(ds)), **args)) >= min_score
    around_size = lambda seqs: len(seqs) * 1.2 >= min_size

    chosen_seqs = []
    max_iters = 10
    current_iters = 0  # consecutive rejections since the last acceptance
    last_diversity = False

    with tqdm(total = min_size, miniters = 1, smoothing = 1) as pbar:
        for candidate in np.asarray(sequences):
            if len(chosen_seqs) > min_size:
                break

            # Evaluated on every iteration: with fewer than two chosen
            # sequences there is nothing meaningful to check yet.
            is_empty = len(chosen_seqs) == 0 or len(chosen_seqs) == 1

            if is_empty or check_diversity(set(chosen_seqs).union({candidate})):
                chosen_seqs.append(candidate)
                last_diversity = check_diversity(chosen_seqs)

                current_iters = 0
                pbar.update(1)
                pbar.refresh()

                if around_size(chosen_seqs) and last_diversity:
                    break

            elif current_iters >= max_iters:
                break
            else:
                # Rejected: move on to the next candidate and count the miss.
                current_iters += 1

    return chosen_seqs

In [153]:
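# NOTE: disabled (presumably because the all-pairs computation is slow). This is
# the only place that creates the "diversity" column of sets["train"], which the
# sorting cell further down requires.
# ds_div_train = dataset_diversity(sets["train"]["sequence"])
# sets["train"]["diversity"] = np.nanmean(np.array(ds_div_train), 1)
# np.nanmean(np.array(ds_div_train))
# 0.8xxx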

In [154]:
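# NOTE: disabled (presumably because the all-pairs computation is slow). This is
# the only place that creates the "diversity" column of sets["test"], which the
# sorting cell further down requires.
# ds_div_test = dataset_diversity(sets["test"]["sequence"])
# sets["test"]["diversity"] = np.nanmean(np.array(ds_div_test), 1)
# np.nanmean(np.array(ds_div_test))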

In [43]:
# Persist the diversity-annotated frames. A frame is written only when it
# actually carries the "diversity" column, so re-running the notebook with the
# diversity computation disabled does not overwrite a previously saved file
# with data that lacks the column.
for kind in ("train", "test"):
    if "diversity" in sets[kind].columns:
        sets[kind].to_csv(f"{data_path}/stability_diversity_{kind}.csv")
    else:
        print(f"Skipping {kind}: no 'diversity' column to save")

In [53]:
def _frame_with_diversity(kind):
    """Return the ``kind`` split with a "diversity" column.

    Uses the in-memory frame when it already has the column; otherwise falls
    back to the ``stability_diversity_<kind>.csv`` file saved in ``data_path``
    (its first column is the index written by ``to_csv``).
    """
    frame = sets[kind]
    if "diversity" not in frame.columns:
        frame = pd.read_csv(f"{data_path}/stability_diversity_{kind}.csv", index_col=0)
    return frame


# Each split ordered from most to least diverse.
sorted_div = {
    kind: _frame_with_diversity(kind).sort_values(by="diversity", axis=0, ascending=False)
    for kind in ("train", "test")
}

In [None]:
# For each target size, pick a diverse subset of every listed split and save it
# as stability_diversity_<kind>_<size>.csv. The filtering and the write live
# inside the `kind` loop so that each split produces its own file.
for s in [450, 700, 1000]:
    for kind in ["train"]:
        chosen_by_diversity = select_by_diversity(
            sorted_div[kind]["sequence"].values,
            min_size=s,
            min_score=0.65
        )

        subset = sets[kind][sets[kind]["sequence"].isin(chosen_by_diversity)]
        subset.to_csv(f"{data_path}/stability_diversity_{kind}_{s}.csv")

  0%|          | 0/450 [00:00<?, ?it/s]

  check_diversity = lambda ds : np.nanmean(dataset_diversity(np.array(list(ds)), **args)) >= min_score


  0%|          | 0/700 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]