# Analyze evotuned models

Examine the sequences sampled by evotuned models. Do they appear antibody-like? How similar or dissimilar are they to the training data?

In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from unirep import babbler1900

In [2]:
# Batch size passed to every babbler1900 instance in this notebook.
BATCH_SZ = 256

# Available weight dirs, one per evotuned model.
VH_WEIGHTS = "vh_weights"
VL_WEIGHTS = "vl_weights"
VH_VL_WEIGHTS = "vh_vl_weights"

# Directory the sampled sequences are written to (the "samples/..." paths below).
SAMPLES_DIR = "samples"

In [3]:
def generate_seqs(model, n=10, temp=1.0, max_len=300) -> list:
    """Draw ``n`` samples from ``model`` and return them as a list.

    Each sample is the result of one ``model.get_babble`` call seeded with the
    empty string, using ``length=max_len`` and the given ``temp``.

    Args:
        model: object exposing ``get_babble(seed, length=..., temp=...)``.
        n: number of samples to draw.
        temp: sampling temperature forwarded to ``get_babble``.
        max_len: value forwarded as ``length`` to ``get_babble``.

    Returns:
        List with the ``n`` values returned by ``get_babble``, in call order.
    """
    return [
        model.get_babble("", length=max_len, temp=temp)
        for _draw in range(n)
    ]


In [4]:
def to_fasta(seqs:list, file:str, name_prefix:str):
    """Write ``seqs`` to ``file`` in FASTA format.

    Record ``i`` (1-based) is written as a header line ``>{name_prefix}{i}``
    followed by the sequence on its own line. An existing file is overwritten.
    Missing parent directories of ``file`` are created first, so writing into
    an output directory that does not exist yet no longer fails.

    Args:
        seqs: sequences to write, in output order.
        file: path of the FASTA file to create.
        name_prefix: prefix placed before the 1-based index in each header.
    """
    parent_dir = os.path.dirname(file)
    # A bare file name has no directory component; nothing to create then.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file, "w") as fasta:
        fasta.writelines(
            ">{}{}\n{}\n".format(name_prefix, idx, seq)
            for idx, seq in enumerate(seqs, start=1)
        )


## VH model

Sample seqs from the VH model using a range of temperatures and examine the resulting sequences

In [5]:
# Instantiate babbler1900 with the weights stored in the VH_WEIGHTS directory.
vh_model = babbler1900(model_path=VH_WEIGHTS, batch_size=BATCH_SZ)

  from ._conv import register_converters as _register_converters


In [6]:
# Sample 10 sequences from the VH model at temp=1.0, the higher of the two
# temperatures used in this notebook ("_ht" presumably means high temperature).
vh_seqs_ht = generate_seqs(vh_model, n=10, temp=1.0)

In [7]:
# Sample 10 sequences from the VH model at temp=0.5, the lower of the two
# temperatures used in this notebook ("_lt" presumably means low temperature).
vh_seqs_lt = generate_seqs(vh_model, n=10, temp=0.5)

In [8]:
# Write all VH samples (temp=1.0 first, then temp=0.5) to one FASTA file.
# The path is built from SAMPLES_DIR instead of repeating the directory name.
to_fasta(vh_seqs_ht + vh_seqs_lt, os.path.join(SAMPLES_DIR, "vh_samples.fasta"), name_prefix="VH")

In [9]:
# Clear variables from the default TF graph -> needed before loading a new model.
tf.reset_default_graph()

## VL model

Sample seqs from the VL model using a range of temperatures and examine the resulting sequences

In [10]:
# Instantiate babbler1900 with the weights stored in the VL_WEIGHTS directory.
vl_model = babbler1900(model_path=VL_WEIGHTS, batch_size=BATCH_SZ)

In [11]:
# Sample 10 sequences from the VL model at temp=1.0, the higher of the two
# temperatures used in this notebook ("_ht" presumably means high temperature).
vl_seqs_ht = generate_seqs(vl_model, n=10, temp=1.0)

In [12]:
# Sample 10 sequences from the VL model at temp=0.5, the lower of the two
# temperatures used in this notebook ("_lt" presumably means low temperature).
vl_seqs_lt = generate_seqs(vl_model, n=10, temp=0.5)

In [13]:
# Write all VL samples (temp=1.0 first, then temp=0.5) to one FASTA file.
# The path is built from SAMPLES_DIR instead of repeating the directory name.
to_fasta(vl_seqs_ht + vl_seqs_lt, os.path.join(SAMPLES_DIR, "vl_samples.fasta"), name_prefix="VL")

In [14]:
# Clear variables from the default TF graph -> needed before loading a new model.
tf.reset_default_graph()

## Create pairs for batch homology modelling

Input for Schrodinger AB structure prediction.

In [15]:
# Pair the i-th sampled VH with the i-th sampled VL (temp=1.0 samples first,
# then temp=0.5), one row per pair, named sample_1, sample_2, ...
vh_seqs = vh_seqs_ht + vh_seqs_lt
vl_seqs = vl_seqs_ht + vl_seqs_lt
vh_vl_df = pd.DataFrame({
    "Name": ["sample_{}".format(idx) for idx in range(1, len(vh_seqs) + 1)],
    "VH": vh_seqs,
    "VL": vl_seqs,
})
vh_vl_df

Unnamed: 0,Name,VH,VL
0,sample_1,EVQLVETGGGLIQPGGSLTLSCAASGFTVSNNYMSWGRQAPGKGLE...,QSVLTQPRSVSGAPGQRVTISCTGSNSNIGAGYDVHWYQQLPGRAP...
1,sample_2,QVQLVQSGAEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLE...,QSALTQPRSVSGSTGQSVTIYCSGSRSDVGGYKYVSWYQQHPGKAP...
2,sample_3,EVQLVESAGGLVKPGGSLRLSCRASGFPVSSYGMSWVRQAPGKGPE...,SGQTQPLSVSGSPGPRTTISCTGTSNDVGGNSDVSWYQQRPGTAPK...
3,sample_4,QVQLVESGGGLVKPGRSLRLSCAASGFTFSQYSMSWVRQAPGKGLE...,QLVLTQSPSASASLGASVKLTCTLNSEHGSYPIAWHQRQTEKGPRY...
4,sample_5,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGRGLE...,QTVVTQEPSFSVSPGGTVTLTCGSRSGAVTAGHYPAWFQQKPGQAP...
5,sample_6,EVQLLESGGDLVQPGGSLRLSCAASGFTFSIFAMTWVRQAPGKGLE...,QSVLTPPPSVSGAQGQKLTIPCTRSSSNIGAGYDVHWYQQLPGTAP...
6,sample_7,QLQLQESGPGLVKPSETLSLTCTVSGGSLSRSSDYWAWSRPPPKKG...,QSVLAQPPSASGTPGQRVTIPCSGSSSNIRSNAVNWYQQLPGTAPK...
7,sample_8,EVQLVESGGGLVKPGGSLRLSCEASGFTFSSYSMNWVRQAPGKGLE...,QSALTQPASVSGSPGQSITISCTGSLSDVGTYYGKWLPQTPGQAHH...
8,sample_9,EVQLVESGGGLVKPGGSLRLSCAASGFTFSSYNMNWVRQAPGKGLE...,SVLTQPPSVSGAPGQRVTIPCSGSRSNIGAGYDVHWYQQLPGTAPK...
9,sample_10,EVQLLESGGGLAQPGGSLKVSCTASGFTFRSTAMSWVRQAPGQGLE...,QSVLTQPPSASGTPGQRVTIPCSGTSSDIGSNYVYWYQHLPGTAPK...


In [16]:
# Save the VH/VL pairs without the index column.
# The path is built from SAMPLES_DIR instead of repeating the directory name.
vh_vl_df.to_csv(os.path.join(SAMPLES_DIR, "vh_vl_paired.csv"), index=False)

## VH.VL paired model

The generated sequences need to be split into their VH and VL parts, which may be very difficult.

In [17]:
# Instantiate babbler1900 with the weights stored in the VH_VL_WEIGHTS directory.
vh_vl_model = babbler1900(model_path=VH_VL_WEIGHTS, batch_size=BATCH_SZ)

In [18]:
# Sample 10 sequences from the paired VH.VL model at temp=1.0, the higher of the
# two temperatures used in this notebook ("_ht" presumably means high temperature).
vh_vl_seqs_ht = generate_seqs(vh_vl_model, n=10, temp=1.0)

In [19]:
# Sample 10 sequences from the paired VH.VL model at temp=0.5, the lower of the
# two temperatures used in this notebook ("_lt" presumably means low temperature).
vh_vl_seqs_lt = generate_seqs(vh_vl_model, n=10, temp=0.5)

In [21]:
# Write all paired VH.VL samples (temp=1.0 first, then temp=0.5) to one FASTA file.
# The path is built from SAMPLES_DIR instead of repeating the directory name.
to_fasta(vh_vl_seqs_ht + vh_vl_seqs_lt, os.path.join(SAMPLES_DIR, "vh_vl_samples.fasta"), name_prefix="VH_VL")

Performing MSA on the sampled VH.VL sequences shows that they separate fairly well and that CDRs can be annotated by Schrodinger.