In [1]:
# Checking inferred models

# Genomic templates

## Load initial fasta files

In [2]:
import pygor3 as p3
import numpy as np

In [3]:
# W -> A or T
# Y -> C or T

In [4]:
df_V_ref_genome = p3.utils.get_dataframe_from_fasta(fln_fasta='Killifish_genome_template/v_no_imgt.fasta')
df_D_ref_genome = p3.utils.get_dataframe_from_fasta(fln_fasta='Killifish_genome_template/d_no_imgt.fasta')
df_J_ref_genome = p3.utils.get_dataframe_from_fasta(fln_fasta='Killifish_genome_template/j_no_imgt.fasta')

In [5]:
df_V_ref_genome

Unnamed: 0_level_0,name,value
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,IGHV1-01*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...
1,IGHV1-02*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...
2,IGHV2-01*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...
3,IGHV1-03*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...
4,IGHV3-01*01,AGTATTGATCTGATCCAGCCAGCCTCTAAAGCTGTGCAGCCTGGAC...
5,IGHV2-02*01,GGTCAGACTCTGACTCAGTCTGAACCAGTGGTTAAAAGACCAGGAG...
6,IGHV4-01*01,TAAACTTGATCTGAAGCAGCGCTGAAACGACCTGGAGAATCCCACA...
7,IGHV1-04*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...
8,IGHV2-03*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...
9,IGHV1-05*01,TGTGAACAGCTGACTCAACCAGCCTCTGTGACTGTGCAGCCAGCTC...


## Modify genomic templates

### V genes

In [6]:
df_V_ref_genome['len'] = df_V_ref_genome['value'].apply(lambda x: len(x))
df_V_ref_genome['num_no_ACGT'] = df_V_ref_genome['value'].apply(lambda x: len(x) - x.count('A') - x.count('C') - x.count('G') - x.count('T'))
df_V_ref_genome['family'] = df_V_ref_genome['name'].apply(lambda x: x.split('-')[0])
df_V_ref_genome

Unnamed: 0_level_0,name,value,len,num_no_ACGT,family
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,IGHV1-01*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,289,0,IGHV1
1,IGHV1-02*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,292,0,IGHV1
2,IGHV2-01*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...,295,0,IGHV2
3,IGHV1-03*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,275,0,IGHV1
4,IGHV3-01*01,AGTATTGATCTGATCCAGCCAGCCTCTAAAGCTGTGCAGCCTGGAC...,301,0,IGHV3
5,IGHV2-02*01,GGTCAGACTCTGACTCAGTCTGAACCAGTGGTTAAAAGACCAGGAG...,298,0,IGHV2
6,IGHV4-01*01,TAAACTTGATCTGAAGCAGCGCTGAAACGACCTGGAGAATCCCACA...,272,0,IGHV4
7,IGHV1-04*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,292,0,IGHV1
8,IGHV2-03*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...,295,1,IGHV2
9,IGHV1-05*01,TGTGAACAGCTGACTCAACCAGCCTCTGTGACTGTGCAGCCAGCTC...,289,0,IGHV1


In [7]:
df_V_ref_genome.loc[8]['name']

'IGHV2-03*01'

In [8]:
df_V_ref_genome.loc[10]['name']

'IGHV5-01*01'

There are 2 gene templates with no deined nucleotides genes with id: 8 and 10.
I need to check bifurcate the 2 genes:

- id:8, "IGHV2-03*01"
- id:10, "IGHV5-01*01"

Create a new nomenclature that is

- 24: "IGHV2-03*01A"  for Y to T
- 25: "IGHV2-03*01AA" for Y to C
- 26: "IGHV5-01*01A"  for W to T
- 27: "IGHV5-01*01AA"

so:
- 8 to 24 and 25
- 10 to 26 and 27

In [9]:
df_V_ref_genome[df_V_ref_genome['family'] == 'IGHV2']

Unnamed: 0_level_0,name,value,len,num_no_ACGT,family
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,IGHV2-01*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...,295,0,IGHV2
5,IGHV2-02*01,GGTCAGACTCTGACTCAGTCTGAACCAGTGGTTAAAAGACCAGGAG...,298,0,IGHV2
8,IGHV2-03*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...,295,1,IGHV2
11,IGHV2-04*01,GGCCAGACTCTGACAGAATCTGAACCAGCGGTTAGAAGACCTGGAG...,295,0,IGHV2
15,IGHV2-05*01,GGTCAGACTCTGACAGAATCTGAACCAGCTGTTAAAAGACCTGGAG...,298,0,IGHV2
17,IGHV2-02*02,GGTCAGACTCTGACAGAATCTGAACCAGCTGTTAGAAGACCTGGAG...,295,0,IGHV2
22,IGHV2-01*02,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...,295,0,IGHV2


In [10]:
df_V_ref_genome[df_V_ref_genome['family'] == 'IGHV5']

Unnamed: 0_level_0,name,value,len,num_no_ACGT,family
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,IGHV5-01*01,GGTCAGTCCCTCACCTCCTCWGAGCCAGTGGTCCACAGAGCAGGAG...,289,1,IGHV5
19,IGHV5-01*02,GGTCAGTCCCTCACCTCCTCAGAGCCAGTGGTCCACAGAGCAGGAG...,289,0,IGHV5


In [11]:
# lets start with the id=8 

In [12]:
# id = 8, IGHV2-03*01
# Bifucarte and check if bifurcation is not other template
str_gene_orig_id_8 = df_V_ref_genome.loc[8, ('value')]
str_V_IGHV2_03__01A = str_gene_orig_id_8.replace('Y', 'T')
str_V_IGHV2_03__01AA = str_gene_orig_id_8.replace('Y', 'C')
str_V_IGHV2_03__01A, str_V_IGHV2_03__01AA
# df_V_ref_genome[df_V_ref_genome['family'] == 'IGHV5']['value']

('GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAGAACCCCACAGACTCACCTGTACTGGTTCTGGTTTCACACTCAGCAGCTATGGAATGGCCTGGATCAGACAGGCAGCTGGAAAAGGACTGGAGTGGATTGCTATTATCTACAGCAGTGGTAGCGTCTTCTACTCTCAGTCAGTCCAAGGCCGGTTCTCCATCTCCAGAGAGAACAGCAGAAATCAGGTGTATCTGCAGATGAACACTCTGACCTCTGAGGATTCTGCTGTTTATTATTGTGCTCGAGAGC',
 'GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAGAACCCCACAGACTCACCTGTACTGGTTCTGGTTTCACACTCAGCAGCTATGGAATGGCCTGGATCAGACAGGCAGCTGGAAAAGGACTGGAGTGGATCGCTATTATCTACAGCAGTGGTAGCGTCTTCTACTCTCAGTCAGTCCAAGGCCGGTTCTCCATCTCCAGAGAGAACAGCAGAAATCAGGTGTATCTGCAGATGAACACTCTGACCTCTGAGGATTCTGCTGTTTATTATTGTGCTCGAGAGC')

In [13]:
# Check if there is a repetition

In [14]:
df_V_ref_genome[df_V_ref_genome['value'] == str_V_IGHV2_03__01A]

Unnamed: 0_level_0,name,value,len,num_no_ACGT,family
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [15]:
df_V_ref_genome[df_V_ref_genome['value'] == str_V_IGHV2_03__01AA]

Unnamed: 0_level_0,name,value,len,num_no_ACGT,family
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [16]:
df_V_ref_genome.loc[8]

name                                                 IGHV2-03*01
value          GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...
len                                                          295
num_no_ACGT                                                    1
family                                                     IGHV2
Name: 8, dtype: object

In [17]:
# Now add it to the main dataframe

In [18]:
df_V_ref_genome.loc[23] = ['IGHV2-03*01A',  str_V_IGHV2_03__01A, len(str_V_IGHV2_03__01A), 0, 'IGHV2']
df_V_ref_genome.loc[24] = ['IGHV2-03*01AA',  str_V_IGHV2_03__01AA, len(str_V_IGHV2_03__01AA), 0, 'IGHV2']

In [19]:
df_V_ref_genome

Unnamed: 0_level_0,name,value,len,num_no_ACGT,family
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,IGHV1-01*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,289,0,IGHV1
1,IGHV1-02*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,292,0,IGHV1
2,IGHV2-01*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...,295,0,IGHV2
3,IGHV1-03*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,275,0,IGHV1
4,IGHV3-01*01,AGTATTGATCTGATCCAGCCAGCCTCTAAAGCTGTGCAGCCTGGAC...,301,0,IGHV3
5,IGHV2-02*01,GGTCAGACTCTGACTCAGTCTGAACCAGTGGTTAAAAGACCAGGAG...,298,0,IGHV2
6,IGHV4-01*01,TAAACTTGATCTGAAGCAGCGCTGAAACGACCTGGAGAATCCCACA...,272,0,IGHV4
7,IGHV1-04*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,292,0,IGHV1
8,IGHV2-03*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...,295,1,IGHV2
9,IGHV1-05*01,TGTGAACAGCTGACTCAACCAGCCTCTGTGACTGTGCAGCCAGCTC...,289,0,IGHV1


In [20]:
# Therefore create new 

In [21]:
# id = 10, IGHV5-01*01
# Bifurcate

In [22]:
# W -> A or T
str_gene_orig_id_10 = df_V_ref_genome.loc[10, ('value')]
str_V_IGHV5_01__01A = str_gene_orig_id_10.replace('W', 'T')
str_V_IGHV5_01__01AA = str_gene_orig_id_10.replace('W', 'A')
str_V_IGHV5_01__01A, str_V_IGHV5_01__01AA

('GGTCAGTCCCTCACCTCCTCTGAGCCAGTGGTCCACAGAGCAGGAGAGCCAGCCTCTCTGTCCTGTCAAGTACAAGGACTTCCTCTCGCCTGGCTGCACTGGATTCGTCAAAAACCAGGAAAAGGACTGGAATGGATCGGTCGCATCGATGGTGGAACTGGAACAATATTTGCATTAAGTGTCCAACACCAGTTCTCAATCACCAAAGATACTTCACAGAATGTTGTGTATCTATCTGTGAAGAGTCTGAAACAGGAAGACTCTGCTGTGTATTATTGTGCTCGAGAGC',
 'GGTCAGTCCCTCACCTCCTCAGAGCCAGTGGTCCACAGAGCAGGAGAGCCAGCCTCTCTGTCCTGTCAAGTACAAGGACTTCCTCTCGCCTGGCTGCACTGGATTCGTCAAAAACCAGGAAAAGGACTGGAATGGATCGGTCGCATCGATGGTGGAACTGGAACAATATTTGCATTAAGTGTCCAACACCAGTTCTCAATCACCAAAGATACTTCACAGAATGTTGTGTATCTATCTGTGAAGAGTCTGAAACAGGAAGACTCTGCTGTGTATTATTGTGCTCGAGAGC')

In [23]:
df_V_ref_genome[df_V_ref_genome['family'] == 'IGHV5']['value']

id
10    GGTCAGTCCCTCACCTCCTCWGAGCCAGTGGTCCACAGAGCAGGAG...
19    GGTCAGTCCCTCACCTCCTCAGAGCCAGTGGTCCACAGAGCAGGAG...
Name: value, dtype: object

In [24]:
df_V_ref_genome[df_V_ref_genome['value'] == str_V_IGHV5_01__01A]

Unnamed: 0_level_0,name,value,len,num_no_ACGT,family
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [25]:
print(df_V_ref_genome.loc[19]['value'])
print(str_V_IGHV5_01__01AA)
p3.str_seq_to_np_seq(df_V_ref_genome.loc[19]['value']) - p3.str_seq_to_np_seq(str_V_IGHV5_01__01A)

GGTCAGTCCCTCACCTCCTCAGAGCCAGTGGTCCACAGAGCAGGAGAGCCAGCCTCTCTGTCCTGTCAAGTACAAGGACTTCCTCTCGCCTGGCTGCACTGGATTCGTCAAAAACCAGGAAAAGGACTGGAATGGATCGGTCGCATCGATGGTGGAACTGGAACAATATTTGCATCAAGTGTCCAACACCAGTTCTCAATCACCAAAGATACTTCACAGAATGTTGTGTATCTATCTGTGAAGAGTCTGAAACAGGAAGACTCTGCTGTGTATTATTGTGCCAGAGAGC
GGTCAGTCCCTCACCTCCTCAGAGCCAGTGGTCCACAGAGCAGGAGAGCCAGCCTCTCTGTCCTGTCAAGTACAAGGACTTCCTCTCGCCTGGCTGCACTGGATTCGTCAAAAACCAGGAAAAGGACTGGAATGGATCGGTCGCATCGATGGTGGAACTGGAACAATATTTGCATTAAGTGTCCAACACCAGTTCTCAATCACCAAAGATACTTCACAGAATGTTGTGTATCTATCTGTGAAGAGTCTGAAACAGGAAGACTCTGCTGTGTATTATTGTGCTCGAGAGC


array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0, -3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0, -2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [26]:
df_V_ref_genome.loc[25] = ['IGHV5-01*01A',  str_V_IGHV5_01__01A, len(str_V_IGHV5_01__01A), 0, 'IGHV5']
df_V_ref_genome.loc[26] = ['IGHV5-01*01AA',  str_V_IGHV5_01__01AA, len(str_V_IGHV5_01__01AA), 0, 'IGHV5']

In [27]:
df_V_ref_genome = df_V_ref_genome.drop([8,10])
df_V_ref_genome

Unnamed: 0_level_0,name,value,len,num_no_ACGT,family
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,IGHV1-01*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,289,0,IGHV1
1,IGHV1-02*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,292,0,IGHV1
2,IGHV2-01*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...,295,0,IGHV2
3,IGHV1-03*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,275,0,IGHV1
4,IGHV3-01*01,AGTATTGATCTGATCCAGCCAGCCTCTAAAGCTGTGCAGCCTGGAC...,301,0,IGHV3
5,IGHV2-02*01,GGTCAGACTCTGACTCAGTCTGAACCAGTGGTTAAAAGACCAGGAG...,298,0,IGHV2
6,IGHV4-01*01,TAAACTTGATCTGAAGCAGCGCTGAAACGACCTGGAGAATCCCACA...,272,0,IGHV4
7,IGHV1-04*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,292,0,IGHV1
9,IGHV1-05*01,TGTGAACAGCTGACTCAACCAGCCTCTGTGACTGTGCAGCCAGCTC...,289,0,IGHV1
11,IGHV2-04*01,GGCCAGACTCTGACAGAATCTGAACCAGCGGTTAGAAGACCTGGAG...,295,0,IGHV2


In [28]:
df_V_ref_genome_new = df_V_ref_genome.reset_index().drop(columns=['id', 'len', 'num_no_ACGT', 'family'])
df_V_ref_genome_new.index.name = 'id'
df_V_ref_genome_new

Unnamed: 0_level_0,name,value
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,IGHV1-01*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...
1,IGHV1-02*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...
2,IGHV2-01*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...
3,IGHV1-03*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...
4,IGHV3-01*01,AGTATTGATCTGATCCAGCCAGCCTCTAAAGCTGTGCAGCCTGGAC...
5,IGHV2-02*01,GGTCAGACTCTGACTCAGTCTGAACCAGTGGTTAAAAGACCAGGAG...
6,IGHV4-01*01,TAAACTTGATCTGAAGCAGCGCTGAAACGACCTGGAGAATCCCACA...
7,IGHV1-04*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...
8,IGHV1-05*01,TGTGAACAGCTGACTCAACCAGCCTCTGTGACTGTGCAGCCAGCTC...
9,IGHV2-04*01,GGCCAGACTCTGACAGAATCTGAACCAGCGGTTAGAAGACCTGGAG...


In [29]:
# write it in fasta file
p3.write_ref_genome_files_from_dataframe(df_V_ref_genome_new, 'v_genes_to_multiple_alignment.fasta')

#### Get anchors positions

In [30]:
# Take a look to the amino acid sequences of these genes.
# df_V_ref_genome_new['aa_seq'] = df_V_ref_genome_new['value'].apply(lambda x: p3.dna_translate(x) )
# df_V_ref_genome_new['aa_seq'].apply(lambda x: x[-20:])
df_V_ref_genome_new['aa_seq'] = df_V_ref_genome_new['value'].apply(lambda x: p3.dna_translate(x) ).apply(lambda x: x[-20:])
df_V_ref_genome_new



Unnamed: 0_level_0,name,value,aa_seq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,IGHV1-01*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,TLNGQNMQPEDSAVYYCARE
1,IGHV1-02*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,TLNGQNMQPEDSAVYYCTRR
2,IGHV2-01*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...,YLQMNTLTSEDSAVYYCARE
3,IGHV1-03*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,VTLNGQNMQPEDSAVYYCAR
4,IGHV3-01*01,AGTATTGATCTGATCCAGCCAGCCTCTAAAGCTGTGCAGCCTGGAC...,TITGQNLQPEDTAVYYCVRY
5,IGHV2-02*01,GGTCAGACTCTGACTCAGTCTGAACCAGTGGTTAAAAGACCAGGAG...,FLQMNSLKPEDSAVYYCARE
6,IGHV4-01*01,TAAACTTGATCTGAAGCAGCGCTGAAACGACCTGGAGAATCCCACA...,SSVSADEQLEDSVVYYCARR
7,IGHV1-04*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,TLNGQNLQHEDSAVYYCARE
8,IGHV1-05*01,TGTGAACAGCTGACTCAACCAGCCTCTGTGACTGTGCAGCCAGCTC...,NLIGQNMQPEDSAVYYCARE
9,IGHV2-04*01,GGCCAGACTCTGACAGAATCTGAACCAGCGGTTAGAAGACCTGGAG...,FLQMNSLKTEDSAVYYCARE


In [31]:
# Function to get the last Cysteine nucleotide position to define the CDR3 2ndC anchor (JUNCTION imgt label)
def getCystineAnchor(str_nt_seq):
    aa_seq = p3.dna_translate(str_nt_seq)
    np_aa_seq = np.array(list(aa_seq))
    iii = np.argwhere(np_aa_seq == 'C')
    return int(3*iii[-1])

eee = getCystineAnchor(df_V_ref_genome_new['value'].loc[0])
df_V_ref_genome_new['value'].loc[0][eee:]



'TGTGCCAGAGAGC'

In [32]:
df_V_ref_genome_new['anchor_index'] = df_V_ref_genome_new['value'].apply(lambda x: getCystineAnchor(x))
df_V_ref_genome_new.drop(columns=['aa_seq'], inplace=True)
df_V_ref_genome_new

Unnamed: 0_level_0,name,value,anchor_index
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,IGHV1-01*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,276
1,IGHV1-02*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,279
2,IGHV2-01*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...,282
3,IGHV1-03*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,264
4,IGHV3-01*01,AGTATTGATCTGATCCAGCCAGCCTCTAAAGCTGTGCAGCCTGGAC...,288
5,IGHV2-02*01,GGTCAGACTCTGACTCAGTCTGAACCAGTGGTTAAAAGACCAGGAG...,285
6,IGHV4-01*01,TAAACTTGATCTGAAGCAGCGCTGAAACGACCTGGAGAATCCCACA...,258
7,IGHV1-04*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,279
8,IGHV1-05*01,TGTGAACAGCTGACTCAACCAGCCTCTGTGACTGTGCAGCCAGCTC...,276
9,IGHV2-04*01,GGCCAGACTCTGACAGAATCTGAACCAGCGGTTAGAAGACCTGGAG...,282


In [33]:
p3.dna_translate( df_V_ref_genome_new['value'].loc[0][ df_V_ref_genome_new['anchor_index'].loc[0]: ] )



'CARE'

In [34]:
# Get functionality from row
def get_V_Functionality(row):
    str_nt_V_gene_segment = row['value'][:row['anchor_index']]
    str_aa_V_gene_segment = p3.dna_translate(str_nt_V_gene_segment)
    # print(str_aa_V_gene_segment, len(str_nt_V_gene_segment), len(str_nt_V_gene_segment) % 3 )
    if '*' in list( str_aa_V_gene_segment ):
        return 'P'
    else:
        return 'F'

In [35]:
df_V_ref_genome_new['function'] = df_V_ref_genome_new.apply(lambda row: get_V_Functionality(row), axis=1)
df_V_ref_genome_new

Unnamed: 0_level_0,name,value,anchor_index,function
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,IGHV1-01*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,276,F
1,IGHV1-02*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,279,F
2,IGHV2-01*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAG...,282,F
3,IGHV1-03*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,264,F
4,IGHV3-01*01,AGTATTGATCTGATCCAGCCAGCCTCTAAAGCTGTGCAGCCTGGAC...,288,F
5,IGHV2-02*01,GGTCAGACTCTGACTCAGTCTGAACCAGTGGTTAAAAGACCAGGAG...,285,F
6,IGHV4-01*01,TAAACTTGATCTGAAGCAGCGCTGAAACGACCTGGAGAATCCCACA...,258,P
7,IGHV1-04*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTC...,279,F
8,IGHV1-05*01,TGTGAACAGCTGACTCAACCAGCCTCTGTGACTGTGCAGCCAGCTC...,276,F
9,IGHV2-04*01,GGCCAGACTCTGACAGAATCTGAACCAGCGGTTAGAAGACCTGGAG...,282,F


In [36]:
df_V_ref_genome_new['function'].value_counts()

F    22
P     3
Name: function, dtype: int64

### J genes

In [37]:
import pygor3 as p3
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [38]:
fln_J_killifish_IGH_fasta = 'Killifish_genome_template/j_no_imgt.fasta'
df_J_killifish_IGH = p3.utils.get_dataframe_from_fasta(fln_fasta=fln_J_killifish_IGH_fasta)
df_J_killifish_IGH

Unnamed: 0_level_0,name,value
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,IGHJ1-01*01,GTGCTTTAGACAACTGGGGAAAAGGAACGGAGGTTACTGTTCAACCTG
1,IGHJ1-02*21,ATGACTACTTTGACTACTGGGGAAAAGGAACAATGGTGACGGTCACATCAG
2,IGHJ1-03*22,ACCGTGGGGTAAAGGGACAACAGTCACGGTCAAAACAG
3,IGHJ1-04*01,ACGGTGCTCTTGACTACTGGGGTAAAGGGACCGCAGTCACTGTAACATCAG
4,IGHJ1-05*24,ACAACGCTTTTGACTACTGGGGAAAAGGAACAACGGTCACCGTCACTTCAG
5,IGHJ1-06*01,CTACGATGCTTTTGACTACTGGGGGAAAAGGACGATGGTCACGTCACTTCAG
6,IGHJ1-07*01,TTAACTGGGCTTTCGACTACTGGGGAAAAGGGACGATGGTAACGGTGACTTCAG
7,IGHJ1-08*27,TTACCACGCAGCTTTGGACTACTGGGGAAAAGGGACGACGGTCACCGTCACCTCAG
8,IGHJ1-09*28,TCTACGCTGCTTTTGACTACTGGGGTAAAGGTACAACGGTAACCGTTTCATCAG
9,IGHJ2-06*02,ATAACTGGGCTTTCGACTACTGGGGAAAAGGGACGATGGTAACGGTGACTTCAG


In [39]:
fln_J_killifish_IGH_fasta

'Killifish_genome_template/j_no_imgt.fasta'

In [40]:
from Bio.Align.Applications import ClustalwCommandline
cline = ClustalwCommandline("clustalw", infile=fln_J_killifish_IGH_fasta)
stdout, stderr = cline()

from Bio import AlignIO
J_killifish_IGH_align = AlignIO.read(fln_J_killifish_IGH_fasta.split('.fasta')[0]+".aln", "clustal")
# print(J_killifish_IGH_align)

df_J_killifish_IGH_copy = df_J_killifish_IGH.copy()

aln_ids = list()
aln_alnseq = list()
for aling_i in J_killifish_IGH_align:
    aln_id = (df_J_killifish_IGH_copy['name'][ df_J_killifish_IGH_copy['name'] == aling_i.description]).index[0]
    aln_ids.append(aln_id)
    aln_alnseq.append(str(aling_i.seq))
df_J_killifish_IGH_copy['aln'] = pd.Series(aln_alnseq, index=aln_ids)
# df_J_killifish_IGH_copy[['name', 'anchor_index', 'function', 'aa_seq', 'aln']].to_csv('J_mouse_TRB.csv', sep='\t')
tmp_anchor_pos_aln = df_J_killifish_IGH_copy['aln'].loc[0].find('TGG')
df_J_killifish_IGH_copy['anchor_index'] = df_J_killifish_IGH_copy['aln'].apply(lambda x: (tmp_anchor_pos_aln - x[:tmp_anchor_pos_aln].count('-')) )
# df_J_killifish_IGH_copy.apply(lambda row: len(row['value'][row['anchor_index']:]) % 3, axis=1)
df_J_killifish_IGH_copy['function'] = 'F'
df_J_killifish_IGH_copy

# df_J_killifish_IGH_copy['aln'].loc[0].count('-')

Unnamed: 0_level_0,name,value,aln,anchor_index,function
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,IGHJ1-01*01,GTGCTTTAGACAACTGGGGAAAAGGAACGGAGGTTACTGTTCAACCTG,--------GTGCTTTAGACAACTGGGGAAAAGGAACGGAGGTTACTGTTCAACCTG,14,F
1,IGHJ1-02*21,ATGACTACTTTGACTACTGGGGAAAAGGAACAATGGTGACGGTCACATCAG,-----ATGACTACTTTGACTACTGGGGAAAAGGAACAATGGTGACGGTCACATCAG,17,F
2,IGHJ1-03*22,ACCGTGGGGTAAAGGGACAACAGTCACGGTCAAAACAG,-----------------ACCG-TGGGGTAAAGGGACAACAGTCACGGTCAAAACAG,4,F
3,IGHJ1-04*01,ACGGTGCTCTTGACTACTGGGGTAAAGGGACCGCAGTCACTGTAACATCAG,-----ACGGTGCTCTTGACTACTGGGGTAAAGGGACCGCAGTCACTGTAACATCAG,17,F
4,IGHJ1-05*24,ACAACGCTTTTGACTACTGGGGAAAAGGAACAACGGTCACCGTCACTTCAG,-----ACAACGCTTTTGACTACTGGGGAAAAGGAACAACGGTCACCGTCACTTCAG,17,F
5,IGHJ1-06*01,CTACGATGCTTTTGACTACTGGGGGAAAAGGACGATGGTCACGTCACTTCAG,---CTACGATGCTTTTGACTACTGGGGGAAAAGGACGATGGTCAC-GTCACTTCAG,19,F
6,IGHJ1-07*01,TTAACTGGGCTTTCGACTACTGGGGAAAAGGGACGATGGTAACGGTGACTTCAG,--TTAACTGGGCTTTCGACTACTGGGGAAAAGGGACGATGGTAACGGTGACTTCAG,20,F
7,IGHJ1-08*27,TTACCACGCAGCTTTGGACTACTGGGGAAAAGGGACGACGGTCACCGTCACCTCAG,TTACCACGCAGCTTTGGACTACTGGGGAAAAGGGACGACGGTCACCGTCACCTCAG,22,F
8,IGHJ1-09*28,TCTACGCTGCTTTTGACTACTGGGGTAAAGGTACAACGGTAACCGTTTCATCAG,--TCTACGCTGCTTTTGACTACTGGGGTAAAGGTACAACGGTAACCGTTTCATCAG,20,F
9,IGHJ2-06*02,ATAACTGGGCTTTCGACTACTGGGGAAAAGGGACGATGGTAACGGTGACTTCAG,--ATAACTGGGCTTTCGACTACTGGGGAAAAGGGACGATGGTAACGGTGACTTCAG,20,F


In [41]:
df_J_ref_genome_new = df_J_killifish_IGH_copy[['name', 'value', 'anchor_index', 'function']]
df_J_ref_genome_new

Unnamed: 0_level_0,name,value,anchor_index,function
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,IGHJ1-01*01,GTGCTTTAGACAACTGGGGAAAAGGAACGGAGGTTACTGTTCAACCTG,14,F
1,IGHJ1-02*21,ATGACTACTTTGACTACTGGGGAAAAGGAACAATGGTGACGGTCACATCAG,17,F
2,IGHJ1-03*22,ACCGTGGGGTAAAGGGACAACAGTCACGGTCAAAACAG,4,F
3,IGHJ1-04*01,ACGGTGCTCTTGACTACTGGGGTAAAGGGACCGCAGTCACTGTAACATCAG,17,F
4,IGHJ1-05*24,ACAACGCTTTTGACTACTGGGGAAAAGGAACAACGGTCACCGTCACTTCAG,17,F
5,IGHJ1-06*01,CTACGATGCTTTTGACTACTGGGGGAAAAGGACGATGGTCACGTCACTTCAG,19,F
6,IGHJ1-07*01,TTAACTGGGCTTTCGACTACTGGGGAAAAGGGACGATGGTAACGGTGACTTCAG,20,F
7,IGHJ1-08*27,TTACCACGCAGCTTTGGACTACTGGGGAAAAGGGACGACGGTCACCGTCACCTCAG,22,F
8,IGHJ1-09*28,TCTACGCTGCTTTTGACTACTGGGGTAAAGGTACAACGGTAACCGTTTCATCAG,20,F
9,IGHJ2-06*02,ATAACTGGGCTTTCGACTACTGGGGAAAAGGGACGATGGTAACGGTGACTTCAG,20,F


In [42]:
df_V_ref_genome_new

Unnamed: 0_level_0,name,value,anchor_index,function
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,IGHV1-01*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTCAGCGTCTGACCATCAGCTGTCAGGTCTCTTATTCTCTCAGTAGCTATGGAACATCTTGGATCAGACAGCCTGAAGGAAAAGGACTGGAGTGGATCGGGTGGAAATATACTGGAGACTCTTCCTATAAAGAGTCGCTGAAGAACAAGTTCAGCATTGATTTAGACCCTTCCAGTAAAACAGTGACTCTGAATGGACAGAACATGCAGCCTGAAGACTCAGCTGTGTATTACTGTGCCAGAGAGC,276,F
1,IGHV1-02*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTCAGCGTCTGACCATCAGCTGTCAGGTCTCTTATTCTCTCAGTGGCTACTGGACACACTGGATCAGACAGCCTGCAGGAAAAGGACTGGAATGGATCGGTGAAGGATGCTGTGGAAGCTCCACTAACTACAAAGATTCTTTCAGAAACAAGTTCAGCATTTCAATGGAAACTTCCAGTAACACAGTGACTCTGAATGGACAGAACATGCAGCCTGAAGACTCTGCTGTGTATTACTGTACCAGAAGGC,279,F
2,IGHV2-01*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAGAACCCCACAGACTCACCTGTACTGGTTCTGGTTTCACACTCAGCAGCTATGGAATGGCCTGGATCAGACAGGCAGCTGGAAAAGGACTGGAGTGGATTGCTATTATCTACAGCAGTGGTAGCGTCTTCTACTCTCAGTCAGTCCAAGGCCGGTTCTCCATCTCCAGAGGGAACAGCAGAAATCAGGTGTATCTGCAGATGAACACTCTGACCTCTGAGGATTCTGCTGTTTATTATTGTGCTCGAGAGC,282,F
3,IGHV1-03*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTCAGCGTCTGACCATCAGCTGTCAGGTCTCTTATGATGTCAGCGACTACTGGACAGCTTGGATCAGACAGCCTGCAGGAAAAGAACTGGAGTGGATCAGTTCTGACGAAATCATCAAATATTCATTAAATGATACGTTCAGCGTTGATTTAGACTCTTCCAGTAACACAGTGACTCTGAATGGACAGAACATGCAGCCTGAAGACTCTGCTGTGTATTACTGTGCCAGAGC,264,F
4,IGHV3-01*01,AGTATTGATCTGATCCAGCCAGCCTCTAAAGCTGTGCAGCCTGGACAGTCTGTGACCATCACCTGTCGGCTCTCTGGTTACTCTGTGACTGATGGCTATGGAACAGGTTGGATCAGACAGAGAGAAGGAAAAGCACCAGATTATATTTTCCATATGTGGGGAAGCAATGGAGATTTCTACCAAAACGATGCTCTGAAGAACAAGTTCAGCTACAGCAGAGACACGTCTGCAGGAACAGTGACAATAACAGGACAGAACCTGCAGCCTGAAGACACAGCTGTGTATTACTGTGTGAGATACC,288,F
5,IGHV2-02*01,GGTCAGACTCTGACTCAGTCTGAACCAGTGGTTAAAAGACCAGGAGAATCGCACAAGCTGACCTGTACTGGTTCTGGTTACACATTCAGTAGTTATGCGATGGTCTGGGTCAGACAGGCTCGTGGAAAAGGACTGGAGTGGATCACCTACATCAGCAGAAGTGGTGACACTCAGTTCTACTCCCAGTCAGTTAAGGGCCGGTTCACCATCTCCAGAGACAACAACCAACAGCAGGTGTTTCTGCAGATGAACAGTCTGAAGCCTGAAGATTCTGCTGTTTATTATTGCGCTCGAGAGC,285,F
6,IGHV4-01*01,TAAACTTGATCTGAAGCAGCGCTGAAACGACCTGGAGAATCCCACACACTGACCTGGACATGTGCAGGAATATCAGATCAGCTGGATCAGACAGGCTGAAGGAAAAGTACCAGAGTGGGTCACACACATTTCTGCTACCAGTGGAACCATCATATGTTATTCTCCATCAGTGCAGAACCGCTTCACCGTTTCCAGAGACAACAACAAGGATCAAGTGTATCTGCTGATGAGCAGCTTGAAGACTCTGTAGTTTATTATTGTGCTCGAAGAGC,258,P
7,IGHV1-04*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTCAGCGTCTGACCATCAGCTGTCAGGTCTCTTATTCTCTCACCAGCTCCAGGACACACTGGATCAGACAGCCTGCAGGAAAAACACTGGAATATATCTGTAGTGCACATATTGGACATGCCACATACGTAAAAGATTCTCTCAAAAACAAGTTCAGCATTAATTTAGACTCTTCCAGTAAAACAGTGACTCTGAACGGACAGAACTTGCAGCATGAAGACTCTGCTGTGTATTACTGTGCCAGAGAGC,279,F
8,IGHV1-05*01,TGTGAACAGCTGACTCAACCAGCCTCTGTGACTGTGCAGCCAGCTCAGCGTCTGACCATCAGCTGTCAGGTCTCTTATGATGTCAGGAGGTATGCAACAGCTTGGATCAGACAGCCCGCAGGAAAAGGACTGGAATGGATCGGGTGGAAATCTGCTGGAGACTCTCGCCATAAAGAGTCACTGAAGAACAAGTTCAACATTGATTTAGACTCTTCCAGTAAGACAGTGAATCTGATTGGACAGAACATGCAGCCTGAAGACTCTGCTGTGTATTACTGTGCCAGAGAGC,276,F
9,IGHV2-04*01,GGCCAGACTCTGACAGAATCTGAACCAGCGGTTAGAAGACCTGGAGAATCCCACAGACTGACCTGTACAACATCTGGAATCTTCTTCAGCAGCTGCTGGATGGCCTGGATCAGACAGGCTCCTGGGAAAGGTCTAGAGTGGGTCGCTACTGATGCTAACGGTGGTACCAACTACTACTCTGAGTCAGTCCGAGGCAGGTTCACCGTCTCCAGAGACAACAGCAGAGAGCAGCTGTTTCTGCAGATGAACAGTCTGAAGACTGAAGATTCTGCTGTTTATTATTGTGCTCGAGAGT,282,F


In [43]:
# df_V_ref_genome_new_productiveonly

# Create a model and save it

In [44]:
mdl0 = p3.IgorModel.make_default_VDJ(df_V_ref_genome_new, df_D_ref_genome, df_J_ref_genome_new)
mdl0

<pygor3.IgorIO.IgorModel at 0x7f6e712fd690>

In [45]:
mdl0.write_mdldata_dir('mdl0_killifish')

Writing model parms in file  mdl0_killifish/models/model_parms.txt
Writing model marginals in file  mdl0_killifish/models/model_marginals.txt


In [46]:
mdl_0 = p3.IgorModel.load_from_directory('mdl0_killifish')
mdl_0.genomic_dataframe_dict['V']

Reading Parms filename from:  mdl0_killifish/models/model_parms.txt
Reading Marginals filename from:  mdl0_killifish/models/model_marginals.txt
Anchors loaded from mdl0_killifish/ref_genome/V_gene_CDR3_anchors.csv and mdl0_killifish/ref_genome/J_gene_CDR3_anchors.csv


Unnamed: 0_level_0,name,value,anchor_index,function
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,IGHV1-01*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTCAGCGTCTGACCATCAGCTGTCAGGTCTCTTATTCTCTCAGTAGCTATGGAACATCTTGGATCAGACAGCCTGAAGGAAAAGGACTGGAGTGGATCGGGTGGAAATATACTGGAGACTCTTCCTATAAAGAGTCGCTGAAGAACAAGTTCAGCATTGATTTAGACCCTTCCAGTAAAACAGTGACTCTGAATGGACAGAACATGCAGCCTGAAGACTCAGCTGTGTATTACTGTGCCAGAGAGC,276,F
1,IGHV1-02*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTCAGCGTCTGACCATCAGCTGTCAGGTCTCTTATTCTCTCAGTGGCTACTGGACACACTGGATCAGACAGCCTGCAGGAAAAGGACTGGAATGGATCGGTGAAGGATGCTGTGGAAGCTCCACTAACTACAAAGATTCTTTCAGAAACAAGTTCAGCATTTCAATGGAAACTTCCAGTAACACAGTGACTCTGAATGGACAGAACATGCAGCCTGAAGACTCTGCTGTGTATTACTGTACCAGAAGGC,279,F
2,IGHV2-01*01,GGTCAGACACTGACTGAGTCTGAACCAGTGGTTAAAAGACCTGGAGAACCCCACAGACTCACCTGTACTGGTTCTGGTTTCACACTCAGCAGCTATGGAATGGCCTGGATCAGACAGGCAGCTGGAAAAGGACTGGAGTGGATTGCTATTATCTACAGCAGTGGTAGCGTCTTCTACTCTCAGTCAGTCCAAGGCCGGTTCTCCATCTCCAGAGGGAACAGCAGAAATCAGGTGTATCTGCAGATGAACACTCTGACCTCTGAGGATTCTGCTGTTTATTATTGTGCTCGAGAGC,282,F
3,IGHV1-03*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTCAGCGTCTGACCATCAGCTGTCAGGTCTCTTATGATGTCAGCGACTACTGGACAGCTTGGATCAGACAGCCTGCAGGAAAAGAACTGGAGTGGATCAGTTCTGACGAAATCATCAAATATTCATTAAATGATACGTTCAGCGTTGATTTAGACTCTTCCAGTAACACAGTGACTCTGAATGGACAGAACATGCAGCCTGAAGACTCTGCTGTGTATTACTGTGCCAGAGC,264,F
4,IGHV3-01*01,AGTATTGATCTGATCCAGCCAGCCTCTAAAGCTGTGCAGCCTGGACAGTCTGTGACCATCACCTGTCGGCTCTCTGGTTACTCTGTGACTGATGGCTATGGAACAGGTTGGATCAGACAGAGAGAAGGAAAAGCACCAGATTATATTTTCCATATGTGGGGAAGCAATGGAGATTTCTACCAAAACGATGCTCTGAAGAACAAGTTCAGCTACAGCAGAGACACGTCTGCAGGAACAGTGACAATAACAGGACAGAACCTGCAGCCTGAAGACACAGCTGTGTATTACTGTGTGAGATACC,288,F
5,IGHV2-02*01,GGTCAGACTCTGACTCAGTCTGAACCAGTGGTTAAAAGACCAGGAGAATCGCACAAGCTGACCTGTACTGGTTCTGGTTACACATTCAGTAGTTATGCGATGGTCTGGGTCAGACAGGCTCGTGGAAAAGGACTGGAGTGGATCACCTACATCAGCAGAAGTGGTGACACTCAGTTCTACTCCCAGTCAGTTAAGGGCCGGTTCACCATCTCCAGAGACAACAACCAACAGCAGGTGTTTCTGCAGATGAACAGTCTGAAGCCTGAAGATTCTGCTGTTTATTATTGCGCTCGAGAGC,285,F
6,IGHV4-01*01,TAAACTTGATCTGAAGCAGCGCTGAAACGACCTGGAGAATCCCACACACTGACCTGGACATGTGCAGGAATATCAGATCAGCTGGATCAGACAGGCTGAAGGAAAAGTACCAGAGTGGGTCACACACATTTCTGCTACCAGTGGAACCATCATATGTTATTCTCCATCAGTGCAGAACCGCTTCACCGTTTCCAGAGACAACAACAAGGATCAAGTGTATCTGCTGATGAGCAGCTTGAAGACTCTGTAGTTTATTATTGTGCTCGAAGAGC,258,P
7,IGHV1-04*01,TGTGAACAGCTGACTCAGCCAGCCTCTGTGACTGTGCAGCCAGGTCAGCGTCTGACCATCAGCTGTCAGGTCTCTTATTCTCTCACCAGCTCCAGGACACACTGGATCAGACAGCCTGCAGGAAAAACACTGGAATATATCTGTAGTGCACATATTGGACATGCCACATACGTAAAAGATTCTCTCAAAAACAAGTTCAGCATTAATTTAGACTCTTCCAGTAAAACAGTGACTCTGAACGGACAGAACTTGCAGCATGAAGACTCTGCTGTGTATTACTGTGCCAGAGAGC,279,F
8,IGHV1-05*01,TGTGAACAGCTGACTCAACCAGCCTCTGTGACTGTGCAGCCAGCTCAGCGTCTGACCATCAGCTGTCAGGTCTCTTATGATGTCAGGAGGTATGCAACAGCTTGGATCAGACAGCCCGCAGGAAAAGGACTGGAATGGATCGGGTGGAAATCTGCTGGAGACTCTCGCCATAAAGAGTCACTGAAGAACAAGTTCAACATTGATTTAGACTCTTCCAGTAAGACAGTGAATCTGATTGGACAGAACATGCAGCCTGAAGACTCTGCTGTGTATTACTGTGCCAGAGAGC,276,F
9,IGHV2-04*01,GGCCAGACTCTGACAGAATCTGAACCAGCGGTTAGAAGACCTGGAGAATCCCACAGACTGACCTGTACAACATCTGGAATCTTCTTCAGCAGCTGCTGGATGGCCTGGATCAGACAGGCTCCTGGGAAAGGTCTAGAGTGGGTCGCTACTGATGCTAACGGTGGTACCAACTACTACTCTGAGTCAGTCCGAGGCAGGTTCACCGTCTCCAGAGACAACAGCAGAGAGCAGCTGTTTCTGCAGATGAACAGTCTGAAGACTGAAGATTCTGCTGTTTATTATTGTGCTCGAGAGT,282,F


In [47]:
# Manually copy the functions

In [48]:
# W = Adenine / Thymine
# Y = Cytosine / Thymine (pyrimidine)