In [3]:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

sequence1 = "MKQLEDKVEELLSKNYHLENEVARLKKLV"
sequence2 = "MKQLEDKVEELLSKNYHLENEVARLKKLVGER"

alignments = pairwise2.align.globalxx(sequence1, sequence2)

for alignment in alignments:
    print(format_alignment(*alignment))


MKQLEDKVEELLSKNYHLENEVARLKKLV---
|||||||||||||||||||||||||||||   
MKQLEDKVEELLSKNYHLENEVARLKKLVGER
  Score=29





In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("notebook", font_scale=1.4)

In [3]:
sequence = pd.read_csv("sequences.csv")
sequence

Unnamed: 0,seq_id;specie;sequence
0,5kel|ag;Zaire ebolavirus (strain Mayinga-76) (...
1,5kel|ag;Zaire ebolavirus (128952);EAIVNAQPKCNP...
2,5kel|ab;Homo sapiens (9606);EVQLQESGGGLMQPGGSM...
3,5kel|ab;Homo sapiens (9606);DIQMTQSPASLSVSVGET...
4,5kel|ab;Homo sapiens (9606);DVKLLESGGGLVQPGGSL...
...,...
3028,6jht|ab;Human hepatitis A virus Hu/Australia/H...
3029,6jht|ab;Human hepatitis A virus Hu/Australia/H...
3030,5w1m|ab;Homo sapiens (9606);DIVMTQSPESLAVSLGER...
3031,5w1m|ab;Homo sapiens (9606);VQLVESGGGVVHPGRSLR...


In [4]:
sequence.columns

Index(['seq_id;specie;sequence'], dtype='object')

In [5]:
sequence[['seq_id', 'species', 'sequence']] = sequence['seq_id;specie;sequence'].str.split(';', expand=True)
sequence


Unnamed: 0,seq_id;specie;sequence,seq_id,species,sequence
0,5kel|ag;Zaire ebolavirus (strain Mayinga-76) (...,5kel|ag,Zaire ebolavirus (strain Mayinga-76) (128952),IPLGVIHNSTLQVSDVDKLVCRDKLSSTNQLRSVGLNLEGNGVATD...
1,5kel|ag;Zaire ebolavirus (128952);EAIVNAQPKCNP...,5kel|ag,Zaire ebolavirus (128952),EAIVNAQPKCNPNLHYWTTQDEGAAIGLAWIPYFGPAAEGIYTEGL...
2,5kel|ab;Homo sapiens (9606);EVQLQESGGGLMQPGGSM...,5kel|ab,Homo sapiens (9606),EVQLQESGGGLMQPGGSMKLSCVASGFTFSNYWMNWVRQSPEKGLE...
3,5kel|ab;Homo sapiens (9606);DIQMTQSPASLSVSVGET...,5kel|ab,Homo sapiens (9606),DIQMTQSPASLSVSVGETVSITCRASENIYSSLAWYQQKQGKSPQL...
4,5kel|ab;Homo sapiens (9606);DVKLLESGGGLVQPGGSL...,5kel|ab,Homo sapiens (9606),DVKLLESGGGLVQPGGSLKLSCAASGFSLSTSGVGVGWFRQPSGKG...
...,...,...,...,...
3028,6jht|ab;Human hepatitis A virus Hu/Australia/H...,6jht|ab,Human hepatitis A virus Hu/Australia/HM175/197...,DIVLTQSPAIMSASPGEKVTMTCSATSGLSYIHWYQQKSGTSPKRW...
3029,6jht|ab;Human hepatitis A virus Hu/Australia/H...,6jht|ab,Human hepatitis A virus Hu/Australia/HM175/197...,EVKLVESGGGLVKPGGSLKLSCAASAFTITTYGMSWVRQTPEKRLE...
3030,5w1m|ab;Homo sapiens (9606);DIVMTQSPESLAVSLGER...,5w1m|ab,Homo sapiens (9606),DIVMTQSPESLAVSLGERATINCKSSQSVLYSSRSDNKDYLAWYQQ...
3031,5w1m|ab;Homo sapiens (9606);VQLVESGGGVVHPGRSLR...,5w1m|ab,Homo sapiens (9606),VQLVESGGGVVHPGRSLRLSCAASGFTFGTSIMHWVRQAPGKGMQW...


In [6]:
sequence = sequence.drop(columns=['seq_id;specie;sequence'])


In [7]:
sequence['sequence']

0       IPLGVIHNSTLQVSDVDKLVCRDKLSSTNQLRSVGLNLEGNGVATD...
1       EAIVNAQPKCNPNLHYWTTQDEGAAIGLAWIPYFGPAAEGIYTEGL...
2       EVQLQESGGGLMQPGGSMKLSCVASGFTFSNYWMNWVRQSPEKGLE...
3       DIQMTQSPASLSVSVGETVSITCRASENIYSSLAWYQQKQGKSPQL...
4       DVKLLESGGGLVQPGGSLKLSCAASGFSLSTSGVGVGWFRQPSGKG...
                              ...                        
3028    DIVLTQSPAIMSASPGEKVTMTCSATSGLSYIHWYQQKSGTSPKRW...
3029    EVKLVESGGGLVKPGGSLKLSCAASAFTITTYGMSWVRQTPEKRLE...
3030    DIVMTQSPESLAVSLGERATINCKSSQSVLYSSRSDNKDYLAWYQQ...
3031    VQLVESGGGVVHPGRSLRLSCAASGFTFGTSIMHWVRQAPGKGMQW...
3032    ELPSLCMLNNSFYYMRGGVNTFLIRVSDISVLMKEYDVSIYEPEDL...
Name: sequence, Length: 3033, dtype: object

In [36]:
pep='ALDFEQEMT'

In [15]:

# read the matrix a csv file on github
nlf = pd.read_csv('https://raw.githubusercontent.com/dmnfarrell/epitopepredict/master/epitopepredict/mhcdata/NLF.csv', index_col=0)

def show_matrix(m):
    #display a matrix
    cm = sns.light_palette("seagreen", as_cmap=True)
    display(m.style.background_gradient(cmap=cm))

def nlf_encode(seq, nlf_df):
    encoded_sequences = []
    for amino_acid in seq:
        try:
            # Access the column corresponding to the amino acid
            encoded_value = nlf_df[amino_acid].values
            encoded_sequences.append(encoded_value)
        except KeyError:
            # Handle the case where the amino acid is not a column in nlf_df
            print(f"Encoding for amino acid '{amino_acid}' not found.")
            # Appending a vector of zeros with the same length as the encoding
            encoded_sequences.append(np.zeros((nlf_df.shape[0],)))
    
    sequence_vector = np.concatenate(encoded_sequences)
    return sequence_vector

# You will need to load your sequence dataframe here
# sequence_df = pd.read_csv('your_sequence_file.csv')

# Assuming the sequence dataframe has a column 'sequence' that contains the sequences
sequence['encoded'] = sequence['sequence'].apply(lambda x: nlf_encode(x, nlf))





Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid 'X' not found.
Encoding for amino acid '

In [10]:
nlf

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V
1,0.42,1.65,1.68,0.81,2.7,1.71,1.56,1.32,0.13,1.52,1.29,2.03,1.72,2.37,1.41,1.47,0.3,2.83,0.7,1.33
2,2.07,1.4,0.3,0.13,0.32,1.11,0.48,2.05,1.5,0.45,1.21,0.26,0.85,0.23,0.27,1.11,0.68,1.79,0.95,1.39
3,0.67,0.01,0.49,1.36,1.19,0.08,0.87,0.6,1.22,0.39,0.25,1.22,0.34,0.09,1.09,0.27,0.88,0.16,0.36,0.15
4,0.01,0.88,0.15,0.63,1.37,0.15,0.02,0.31,0.52,0.36,0.96,0.98,0.44,0.37,0.77,0.13,0.23,0.14,0.6,0.4
5,1.1,0.08,0.09,0.15,0.04,0.11,0.07,0.61,1.14,0.01,0.18,0.05,0.01,0.19,0.87,0.15,0.1,0.42,0.09,0.04
6,0.32,0.07,0.59,0.1,0.18,0.45,0.13,0.58,0.45,0.55,0.06,0.32,0.8,0.04,0.33,0.22,0.23,0.84,0.06,0.27
7,0.2,0.6,0.06,0.45,0.64,0.11,0.22,0.0,0.13,0.06,0.04,0.1,0.16,0.03,0.04,0.09,0.03,0.13,0.55,0.07
8,0.09,0.53,0.02,0.31,0.21,0.08,0.15,0.3,0.04,0.1,0.0,0.73,0.05,0.06,0.27,0.05,0.01,0.06,0.01,0.12
9,0.2,0.1,0.14,0.1,0.26,0.02,0.09,0.44,0.1,0.02,0.09,0.11,0.05,0.14,0.43,0.05,0.14,0.04,0.28,0.1
10,0.09,0.01,0.0,0.03,0.35,0.25,0.1,0.14,0.07,0.08,0.26,0.19,0.3,0.14,0.06,0.14,0.16,0.18,0.17,0.06


In [45]:
nlf.to_csv(index=False)

'A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V\r\n0.42,1.65,1.68,0.81,2.7,1.71,1.56,1.32,0.13,1.52,1.29,2.03,1.72,2.37,1.41,1.47,0.3,2.83,0.7,1.33\r\n2.07,1.4,0.3,0.13,0.32,1.11,0.48,2.05,1.5,0.45,1.21,0.26,0.85,0.23,0.27,1.11,0.68,1.79,0.95,1.39\r\n0.67,0.01,0.49,1.36,1.19,0.08,0.87,0.6,1.22,0.39,0.25,1.22,0.34,0.09,1.09,0.27,0.88,0.16,0.36,0.15\r\n0.01,0.88,0.15,0.63,1.37,0.15,0.02,0.31,0.52,0.36,0.96,0.98,0.44,0.37,0.77,0.13,0.23,0.14,0.6,0.4\r\n1.1,0.08,0.09,0.15,0.04,0.11,0.07,0.61,1.14,0.01,0.18,0.05,0.01,0.19,0.87,0.15,0.1,0.42,0.09,0.04\r\n0.32,0.07,0.59,0.1,0.18,0.45,0.13,0.58,0.45,0.55,0.06,0.32,0.8,0.04,0.33,0.22,0.23,0.84,0.06,0.27\r\n0.2,0.6,0.06,0.45,0.64,0.11,0.22,0.0,0.13,0.06,0.04,0.1,0.16,0.03,0.04,0.09,0.03,0.13,0.55,0.07\r\n0.09,0.53,0.02,0.31,0.21,0.08,0.15,0.3,0.04,0.1,0.0,0.73,0.05,0.06,0.27,0.05,0.01,0.06,0.01,0.12\r\n0.2,0.1,0.14,0.1,0.26,0.02,0.09,0.44,0.1,0.02,0.09,0.11,0.05,0.14,0.43,0.05,0.14,0.04,0.28,0.1\r\n0.09,0.01,0.0,0.03,0.35,0.25,0.1,0.14,0.07,0.08,0.26,0

In [16]:
sequence

Unnamed: 0,seq_id,species,sequence,encoded
0,5kel|ag,Zaire ebolavirus (strain Mayinga-76) (128952),IPLGVIHNSTLQVSDVDKLVCRDKLSSTNQLRSVGLNLEGNGVATD...,"[1.52, 0.45, 0.39, 0.36, 0.01, 0.55, 0.06, 0.1..."
1,5kel|ag,Zaire ebolavirus (128952),EAIVNAQPKCNPNLHYWTTQDEGAAIGLAWIPYFGPAAEGIYTEGL...,"[1.56, 0.48, 0.87, 0.02, 0.07, 0.13, 0.22, 0.1..."
2,5kel|ab,Homo sapiens (9606),EVQLQESGGGLMQPGGSMKLSCVASGFTFSNYWMNWVRQSPEKGLE...,"[1.56, 0.48, 0.87, 0.02, 0.07, 0.13, 0.22, 0.1..."
3,5kel|ab,Homo sapiens (9606),DIQMTQSPASLSVSVGETVSITCRASENIYSSLAWYQQKQGKSPQL...,"[0.81, 0.13, 1.36, 0.63, 0.15, 0.1, 0.45, 0.31..."
4,5kel|ab,Homo sapiens (9606),DVKLLESGGGLVQPGGSLKLSCAASGFSLSTSGVGVGWFRQPSGKG...,"[0.81, 0.13, 1.36, 0.63, 0.15, 0.1, 0.45, 0.31..."
...,...,...,...,...
3028,6jht|ab,Human hepatitis A virus Hu/Australia/HM175/197...,DIVLTQSPAIMSASPGEKVTMTCSATSGLSYIHWYQQKSGTSPKRW...,"[0.81, 0.13, 1.36, 0.63, 0.15, 0.1, 0.45, 0.31..."
3029,6jht|ab,Human hepatitis A virus Hu/Australia/HM175/197...,EVKLVESGGGLVKPGGSLKLSCAASAFTITTYGMSWVRQTPEKRLE...,"[1.56, 0.48, 0.87, 0.02, 0.07, 0.13, 0.22, 0.1..."
3030,5w1m|ab,Homo sapiens (9606),DIVMTQSPESLAVSLGERATINCKSSQSVLYSSRSDNKDYLAWYQQ...,"[0.81, 0.13, 1.36, 0.63, 0.15, 0.1, 0.45, 0.31..."
3031,5w1m|ab,Homo sapiens (9606),VQLVESGGGVVHPGRSLRLSCAASGFTFGTSIMHWVRQAPGKGMQW...,"[1.33, 1.39, 0.15, 0.4, 0.04, 0.27, 0.07, 0.12..."


In [17]:
# Splitting the 'species' column into two columns: 'species_name' and 'strain_identifier'
sequence[['species_name', 'strain_identifier']] = sequence['species'].str.split(' \(', expand=True)


sequence['strain_identifier'] = sequence['strain_identifier'].str.replace(')', '')

# Rename columns 
sequence = sequence.rename(columns={'seq_id': 'sequence_id'})

sequence

ValueError: Columns must be same length as key