In [1]:
import pandas as pd 

In [3]:
from collections import Counter

def read_fasta(file_path):
    """
    Reads a FASTA file and returns a list of sequences.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    sequences = []
    seq = ''
    for line in lines:
        if line.startswith('>'):
            if seq:
                sequences.append(seq)
                seq = ''
        else:
            seq += line.strip()
    if seq:
        sequences.append(seq)
    
    return sequences

def consensus_sequence(sequences):
    """
    Generates a consensus sequence from a list of aligned sequences.
    """
    consensus = ''
    for i in range(len(sequences[0])):
        column = [seq[i] for seq in sequences]
        most_common = Counter(column).most_common(1)[0][0]
        consensus += most_common if most_common != '-' else '-'  # Replacing gaps with 'X'
    
    return consensus

# Read the sequences from the FASTA file
sequences = read_fasta('./FabB/FabB_1line_MSA.fa')

# Generate the consensus sequence
consensus_seq = consensus_sequence(sequences)
print(sequences)

print(consensus_seq)


['--MRRVVVTGLGIVSSIGNNAAEVLAALKSGTSGIEACPEMAEHGFRSQVAGTLK--INVA---NHVDKRTLRFMGPGAAYAHIAMAEAIADAGLEESDIV-NPRTGLVAGSGGPSTSAILTAHQTVLKTGATKRIGPFAVPKCMSSTISANLATAYKIRGINYSITSACSTSLHCIGNAAEQIMMGKQDVMFAGGGEELDWTLSCLFDAMGAMSSKYNDAPETASRAFDADRDGFVISGGGGILVLEDLEHAQARGAKIYAEVTGYAATSDGHDMVAPSGEGGERAMRLALQSLPEGRKVSYINAHGTSTPVGDVGEVEAVRRVFGQ-GSTPPISSTKSMTGHAQGAAGALEAIFSLLMLDNDFIARSINVQTLDPALDASEIALETVHNAGLDSVMTNSFGFGGTNGSMILSKFKG---', '--MRRVVVTGLGIVSSIGNNAEEVLASLKAGKSGITANEDMKEYGFRSQVAGAVN--IDIK---SHVDKRALRFMGPGAAYAYIAMGQAIADSGLEESDVV-NPRTGLIAGSGGPSTSAMLTAHQSVLKTMSTKRVGPFAVPKCMASTISANLATAYQIKGINYSITSACSTSLHCIGSASEQIMMGKQDVMFAGGGEELDWTLSCLFDAMGAMSSKFNDTPDKASRAFDANRDGFVISGGGAVLVLEELEHAKARGAKIYAEVTGFAATSDGADMVAPSGEGGERAMRLALQTLEEGRKVGYINAHGTSTPVGDVGEVEAVRRVFGE-GNTPVISSTKSMTGHSQGATGAQEAVYCLLALENDFIIPSINVETLDPAIHEGEIATKLVENAGLDTVMTNSFGFGGTNGSMLLSKYHG---', '--MRRVVVTGLGVVSSIGNNAEEVLASLKAGKSGIRANEAMAEHGFRSQIAGDLK--IDVA---EHVDKRTLRFMGPGAAYAHIALSQAIADAGLEESDVV-NPRTGVVAGSGGPSTSAMFAAHQTVLKTGATKRIGPFAVPKCMSST

In [2]:
#run this if you want to see ties and positions of ties in the consensus sequence 

def consensus_sequence_with_ties_and_gaps(sequences):
    """
    Generates a consensus sequence from a list of aligned sequences.
    Marks ties with '#' and gaps with 'X'.
    """
    consensus = ''
    tie_positions = {}
    for i in range(len(sequences[0])):
        column = [seq[i] for seq in sequences]
        freqs = Counter(column)
        most_common = freqs.most_common()

        # Check for ties
        if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
            tied_residues = [residue for residue, count in most_common if count == most_common[0][1]]
            tie_positions[i] = tied_residues
            consensus += '#'  # Placeholder for tie
        else:
            consensus += most_common[0][0] if most_common[0][0] != '-' else 'X'  # Replacing gaps with 'X'
    
    return consensus, tie_positions

# Generate the consensus sequence with tie detection and gap representation
consensus_seq_with_ties_gaps, tie_positions = consensus_sequence_with_ties_and_gaps(sequences)

# Displaying the tie information
tie_positions, consensus_seq_with_ties_gaps


({382: ['P', 'N'], 394: ['N', 'T']},
 'XXMRRVVITGLGIVSSIGNNKQEVLASLKEGRSGITFSEEFAEMGMRSQVAGNVKXXLDPAXXXELIDRKVLRFMGDAAAYAYLSMQQAIADAGLTEEQVSXNPRTGLIAGSGGGSSRNQVEAADILRXXRGVKRVGPYAVTKTMASTVSACLATPFKIKGVNYSISSACATSAHCIGNAVEQIQLGKQDIVFAGGGEELHWELSCEFDAMGALSTKYNDTPEKASRAYDANRDGFVIAGGGGMVVVEELEHALARGAKIYAEIVGYGATSDGYDMVAPSGEGAVRCMKQAMATVDXXGPIDYINTHGTSTPVGDVKELEAIREVFGDXXNTPAISSTKSMTGHSLGAAGVQEAIYSLLMLENGFIAPSINIEELDEQAEGM#IVTERTXEAEL#TVMSNSFGFGGTNATLVFRKYNGXXX')

In [None]:
XXMRRVVITGLGIVSSIGNNKQEVLASLKEGRSGITFSEEFAEMGMRSQVAGNVKXXLDPAXXXELIDRKVLRFMGDAAAYAYLSMQQAIADAGLTEEQVSXNPRTGLIAGSGGGSSRNQVEAADILRXXRGVKRVGPYAVTKTMASTVSACLATPFKIKGVNYSISSACATSAHCIGNAVEQIQLGKQDIVFAGGGEELHWELSCEFDAMGALSTKYNDTPEKASRAYDANRDGFVIAGGGGMVVVEELEHALARGAKIYAEIVGYGATSDGYDMVAPSGEGAVRCMKQAMATVDXXGPIDYINTHGTSTPVGDVKELEAIREVFGDXXNTPAISSTKSMTGHSLGAAGVQEAIYSLLMLENGFIAPSINIEELDEQAEGMNIVTERTXEAELNTVMSNSFGFGGTNATLVFRKYNGXXX

echo 'XXMRRVVITGLGIVSSIGNNKQEVLASLKEGRSGITFSEEFAEMGMRSQVAGNVKXXLDPAXXXELIDRKVLRFMGDAAAYAYLSMQQAIADAGLTEEQVSXNPRTGLIAGSGGGSSRNQVEAADILRXXRGVKRVGPYAVTKTMASTVSACLATPFKIKGVNYSISSACATSAHCIGNAVEQIQLGKQDIVFAGGGEELHWELSCEFDAMGALSTKYNDTPEKASRAYDANRDGFVIAGGGGMVVVEELEHALARGAKIYAEIVGYGATSDGYDMVAPSGEGAVRCMKQAMATVDXXGPIDYINTHGTSTPVGDVKELEAIREVFGDXXNTPAISSTKSMTGHSLGAAGVQEAIYSLLMLENGFIAPSINIEELDEQAEGMNIVTERTXEAELNTVMSNSFGFGGTNATLVFRKYNGXXX' | sed 's/X//g'


In [None]:
#consensus sequence without gap 
'MRRVVITGLGIVSSIGNNKQEVLASLKEGRSGITFSEEFAEMGMRSQVAGNVKLDPAELIDRKVLRFMGDAAAYAYLSMQQAIADAGLTEEQVSNPRTGLIAGSGGGSSRNQVEAADILRRGVKRVGPYAVTKTMASTVSACLATPFKIKGVNYSISSACATSAHCIGNAVEQIQLGKQDIVFAGGGEELHWELSCEFDAMGALSTKYNDTPEKASRAYDANRDGFVIAGGGGMVVVEELEHALARGAKIYAEIVGYGATSDGYDMVAPSGEGAVRCMKQAMATVDGPIDYINTHGTSTPVGDVKELEAIREVFGDNTPAISSTKSMTGHSLGAAGVQEAIYSLLMLENGFIAPSINIEELDEQAEGMNIVTERTEAELNTVMSNSFGFGGTNATLVFRKYNG'