Mutation_frequency_visualize

In [5]:
from Bio import SeqIO
from Bio.Align.Applications import ClustalOmegaCommandline

In [None]:
# Load the raw sequences: parse "spike_sequences.fasta" (path is relative to
# the working directory) and keep each record's sequence as a plain string.
fasta_records = SeqIO.parse("spike_sequences.fasta", "fasta")
sequence = [str(entry.seq) for entry in fasta_records]

In [None]:
# Align the input sequences with Clustal Omega (the external `clustalo`
# program is invoked through Biopython's command-line wrapper).
#   infile  - unaligned FASTA read by clustalo
#   outfile - aligned FASTA written by clustalo, read back by later cells
#   auto    - let clustalo choose its own parameters
#   verbose - ask clustalo for progress output
#   force   - overwrite an existing outfile; without it clustalo refuses to
#             replace "aligned_spike.fasta", so re-running this cell fails
clustal_omega = ClustalOmegaCommandline(
    infile="spike_sequences.fasta",
    outfile="aligned_spike.fasta",
    verbose=True,
    auto=True,
    force=True,
)
# Calling the wrapper runs the program and returns its captured output streams.
stdout, stderr = clustal_omega()

In [None]:
from Bio import AlignIO
import pandas as pd

In [None]:
# Mutation analysis: read the alignment produced by the previous step.
alignment = AlignIO.read("aligned_spike.fasta", "fasta")

# The first aligned record serves as the reference; every other record is
# compared against it column by column.
reference = str(alignment[0].seq)
mutations = []

for sample in alignment[1:]:
    # One dict per alignment column where the sample differs from the
    # reference. "Position" is the 1-based column index in the alignment.
    mutations.extend(
        {
            "Sample": sample.id,
            "Position": column,
            "Reference": ref_char,
            "Variant": sample_char,
        }
        for column, (ref_char, sample_char) in enumerate(
            zip(reference, str(sample.seq)), start=1
        )
        if ref_char != sample_char
    )

In [None]:
# Build the mutation table: one row per differing alignment column.
# The columns are named explicitly so that the frame still has a "Position"
# column (and the others) when `mutations` is empty; otherwise an empty list
# yields a column-less frame and later `mutation_df['Position']` lookups
# raise KeyError.
mutation_df = pd.DataFrame(
    mutations,
    columns=["Sample", "Position", "Reference", "Variant"],
)
# Quick preview of the first rows.
print(mutation_df.head())

In [None]:
# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Count how many mutation rows were recorded at each alignment position and
# draw the counts as a bar chart.
mutation_frequency = mutation_df['Position'].value_counts()

fig, ax = plt.subplots()
sns.barplot(x=mutation_frequency.index, y=mutation_frequency.values, ax=ax)
ax.set_title("Mutation Frequency in SARS-CoV-2 Spike Protein")
ax.set_xlabel("Position")
ax.set_ylabel("Frequency")
plt.show()