In [1]:
import pandas as pd
import os 


## Map sequencing results (accessions) from pipeline iteration 1 to 2 

In [4]:
df_all=pd.read_csv("SI tables/SI table 2.tsv",sep="\t")

In [3]:
df_all_origin=pd.read_csv("og_all_protein_tax_info.csv",index_col=0)
dfo=df_all_origin[df_all_origin.representative_seq=='TRUE']
common=list(dfo[dfo.index.isin(df_all[df_all.representative].protein)].index)
new_old={df_all[df_all.protein==x].cluster.values[0]:dfo.loc[x,'cluster'] for x in common}
new_old=pd.DataFrame.from_dict(new_old,orient='index')
new_old.index.name='new'
new_old.columns=['old']
new_old.to_csv('new_old_mapping.tsv')

In [27]:
seq_og=pd.read_csv("SI tables/og_SI Table 11 - sequencing_results.tsv",sep="\t")

seq_og['reference protein accession ']=seq_og['reference protein accession '].replace({'KAH9406650':'KAH9406650.1','XP_019558301.2':' XP_019540707.3','XP_053634600.1':'XP_053634600.2','XP_042908388.1':'XP_015919340.2','XP_027231698.1':'XP_069990332.1','XP_027232647.1':'XP_027232647.2'})
td=seq_og[seq_og['reference protein accession '].isin(df_all.protein)]
len(td)


44

## Make multiple sequence alignments

In [28]:
from Bio import Entrez, SeqIO

def append_protein_to_fasta(accession, fasta_file, email='rkapoor@g.harvard.edu'):

    Entrez.email = email
    
    # Fetch the protein sequence from the protein database in FASTA format
    with Entrez.efetch(db="protein", id=accession, rettype="fasta", retmode="text") as handle:
        record = SeqIO.read(handle, "fasta")
    
    # Append the fetched record to the specified FASTA file
    with open(fasta_file, "a") as output_handle:
        SeqIO.write(record, output_handle, "fasta")
    
    print(f"Protein {accession} has been appended to {fasta_file}.")



In [7]:
from Bio import SeqIO

In [31]:
##load amino acid translations from the genbank submission
input_file = open(f"genbank_translations_may_2025.fa")
fasta_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))

In [38]:
!mkdir 'PCR_result_alignments'
td=seq_og[seq_og['reference protein accession '].isin(df_all.protein)]
td=td[td['sequence name '].isin(fasta_dict.keys())]
for a in set(td['reference protein accession ']):
    !mkdir 'PCR_result_alignments'/"$a"
    f=open(f'PCR_result_alignments/{a}/protein.fa','a')
    for x in list(td[td['reference protein accession ']==a]['sequence name ']):
        f.write(f">{x}\n")
        s=str(fasta_dict[x].seq)
        f.write(f"{s}\n")
    f.close()
    
    append_protein_to_fasta(a,f'PCR_result_alignments/{a}/protein.fa')

  

Protein XP_035708168.1 has been appended to PCR_result_alignments/XP_035708168.1/protein.fa.
Protein XP_046453153.1 has been appended to PCR_result_alignments/XP_046453153.1/protein.fa.
Protein XP_045027829.1 has been appended to PCR_result_alignments/XP_045027829.1/protein.fa.
Protein XP_046403459.1 has been appended to PCR_result_alignments/XP_046403459.1/protein.fa.
Protein XP_050513096.1 has been appended to PCR_result_alignments/XP_050513096.1/protein.fa.
Protein XP_042220148.1 has been appended to PCR_result_alignments/XP_042220148.1/protein.fa.
Protein XP_029735553.1 has been appended to PCR_result_alignments/XP_029735553.1/protein.fa.
Protein XP_046456339.1 has been appended to PCR_result_alignments/XP_046456339.1/protein.fa.
Protein XP_027232647.2 has been appended to PCR_result_alignments/XP_027232647.2/protein.fa.
Protein XP_023332299.1 has been appended to PCR_result_alignments/XP_023332299.1/protein.fa.
Protein XP_037026007.1 has been appended to PCR_result_alignments/XP_0

In [39]:
## run MUSCLE to obtain MSA
for a in set(td['reference protein accession ']):
    !sh pcr_align.sh "$a"


muscle 5.1.linux64 []  1056Gb RAM, 112 cores
Built May 13 2023 06:17:56
(C) Copyright 2004-2021 Robert C. Edgar.
https://drive5.com

Input: 4 seqs, avg length 296, max 394



00:00 7.7Mb  CPU has 112 cores, running 1 threads
00:00 11Mb    100.0% Calc posteriors
00:00 12Mb    100.0% Consistency (1/2)
00:00 12Mb    100.0% Consistency (2/2)
00:00 12Mb    100.0% UPGMA5           
00:00 12Mb    100.0% Refining

muscle 5.1.linux64 []  1056Gb RAM, 112 cores
Built May 13 2023 06:17:56
(C) Copyright 2004-2021 Robert C. Edgar.
https://drive5.com

Input: 2 seqs, avg length 688, max 869



00:00 7.7Mb  CPU has 112 cores, running 1 threads
00:00 7.7Mb   100.0% Calc posteriors
00:00 7.7Mb   100.0% UPGMA5         

muscle 5.1.linux64 []  1056Gb RAM, 112 cores
Built May 13 2023 06:17:56
(C) Copyright 2004-2021 Robert C. Edgar.
https://drive5.com

Input: 3 seqs, avg length 1719, max 2477



00:00 7.7Mb  CPU has 112 cores, running 1 threads
00:00 7.7Mb   100.0% Calc posteriors
00:01 34Mb    100.0% Cons

In [40]:

from pymsaviz import MsaViz
import ast
for ac in set(td['reference protein accession ']):
    !mkdir 'PCR_result_alignments'/"$ac"
    
    mv = MsaViz(f"PCR_result_alignments/{ac}/MSA_protein.fa", wrap_length=100)
    meta=[]
    hgt=[]

    for inter in ast.literal_eval(df_all[df_all.protein==ac].Metazoan_intervals.values[0]):
        meta.append(inter)
    for inter in ast.literal_eval(df_all[df_all.protein==ac].HGT_intervals.values[0]):
        hgt.append(inter)
    for m in meta:
        mv.add_text_annotation((int(m[0]), int(m[1])), "Metazoan", text_color="orange", range_color="orange")
    for h in hgt:
        mv.add_text_annotation((int(h[0]), int(h[1])), "HGT", text_color="purple", range_color="purple")
    mv.savefig(f"PCR_result_alignments/{ac}/MSA_protein.png", dpi=400)


mkdir: cannot create directory ‘PCR_result_alignments/XP_035708168.1’: File exists
mkdir: cannot create directory ‘PCR_result_alignments/XP_046453153.1’: File exists
mkdir: cannot create directory ‘PCR_result_alignments/XP_045027829.1’: File exists
mkdir: cannot create directory ‘PCR_result_alignments/XP_046403459.1’: File exists
mkdir: cannot create directory ‘PCR_result_alignments/XP_050513096.1’: File exists
mkdir: cannot create directory ‘PCR_result_alignments/XP_042220148.1’: File exists
mkdir: cannot create directory ‘PCR_result_alignments/XP_029735553.1’: File exists
mkdir: cannot create directory ‘PCR_result_alignments/XP_046456339.1’: File exists
mkdir: cannot create directory ‘PCR_result_alignments/XP_027232647.2’: File exists
mkdir: cannot create directory ‘PCR_result_alignments/XP_023332299.1’: File exists
mkdir: cannot create directory ‘PCR_result_alignments/XP_037026007.1’: File exists
mkdir: cannot create directory ‘PCR_result_alignments/XP_023346081.1’: File exists
mkdi

In [42]:
from fpdf import FPDF
from PIL import Image
import os

# Path to the directory containing SVG files (here used for PNG images)
input_directory = "PCR_result_alignments"

# Output PDF file
pdf_output = "PCR_result_alignments_merged.pdf"

# Create a PDF document
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)

# For demonstration, we'll process one image.
# Replace `set(mapdf.reference_protein)` with your actual iterable if needed.
for a in set(td['reference protein accession ']):
    pdf.add_page()
    pdf.set_font("Arial", size=11)
    pdf.cell(200, 10, txt=a, ln=True, align='C')
    
    # Construct the full path to the image file
    image_path = os.path.join(input_directory, a, "MSA_protein.png")
    
    # Open the image using Pillow to get its dimensions (in pixels)
    with Image.open(image_path) as img:
        img_width, img_height = img.size

    # Convert pixels to millimeters.
    # Assuming an image DPI of 96. (1 inch = 25.4 mm, so conversion factor = 25.4 / DPI.)
    dpi = 96  # Modify this if your images have a different DPI setting.
    conv_factor = 25.4 / dpi
    img_width_mm = img_width * conv_factor
    img_height_mm = img_height * conv_factor

    # Define available width and height for the image within the PDF page.
    # This calculation accounts for the margins. You can modify these values as needed.
    max_width = pdf.w - 2 * pdf.l_margin
    # Use the top margin offset (pdf.t_margin + 20) for header space, then subtract the bottom margin.
    max_height = pdf.h - (pdf.t_margin + 20) - pdf.b_margin

    # Calculate the scaling factor to fit the image within the available space,
    # keeping the original aspect ratio.
    scale = min(max_width / img_width_mm, max_height / img_height_mm, 1)
    new_width = img_width_mm * scale
    new_height = img_height_mm * scale

    # Place the image onto the page with the new dimensions.
    pdf.image(image_path, x=pdf.l_margin, y=pdf.t_margin + 20, w=new_width, h=new_height)
    
   

# Save the PDF to a file.
pdf.output(pdf_output)
print(f"PDF generated successfully: {pdf_output}")


PDF generated successfully: PCR_result_alignments_merged.pdf


In [None]:
input_file = open(f"genbank_translations_may_2025.fa")
fasta_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))
fasta_dict = {x:fasta_dict[x] for x in td['sequence name ']}
records = [fasta_dict[i] for i in fasta_dict]
with open("genbank_submission_07_23_2025/genbank_translations.fa", "w") as out:
    SeqIO.write(records, out, "fasta")

In [None]:
input_file = open(f"genbank_translations_may_2025.fa")
fasta_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))
fasta_dict = {x:fasta_dict[x] for x in td['sequence name ']}
records = [fasta_dict[i] for i in fasta_dict]
with open("genbank_submission_07_23_2025/genbank_translations.fa", "w") as out:
    SeqIO.write(records, out, "fasta")

## add genbank accessions

In [None]:
import pandas as pd

In [None]:
td=pd.read_csv("SI tables/sequencing_results.tsv",sep="\t",index_col=0)
gb=pd.read_csv("genbank_submission_07_23_2025/genbank_submission_07_23_2025_genbank_submit.tsv",sep="\t",index_col=0)

In [None]:
##load mapping between submitted sequence accession and genbank id, provided in email from genbank
gen="""
SUB15265677 Dapuli_chimera2_seq1        PV989571
SUB15265677 Brcopr_chimera1_seq1        PV989572
SUB15265677 Dapuli_chimera3_seq1        PV989573
SUB15265677 Typutr_chimera1_seq1        PV989574
SUB15265677 Typutr_chimera1_seq2        PV989575
SUB15265677 Damagn_chimera1_seq1        PV989576
SUB15265677 Damagn_chimera1_seq2        PV989577
SUB15265677 Iseleg_chimera1_seq1        PV989578
SUB15265677 Chquad_chimera1_seq1        PV989579
SUB15265677 Divirg_chimera1_seq1        PV989580
SUB15265677 Dapule_chimera1_seq1        PV989581
SUB15265677 Dapule_chimera2_seq1        PV989582
SUB15265677 Dapule_chimera3_seq1        PV989583
SUB15265677 Iseleg_chimera2_seq1        PV989584
SUB15265677 Patepi_chimera1_seq1        PV989585
SUB15265677 Hoamer_chimera1_seq1        PV989586
SUB15265677 Brcopr_chimera2_seq1        PV989587
SUB15265677 Brcopr_chimera3_seq1        PV989588
SUB15265677 Brcopr_chimera4_seq1        PV989589
SUB15265677 Brcopr_chimera5_seq1        PV989590
SUB15265677 Brcopr_chimera6_seq1        PV989591
SUB15265677 Focand_chimera2_seq1        PV989592
SUB15265677 Focand_chimera4_seq1        PV989593
SUB15265677 Focand_chimera4_seq2        PV989594
SUB15265677 Focand_chimera4_seq3        PV989595
SUB15265677 Damagn_chimera2_seq1        PV989596
SUB15265677 Ixscap_chimera1_seq1        PV989597
SUB15265677 Aealbo_chimera1_seq1        PV989598
SUB15265677 Pevann_chimera1_seq1        PV989599
SUB15265677 Pevann_chimera2_seq1        PV989600
SUB15265677 Eucaro_chimera1_seq1        PV989601
SUB15265677 Eucaro_chimera2_seq1        PV989602
SUB15265677 Eucaro_chimera3_seq1        PV989603
SUB15265677 Eucaro_chimera3_seq2        PV989604
SUB15265677 Eucaro_chimera3_seq3        PV989605
SUB15265677 Eucaro_chimera4_seq1        PV989606
SUB15265677 Eucaro_chimera5_seq1        PV989607
SUB15265677 Aeaegy_chimera1_seq1        PV989608
SUB15265677 Badors_chimera2_seq1        PV989609
SUB15265677 Aeaegy_chimera2_seq1        PV989610"""

In [None]:
genmap={x.split("        ")[0].split(" ")[1]:x.split("        ")[1].strip() for x in gen.split("\n")[1:]}

In [None]:
for index, row in td.iterrows():
    if index in genmap:
        td.loc[index,'genbank accession']=genmap[index]
td.to_csv("SI tables/sequencing_results.tsv",sep="\t")


In [8]:
import pandas as pd
s13=pd.read_csv("SI tables/SI_table_13_sequencing_results.tsv",sep="\t",index_col=0)
for index, row in s13.iterrows():
    s13.loc[index,'chimera cluster #']=df_all[df_all.protein==row['reference protein accession']]['cluster'].values[0]

In [10]:
s13.to_csv("SI tables/SI_table_13_sequencing_results.tsv",sep="\t")