In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO
import glob

### metadata: genome

In [3]:
df_0 = pd.read_excel(
    "../dataset/AllSatellitesTables.xlsx", sheet_name="PICI", engine="openpyxl"
)
print(df_0.shape)
print(df_0.columns)

In [None]:
metadata = pd.read_csv("../dataset/PICI_GenomicNucInfo_060723.csv")
print(metadata.shape)
print(metadata.columns)

In [72]:
# Derive the set-independent identifier by cutting PICI_ID at its first ".Set".
# regex=False makes pandas treat ".Set" literally; a multi-character pattern is
# otherwise interpreted as a regular expression, in which "." matches any character.
metadata["PICI_ID_allsets"] = (
    metadata["PICI_ID"].str.split(".Set", n=1, regex=False).str[0]
)
# Move the new column to the front; the other columns keep their order.
cols = ["PICI_ID_allsets"] + [
    col for col in metadata.columns if col != "PICI_ID_allsets"
]
metadata = metadata[cols]
# errors="ignore" keeps the cell re-runnable: on a second run the
# "Unnamed: 0" column has already been dropped.
metadata = metadata.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
metadata

In [74]:
# Number of distinct full PICI_IDs behind each set-independent identifier.
ids_per_set = metadata.groupby("PICI_ID_allsets")["PICI_ID"].nunique()
# Identifiers that occur with more than one full PICI_ID.
duplicated_sets = ids_per_set.loc[ids_per_set > 1].index
# All metadata rows belonging to those identifiers, grouped together by sorting.
duplicate_rows = metadata.loc[
    metadata["PICI_ID_allsets"].isin(duplicated_sets)
].sort_values(["PICI_ID_allsets", "PICI_ID"])

In [79]:
duplicate_rows.to_csv("../dataset/duplicate_picis.csv", index=False)

In [None]:
# Unique-ID counts: first for all metadata rows, then for the duplicated sets;
# within each frame the set-independent ID is reported before the full ID.
for frame in (metadata, duplicate_rows):
    for column in ("PICI_ID_allsets", "PICI_ID"):
        print(len(frame[column].unique()))
# 1435 - 1286 = 287 - 138 = 149

### metadata: protein

In [None]:
seqs = list(SeqIO.parse("../dataset/PICI_NucSequences_060723.fst", "fasta"))
print(len(seqs))
print("first sample:")
print(seqs[0])

In [None]:
seqs_prt_1 = list(
    SeqIO.parse(
        "../dataset/SatelliteProteomes/ACER001.0321.00001.C001.PICI.TypeB.variant0001.Set1.prt",
        "fasta",
    )
)
print(len(seqs_prt_1))
print(seqs_prt_1[0])

In [2]:
# Build one row per protein from every per-satellite proteome FASTA file.
data_prt = []

# glob returns paths in arbitrary, filesystem-dependent order; sorting makes the
# row order of the resulting table reproducible between runs.
prt_files = sorted(glob.glob("../dataset/SatelliteProteomes/*.prt"))

for prt_file in prt_files:
    # Iterate over the parser directly; no need to hold each file as a list.
    for seq in SeqIO.parse(prt_file, "fasta"):
        # Function is the text after the last '|' of the header line,
        # or an empty string when the header contains no '|'.
        _, separator, tail = seq.description.rpartition("|")
        function = tail.strip() if separator else ""

        # PICI_ID is everything before the last underscore of the protein ID.
        pici_id = "_".join(seq.id.split("_")[:-1])

        data_prt.append(
            {
                "PICI_ID": pici_id,
                "Protein_ID": seq.id,
                "Function": function,
                "Description": seq.description,
            }
        )

# Explicit columns keep the schema stable even when no .prt file was found,
# so later cells that index these columns do not fail with a KeyError.
df_prt = pd.DataFrame(
    data_prt, columns=["PICI_ID", "Protein_ID", "Function", "Description"]
)

In [38]:
# Set-independent identifier: PICI_ID cut at its first ".Set".
# regex=False matches ".Set" literally; as a regular expression the "." would
# match any character.
df_prt["PICI_ID_allsets"] = (
    df_prt["PICI_ID"].str.split(".Set", n=1, regex=False).str[0]
)
# Put the new column first; the other columns keep their order.
cols = ["PICI_ID_allsets"] + [col for col in df_prt.columns if col != "PICI_ID_allsets"]
df_prt = df_prt[cols]

In [40]:
df_prt.to_csv("../dataset/PICI_proteins.csv", index=False)

In [3]:
df_prt = pd.read_csv("../dataset/PICI_proteins.csv")

In [None]:
df_prt.head()

In [None]:
# Number of distinct values in each column of the protein table.
for column in ("PICI_ID_allsets", "PICI_ID", "Protein_ID", "Description", "Function"):
    print(len(df_prt[column].unique()))

### protein function

In [8]:
function_counts = pd.read_csv("../results/proteins_all_function_counts.tsv", sep="\t")

In [7]:
# function_counts = df_prt["Function"].value_counts().reset_index(name="Count")

In [None]:
function_counts

In [28]:
# Select the function-count rows whose name contains each keyword (case-sensitive)
# and save every selection to its own CSV file.
# na=False treats rows with a missing Function as non-matches; without it the
# mask contains NaN and pandas refuses to use it for boolean indexing.
function_names = function_counts["Function"]

integrase_count = function_counts[function_names.str.contains("integrase", na=False)]
integrase_count.to_csv("../dataset/proteins_integrases.csv", index=False)

alpa_count = function_counts[function_names.str.contains("AlpA", na=False)]
alpa_count.to_csv("../dataset/proteins_AlpA.csv", index=False)

capsid_count = function_counts[function_names.str.contains("capsid", na=False)]
capsid_count.to_csv("../dataset/proteins_capsid.csv", index=False)

In [5]:
pici_functions = df_prt.groupby("PICI_ID")["Function"].agg(list)


def check_functions(func_list):
    """Return True if any function annotation in ``func_list`` mentions AlpA.

    The annotations are joined with spaces, lower-cased, and searched for the
    substring "alpa".

    Args:
        func_list: Iterable of function annotations. Entries that are not
            strings (for example NaN for a missing annotation) are skipped;
            joining them would raise a TypeError.

    Returns:
        bool: True when "alpa" occurs in the joined, lower-cased text.
    """
    func_str = " ".join(func for func in func_list if isinstance(func, str)).lower()
    return "alpa" in func_str


# ("integrase" in func_str) and ('capsid' not in func_str)

# Keep the PICIs whose pooled function annotations pass check_functions.
passes_check = pici_functions.apply(check_functions)
filtered_picis = pici_functions[passes_check]

# Two-column table: the PICI identifier and its list of function annotations.
result_df = pd.DataFrame(
    {
        "PICI_ID": filtered_picis.index,
        "All_Functions": filtered_picis.values,
    }
)

In [54]:
result_df.to_csv("../results/picis_integrase_alpa.csv", index=False)

### cfpici

In [None]:
df_prt = pd.read_csv("../dataset/PICI_proteins.csv")

In [None]:
# df_prt[
#     df_prt["PICI_ID"].str.contains("CFPICI") & df_prt["Function"].str.contains("capsid")
# ]
len(df_prt[df_prt["PICI_ID"].str.contains("CFPICI")]["PICI_ID"].unique())
# there are 30 cfpici among 1435 picis

In [None]:
seqs = list(SeqIO.parse("../dataset/PICI_NucSequences_060723.fst", "fasta"))
print(seqs[0])

In [None]:
# Print the first record whose description equals the target ID, with its length.
# Nothing is printed when no record matches.
target_description = "SIME002.0321.00024.C001.PICI.TypeB.variant0001.SetR1"
matching_record = next(
    (record for record in seqs if record.description == target_description), None
)
if matching_record is not None:
    print(matching_record)
    print(len(matching_record.seq))

In [None]:
pici_30 = list(
    [
        "SIME002.0321.00024.C001.PICI.TypeB.variant0001.SetR1",
        "SIME002.0321.00009.C001.PICI.TypeB.variant0001.SetR1",
        "STSI001.0321.00005.C001.PICI.TypeB.variant0001.SetR1",
        "LAPL001.0321.00074.C001.PICI.TypeB.variant0003.SetR1",
        "SIME002.0321.00018.C001.PICI.TypeB.variant0001.SetR1",
        "STSI001.0321.00003.C001.PICI.TypeB.variant0001.SetR1",
        "LAPL001.0321.00073.C001.PICI.TypeB.variant0003.SetR1",
        "SIME002.0321.00002.C001.PICI.TypeB.variant0001.SetR1",
        "STAU002.0321.00280.C001.PICI.TypeB.variant0004.SetR1",
        "SPYA001.0321.00006.C001.PICI.TypeB.variant0001.SetR1",
        "ESCO001.0321.00563.C001.PICI.TypeA.SetR1",
        "ESCO001.0321.00097.C001.PICI.TypeA.SetR1",
        "CIFR005.0321.00052.C001.PICI.TypeB.variant0001.SetR1",
        "STSI001.0321.00004.C001.PICI.TypeB.variant0001.SetR1",
        "CIFR005.0321.00053.C001.PICI.TypeB.variant0001.SetR1",
        "DEVU001.0321.00001.C001.PICI.TypeB.variant0001.SetR1",
        "ESCO001.0321.00309.C001.PICI.TypeB.variant0001.SetR1",
        "ESCO001.0321.01281.C001.PICI.TypeA.SetR1",
        "DEVU001.0321.00001.C001.PICI.TypeB.variant0001.SetR2",
        "PAKO001.0321.00001.C001.PICI.TypeB.variant0004.SetR1",
        "DEVU001.0321.00004.C001.PICI.TypeB.variant0001.SetR1",
        "STSI001.0321.00006.C001.PICI.TypeB.variant0001.SetR1",
        "ESCO001.0321.00448.C001.PICI.TypeA.SetR1",
        "NIHA002.0321.00001.C001.PICI.TypeB.variant0001.SetR1",
        "RHET001.0321.00004.C001.PICI.TypeB.variant0001.SetR1",
        "CIFR005.0321.00101.C001.PICI.TypeB.variant0001.SetR1",
        "STAR001.0321.00001.C001.PICI.TypeB.variant0002.SetR1",
        "PAKO001.0321.00001.C002.PICI.TypeB.variant0004.SetR1",
        "CIAM001.0321.00007.C001.PICI.TypeB.variant0001.SetR1",
        "STSA003.0321.00008.C001.PICI.TypeB.variant0002.SetR1",
    ]
)
cfpici_30 = list(
    [
        "ESCO001.0321.01281.C001.CFPICI.TypeA.SetR1",
        "CIFR005.0321.00053.C001.CFPICI.TypeB.variant0001.SetR1",
        "STSA003.0321.00008.C001.CFPICI.TypeB.variant0002.SetR1",
        "DEVU001.0321.00001.C001.CFPICI.TypeB.variant0001.SetR1",
        "CIFR005.0321.00052.C001.CFPICI.TypeB.variant0001.SetR1",
        "DEVU001.0321.00004.C001.CFPICI.TypeB.variant0001.SetR1",
        "STSI001.0321.00003.C001.CFPICI.TypeB.variant0001.SetR1",
        "STSI001.0321.00004.C001.CFPICI.TypeB.variant0001.SetR1",
        "LAPL001.0321.00074.C001.CFPICI.TypeB.variant0003.SetR1",
        "ESCO001.0321.00097.C001.CFPICI.TypeA.SetR1",
        "DEVU001.0321.00001.C001.CFPICI.TypeB.variant0001.SetR2",
        "SPYA001.0321.00006.C001.CFPICI.TypeB.variant0001.SetR1",
        "STSI001.0321.00005.C001.CFPICI.TypeB.variant0001.SetR1",
        "RHET001.0321.00004.C001.CFPICI.TypeB.variant0001.SetR1",
        "ESCO001.0321.00309.C001.CFPICI.TypeB.variant0001.SetR1",
        "SIME002.0321.00002.C001.CFPICI.TypeB.variant0001.SetR1",
        "PAKO001.0321.00001.C002.CFPICI.TypeB.variant0004.SetR1",
        "STAU002.0321.00280.C001.CFPICI.TypeB.variant0004.SetR1",
        "SIME002.0321.00009.C001.CFPICI.TypeB.variant0001.SetR1",
        "STSI001.0321.00006.C001.CFPICI.TypeB.variant0001.SetR1",
        "CIFR005.0321.00101.C001.CFPICI.TypeB.variant0001.SetR1",
        "LAPL001.0321.00073.C001.CFPICI.TypeB.variant0003.SetR1",
        "SIME002.0321.00024.C001.CFPICI.TypeB.variant0001.SetR1",
        "ESCO001.0321.00563.C001.CFPICI.TypeA.SetR1",
        "STAR001.0321.00001.C001.CFPICI.TypeB.variant0002.SetR1",
        "ESCO001.0321.00448.C001.CFPICI.TypeA.SetR1",
        "CIAM001.0321.00007.C001.CFPICI.TypeB.variant0001.SetR1",
        "PAKO001.0321.00001.C001.CFPICI.TypeB.variant0004.SetR1",
        "NIHA002.0321.00001.C001.CFPICI.TypeB.variant0001.SetR1",
        "SIME002.0321.00018.C001.CFPICI.TypeB.variant0001.SetR1",
    ]
)
print(len(pici_30), len(cfpici_30))

In [None]:
# Drop every record whose ID is listed in cfpici_30
# (expected to leave 1405 of the PICI sequences).
cfpici_ids = set(cfpici_30)
seqs = [record for record in seqs if record.id not in cfpici_ids]
len(seqs)

In [None]:
# 1405
# Write every remaining record as a two-line FASTA entry:
# ">" + ID on the first line, the unwrapped sequence on the second.
with open("../dataset/PICI_NucSequences_060723_no_cfpici.fst", "w") as fasta_out:
    fasta_out.writelines(f">{record.id}\n{record.seq}\n" for record in seqs)

### function matrix

In [30]:
df_prt = pd.read_csv("../dataset/PICI_proteins.csv")

In [31]:
# PICI x Function count matrix: each cell is the number of Protein_ID entries
# a PICI has for that function; absent combinations are filled with 0.
function_matrix = df_prt.pivot_table(
    index="PICI_ID",
    columns="Function",
    values="Protein_ID",
    aggfunc="count",
    fill_value=0,
)

In [32]:
function_matrix.to_csv("../results/protein_function_matrix_old.csv", index=True)

In [None]:
# Sanity checks on the count matrix: grand total, largest and smallest cell.
print(function_matrix.sum().sum())  # should be same as df_prt.shape[0]
print(function_matrix.max().max())
print(function_matrix.min().min())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
function_matrix.info()

In [None]:
# Select the function columns whose name contains "alpa" (case-insensitive).
# Earlier variant that also matched integrase and capsid columns:
# col_mask = function_matrix.columns.str.contains("integrase|alpa|capsid", case=False)
# NOTE(review): the active mask matches only "alpa", while the variable below
# (and the file it is saved to) still mention integrase and capsid — confirm
# which selection is intended.
col_mask = function_matrix.columns.str.contains("alpa", case=False)
function_matrix_integrase_alpa_capsid = function_matrix.loc[:, col_mask]

In [None]:
function_matrix_integrase_alpa_capsid.to_csv(
    "../results/protein_function_matrix_integrase_alpa_capsid.csv", index=True
)

#### new matrix with pharokka annotations

In [None]:
# Reload the original protein-function matrix; its first CSV column is the index.
protein_function_matrix_old = pd.read_csv(
    "../results/protein_function_matrix_old.csv", index_col=0
)
print(protein_function_matrix_old.shape)
# info() writes its report itself and returns None; print(df.info()) added a
# stray "None" line.
protein_function_matrix_old.info()
print(protein_function_matrix_old.max().max())
print(protein_function_matrix_old.min().min())

In [None]:
phanotate_functions = pd.read_csv(
    "../results/pharokka_output/phanotate_functions.tsv", sep="\t"
)
print(phanotate_functions.shape)
print(phanotate_functions.columns)

In [19]:
# pici_id x function count matrix from the pharokka annotations: each cell is
# the number of gene_id entries; absent combinations are filled with 0.
function_matrix_new_unmerged = phanotate_functions.pivot_table(
    index="pici_id",
    columns="function",
    values="gene_id",
    aggfunc="count",
    fill_value=0,
)

In [None]:
# Sanity checks on the unmerged matrix: shape, dtypes, cell range, grand total.
print(function_matrix_new_unmerged.shape)
# info() prints its report itself and returns None; print(df.info()) added a
# stray "None" line.
function_matrix_new_unmerged.info()
print(function_matrix_new_unmerged.max().max())
print(function_matrix_new_unmerged.min().min())
function_matrix_new_unmerged.sum().sum()

In [21]:
function_matrix_new_unmerged.to_csv(
    "../results/protein_function_matrix_new_unmerged.csv", index=True
)

In [47]:
function_matrix_new_unmerged = pd.read_csv(
    "../results/protein_function_matrix_new_unmerged.csv", index_col=0
)

In [None]:
# Columns taken from the original protein-function matrix and appended to the
# pharokka-based matrix.
alpa_columns = [
    "AlpA family phage regulatory protein  (translation)",
    "AlpA family transcriptional regulator  (translation)",
]

# Remove the listed rows from both matrices: the pharokka matrix is indexed by
# the IDs in pici_30, the original matrix by the IDs in cfpici_30.
function_matrix_new = function_matrix_new_unmerged.drop(index=pici_30)
alpa_data = function_matrix[alpa_columns].drop(index=cfpici_30)

print("Shape after removal:")
print("function_matrix_new:", function_matrix_new.shape)
print("alpa_data:", alpa_data.shape)

# Index.equals compares elements and order and returns False when the lengths
# differ; the element-wise `==` raised a ValueError in that case, hiding the
# mismatch this check is meant to report.
print("\nIndices identical:", function_matrix_new.index.equals(alpa_data.index))

# Column-wise concat aligns rows by index label; labels present on only one
# side produce NaN, which is replaced by 0.
function_matrix_new = pd.concat([function_matrix_new, alpa_data], axis=1).fillna(0)

print("\nFinal shape:", function_matrix_new.shape)
print("Number of unique indices:", len(function_matrix_new.index.unique()))

In [51]:
function_matrix_new.to_csv(
    "../results/protein_function_matrix_new_merged.csv", index=True
)


In [None]:
# Sanity checks on the merged matrix: shape, dtypes, cell range, grand total.
print(function_matrix_new.shape)
# info() prints its report itself and returns None; print(df.info()) added a
# stray "None" line.
function_matrix_new.info()
print(function_matrix_new.max().max())
print(function_matrix_new.min().min())
function_matrix_new.sum().sum()

# mash

In [None]:
from Bio import SeqIO
import os

# Make sure the target directory for the per-sequence files exists.
os.makedirs("../dataset/split_sequences", exist_ok=True)

# Write every record of the multi-FASTA file to its own FASTA file.
count = 0  # stays 0 when the input contains no records
all_records = SeqIO.parse("../dataset/PICI_NucSequences_060723.fst", "fasta")
for count, record in enumerate(all_records, start=1):
    # The record ID becomes the file name; path separators are replaced.
    safe_filename = record.id.replace("/", "_").replace("\\", "_")
    output_file = f"../dataset/split_sequences/{safe_filename}.fasta"
    with open(output_file, "w") as output_handle:
        SeqIO.write(record, output_handle, "fasta")

print(f"Split {count} sequences into individual files")

In [None]:
# The distance table is read as header-less (this notebook assigns the column
# names by hand). Without header=None, read_csv consumes the first distance
# record as column labels and that row is lost from the data.
mash_dist = pd.read_csv(
    "../output/mash_distances.tab",
    sep="\t",
    header=None,
    names=["seq1", "seq2", "distance", "pvalue", "matching_hashes"],
)
print(mash_dist.shape)
print(mash_dist.columns)

In [24]:
# add column names
mash_dist.columns = ["seq1", "seq2", "distance", "pvalue", "matching_hashes"]

In [None]:
mash_dist

In [None]:
import matplotlib.pyplot as plt

# Histogram of the pairwise distances with a log-scaled count axis.
# Axis labels and a title are added so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(mash_dist["distance"], bins=1000)
ax.set_yscale("log")
ax.set(
    xlabel="Mash distance",
    ylabel="Number of sequence pairs (log scale)",
    title="Distribution of pairwise Mash distances",
)
plt.show()


# phage_satellites_functions

In [1]:
import pandas as pd

annotation = pd.read_parquet("../dataset/Phage_and_Satellites_Pann_Pcat_Pcol.pa")

In [3]:
annotation["pcat"].value_counts()

pcat
unknown_no_hit                                       1391498
DNA, RNA and nucleotide metabolism                    252511
tail                                                  182345
head and packaging                                    168373
other                                                  82561
lysis                                                  52519
connector                                              47674
transcription regulation                               46522
moron, auxiliary metabolic gene and host takeover      44573
unknown function                                       29353
integration and excision                               20609
Name: count, dtype: int64

In [2]:
annotation_satellites = annotation[annotation["what"] != "phage"]
annotation_satellites["pcat"].value_counts()

pcat
unknown function                                     29353
DNA, RNA and nucleotide metabolism                   11004
head and packaging                                    7883
integration and excision                              7056
transcription regulation                              6931
connector                                             1824
other                                                  474
moron, auxiliary metabolic gene and host takeover      375
tail                                                    14
lysis                                                   13
Name: count, dtype: int64

In [11]:
annotation[annotation["acc"].str.contains("STAU002_0321_00536_C001")]
# STAU002.0321.00536.C001

Unnamed: 0,acc,name,start,end,strand,pann,pcat,pcol,translation,what
0,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0001,2,569,-1,terminase small subunit,head and packaging,,MSELTAKQARFVNEYIRTLNVTQSAIKAGYSANSAHVTGCRLLKKP...,PICI
1,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0002,565,907,-1,hypothetical protein,unknown function,,MDKKQIKDFVCDYHKRTISDVLIDDEINTDEFFSIGDENSNEWMAD...,PICI
2,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0003,909,1437,-1,spore coat protein,head and packaging,,MKLLKTKNCLYYRNGDNKLSEYQLLTQFNPAFINKKIKMCEFQIES...,PICI
3,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0004,1487,1706,-1,hypothetical protein,unknown function,,METKYELNNTKKVANAFGLNEADTNLLINAVDLDIKNNMQEISSEL...,PICI
4,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0005,1723,2302,-1,hypothetical protein,unknown function,,MKTESYFKEYNQFVIDQQKAIQELEQERNALESKIKIDKSTYKQLI...,PICI
5,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0006,2313,2655,-1,hypothetical protein,unknown function,,MQSIAEKETYHLPTEHLQVFNVIKNTSNKYITKTKILNQLGYEYNS...,PICI
6,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0007,3104,3746,-1,hypothetical protein,unknown function,,MNLETIVNQFETRAGTLLRYYTGLLEHSKVQPCCFKLYNDPFDMVY...,PICI
7,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0008,3742,4123,-1,hypothetical protein,unknown function,,MDKEQLKKYIYEYVKEYKEIPIYQLEDLFKEINHDYIGRTSITHDK...,PICI
8,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0009,4432,6142,-1,DNA primase/helicase,"DNA, RNA and nucleotide metabolism",,METGKSDVLDKIEKINKKDSALQEIIPKGYEIEHHQCGVALYQLIP...,PICI
9,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0010,6155,7025,-1,DNA polymerase/primase,"DNA, RNA and nucleotide metabolism",,MNEIKLEYDTHVSVVHYESLDSRSFKSFSKPEWSKLVNKLSVPIEA...,PICI


In [None]:
# Number of distinct values per annotation column. Every column is reported
# once and labelled with its name (the translation count used to be printed
# twice, and the bare numbers were hard to attribute).
for column in ["acc", "name", "strand", "pann", "pcat", "pcol", "translation", "what"]:
    print(f"{column}: {annotation[column].nunique()}")

In [32]:
# annotation[annotation["pcat"] == "moron, auxiliary metabolic gene and host takeover"]

In [None]:
print(annotation["pcat"].unique())
print(annotation["pcol"].unique())
print(annotation["what"].unique())

In [None]:
annotation["pcat"].value_counts()
# leave out: unknown_no_hit
# start with tail

In [6]:
# filter out where pcat is unknown_no_hit
annotation = annotation[annotation["pcat"] != "unknown_no_hit"]

In [None]:
print(annotation.shape)
print(annotation["translation"].nunique())
print(annotation["pcat"].value_counts())

In [None]:
# All rows whose protein sequence occurs more than once (every copy is kept).
is_repeated = annotation.duplicated("translation", keep=False)
duplicated_translations = annotation[is_repeated]

# For each repeated sequence, list the names of the proteins sharing it.
separator = "-" * 40
for translation, members in duplicated_translations.groupby("translation"):
    # Only the first 30 residues are shown for brevity.
    print(f"Protein sequence: {translation[:30]}...")
    print(members[["name"]])
    print(separator)

In [35]:
# target_seq = annotation.loc[
#     annotation["name"] == "EZLKZNME_CDS_0010", "translation"
# ].iloc[0]
# same_seq_rows = annotation[annotation["translation"] == target_seq]
# same_seq_rows

In [None]:
# Number of distinct categories (pcat) assigned to each protein sequence.
pcat_counts = annotation.groupby("translation")["pcat"].nunique()
# Sequences annotated with more than one category, and the rows carrying them.
multi_pcat_seqs = pcat_counts.loc[pcat_counts > 1].index
rows_with_multi_pcat = annotation.loc[annotation["translation"].isin(multi_pcat_seqs)]
rows_with_multi_pcat
# conclusion: the ones with same translation must have same pcat

"pcat":function, file name
"name": id
"pann": description
"translation": sequence

In [37]:
import os

# Write one protein FASTA file per functional category (pcat).
output_dir = "../dataset/fasta_by_pcat"
os.makedirs(output_dir, exist_ok=True)

for pcat_value in annotation["pcat"].unique():
    # File name: lower-cased category, commas removed, spaces turned into underscores.
    filename = pcat_value.lower().replace(",", "").replace(" ", "_") + ".fa"
    filepath = os.path.join(output_dir, filename)

    # Rows annotated with this category.
    subset = annotation[annotation["pcat"] == pcat_value]

    with open(filepath, "w") as f:
        for name, pann, sequence in zip(
            subset["name"], subset["pann"], subset["translation"]
        ):
            # Header is ">name description"; the description is left out when missing.
            header = f">{name} {pann}" if pd.notna(pann) else f">{name}"
            f.write(f"{header}\n{sequence}\n")

# duplicates

In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Keep the first record of every distinct protein sequence, in file order.
seen_sequences = set()
unique_records = []

for record in SeqIO.parse("../dataset/combined_proteins.fa", "fasta"):
    residues = str(record.seq)
    if residues in seen_sequences:
        continue  # an identical sequence was already kept
    seen_sequences.add(residues)
    unique_records.append(record)

SeqIO.write(unique_records, "../dataset/unique_proteins.fa", "fasta")

# ids

In [None]:
annotation["pcat"].value_counts()

In [10]:
# save the names for each pcat
import os

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../dataset/pcat", exist_ok=True)

# One CSV per category, named after the category and holding its "name" column.
for pcat, group in annotation.groupby("pcat"):
    group["name"].to_csv(f"../dataset/pcat/{pcat}.csv", index=False)

In [16]:
unique_proteins = list(SeqIO.parse("../dataset/unique_proteins.fa", "fasta"))

In [None]:
unique_proteins[0]

In [23]:
# Collect the ID of every unique protein record and save them as a
# single-column CSV ("id").
list_of_ids = [protein.id for protein in unique_proteins]
ids_frame = pd.DataFrame(list_of_ids, columns=["id"])
ids_frame.to_csv("../dataset/pcat/all_unique_proteins.csv", index=False)

# clusters

In [None]:
import json
import statistics

# Read the protein -> cluster mapping from JSON.
with open("../dataset/protein_cluster_mapping.json", "r") as mapping_file:
    protein_to_cluster = json.load(mapping_file)

# Cluster size = number of proteins mapped to the same cluster value.
from collections import Counter

sizes = list(Counter(protein_to_cluster.values()).values())

# Summary statistics of the cluster sizes.
print(f"Total number of clusters: {len(sizes)}")
print(f"Smallest cluster: {min(sizes)}")
print(f"Largest cluster: {max(sizes)}")
print(f"Average cluster size: {statistics.mean(sizes):.2f}")
print(f"Median cluster size: {statistics.median(sizes)}")
# The sample standard deviation needs at least two clusters.
print(
    f"Standard deviation: {statistics.stdev(sizes):.2f}"
    if len(sizes) > 1
    else "Standard deviation: N/A (only one cluster)"
)

In [8]:
import pandas as pd

# Table of clusters and their sizes, largest cluster first.
cluster_sizes = Counter(protein_to_cluster.values())
cluster_numbers = list(cluster_sizes.keys())
cluster_counts = list(cluster_sizes.values())
df = pd.DataFrame(
    {"cluster_number": cluster_numbers, "cluster_size": cluster_counts}
).sort_values("cluster_size", ascending=False)

In [10]:
df.to_csv("../results/cluster_sizes.csv", index=False)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the cluster sizes with a log-scaled count axis.
# Axis labels and a title are added so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(sizes, bins=1000)
ax.set_yscale("log")
ax.set(
    xlabel="Cluster size (proteins per cluster)",
    ylabel="Number of clusters (log scale)",
    title="Distribution of protein cluster sizes",
)
plt.show()

# bacteria for demonstration 

In [3]:
frik_bacteria_proteins = list(
    SeqIO.parse(
        "../dataset/demonstration_samples/GCF_000175755.1/protein.faa",
        "fasta",
    )
)

In [4]:
print(len(frik_bacteria_proteins))
print(frik_bacteria_proteins[0])

5095
ID: WP_000002304.1
Name: WP_000002304.1
Description: WP_000002304.1 MULTISPECIES: alpha-D-ribose 1-methylphosphonate 5-phosphate C-P-lyase PhnJ [Enterobacteriaceae]
Number of features: 0
Seq('MANLSGYNFAYLDEQTKRMIRRAILKAVAIPGYQVPFGGREMPMPYGWGTGGIQ...KSQ')


In [5]:
def parse_gff(gff_file):
    """Collect the CDS features of a GFF file into a DataFrame.

    Lines starting with "#", lines with fewer than nine tab-separated fields,
    and features whose type (third field) is not "CDS" are skipped. Each
    remaining feature is identified by its ``protein_id`` attribute, falling
    back to its ``ID`` attribute; features with neither (or with an empty
    value) are dropped.

    Args:
        gff_file: Path of the GFF file to read.

    Returns:
        pandas.DataFrame: One row per retained CDS line with the columns
        ``protein_id``, ``contig``, ``start``, ``end`` and ``strand``. The
        columns are present even when no CDS line is retained, so callers can
        sort or filter on them unconditionally.

    Raises:
        ValueError: If the start or end field of a CDS line is not an integer.
    """
    columns = ["protein_id", "contig", "start", "end", "strand"]
    records = []
    with open(gff_file) as f:
        for line in f:
            if line.startswith("#"):
                continue
            parts = line.strip().split("\t")
            if len(parts) < 9 or parts[2] != "CDS":
                continue
            contig = parts[0]
            start = int(parts[3])
            end = int(parts[4])
            strand = parts[6]

            # Prefer protein_id; remember ID as a fallback while scanning.
            protein_id = None
            for attr in parts[8].split(";"):
                # Split on the first "=" only, so a value that itself contains
                # "=" is kept whole instead of being truncated.
                key, separator, value = attr.partition("=")
                if not separator:
                    continue
                if key == "protein_id":
                    protein_id = value
                    break
                if key == "ID":
                    protein_id = value

            if protein_id:
                records.append(
                    {
                        "protein_id": protein_id,
                        "contig": contig,
                        "start": start,
                        "end": end,
                        "strand": strand,
                    }
                )
    return pd.DataFrame(records, columns=columns)

In [6]:
gff_df = parse_gff("../dataset/demonstration_samples/GCF_000175755.1/genomic.gff")
gff_df = gff_df.sort_values(["contig", "start"]).reset_index(drop=True)

In [7]:
gff_df["contig"].nunique()

245

In [11]:
gff_df

Unnamed: 0,protein_id,contig,start,end,strand
0,WP_000300246.1,NZ_ACXO01000001.1,1,182,+
1,WP_000852126.1,NZ_ACXO01000001.1,299,1597,+
2,WP_001322343.1,NZ_ACXO01000001.1,1594,1917,-
3,WP_000949265.1,NZ_ACXO01000001.1,1963,3318,-
4,WP_000082974.1,NZ_ACXO01000001.1,3432,6092,-
...,...,...,...,...,...
5431,WP_000825639.1,NZ_ACXO01000247.1,118957,119178,+
5432,WP_000738579.1,NZ_ACXO01000247.1,119609,120634,+
5433,WP_000019652.1,NZ_ACXO01000247.1,120702,121883,+
5434,cds-MBRIFRIK2000_RS01000000130240,NZ_ACXO01000247.1,121893,122995,+


In [10]:
# For each column, report whether it is already in sorted order on its own
# (compared element-wise with its sorted copy).
for column in ("protein_id", "start", "end", "contig"):
    print((gff_df[column] == sorted(gff_df[column])).all())
# Report whether the whole frame is already ordered by (contig, start).
print(gff_df.equals(gff_df.sort_values(["contig", "start"])))

False
False
False
True
True


In [34]:
gff_df.to_csv(
    "../dataset/demonstration_samples/Escherichia_coli_O157_H7_str_FRIK2000/gff_df.csv",
    index=False,
)

# known segments

## generate samples

In [35]:
annotation = pd.read_parquet("../dataset/Phage_and_Satellites_Pann_Pcat_Pcol.pa")

In [36]:
# see each of what's length
# annotation_pici = annotation[annotation["what"] == "PICI"]
# annotation_cfpici = annotation[annotation["what"] == "CFPICI"]
# annotation_p4 = annotation[annotation["what"] == "P4"]
# annotation_phage = annotation[annotation["what"] == "phage"]

# see the average length of each acc in each what
# annotation_phage["acc"].value_counts().mean()
# print(f"phage: {annotation_phage['acc'].value_counts().describe()}")
# print(f"pici: {annotation_pici['acc'].value_counts().describe()}")
# print(f"cfpici: {annotation_cfpici['acc'].value_counts().describe()}")
# print(f"p4: {annotation_p4['acc'].value_counts().describe()}")

# conclusion: satellites have max length of 44

# Number of distinct gene names per phage accession.
phage_gene_counts = (
    annotation.loc[annotation["what"] == "phage"]
    .groupby("acc")["name"]
    .nunique()
    .reset_index(name="gene_num")
)

# Phages with 10 to 50 genes, both bounds included.
phage_small = phage_gene_counts[phage_gene_counts["gene_num"].between(10, 50)]
# Fixed-seed random sample of 200 of those phages.
phage_small_200 = phage_small.sample(n=200, random_state=42)

# All annotation rows of the sampled accessions, saved for the later cells.
is_sampled = annotation["acc"].isin(phage_small_200["acc"])
annotation_phage_small_200 = annotation[is_sampled]
annotation_phage_small_200.to_csv(
    "../dataset/demonstration_samples/known_segments/annotation_phage_small_200.csv",
    index=False,
)

In [None]:
# annotation = pd.read_parquet("../dataset/Phage_and_Satellites_Pann_Pcat_Pcol.pa")

# # id 200 random samples for each type
# pici_acc_200 = list(
#     np.random.choice(
#         annotation[annotation["what"] == "PICI"]["acc"].unique(), 200, replace=False
#     )
# )
# cf_acc_200 = list(
#     np.random.choice(
#         annotation[annotation["what"] == "CFPICI"]["acc"].unique(), 200, replace=False
#     )
# )
# p4_acc_200 = list(
#     np.random.choice(
#         annotation[annotation["what"] == "P4"]["acc"].unique(), 200, replace=False
#     )
# )
# phage_acc_200 = list(
#     np.random.choice(
#         annotation[annotation["what"] == "phage"]["acc"].unique(), 200, replace=False
#     )
# )
# all_acc_200 = pici_acc_200 + cf_acc_200 + p4_acc_200 + phage_acc_200

# # merge all the samples
# samples = annotation[annotation["acc"].isin(all_acc_200)]
# print(samples["acc"].nunique())

# samples.to_csv(
#     "../dataset/demonstration_samples/known_segments/annotation_200.csv", index=False
# )

## parse seqs

In [37]:
# Reload the sampled phage annotations.
samples = pd.read_csv(
    "../dataset/demonstration_samples/known_segments/annotation_phage_small_200.csv"
)
samples_protein_id = samples["name"].unique()
samples_acc = samples["acc"].unique()

# Distinct protein names, distinct accessions, and table shape.
print(len(samples_protein_id))
print(len(samples_acc))
print(samples.shape)

# One FASTA entry per row: ">" + name, then the translation on the next line.
with open(
    "../dataset/demonstration_samples/known_segments/proteins_phage_small_200.faa", "w"
) as f:
    for protein_name, translation in zip(samples["name"], samples["translation"]):
        f.write(f">{protein_name}\n{translation}\n")

6829
200
(6829, 10)


# sequence comparison

In [37]:
import pandas as pd
from Bio import SeqIO

In [41]:
annotation = pd.read_parquet("../dataset/Phage_and_Satellites_Pann_Pcat_Pcol.pa")

In [45]:
annotation[annotation["acc"].str.contains("STAU002_0321_00536_C001")]


Unnamed: 0,acc,name,start,end,strand,pann,pcat,pcol,translation,what
0,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0001,2,569,-1,terminase small subunit,head and packaging,,MSELTAKQARFVNEYIRTLNVTQSAIKAGYSANSAHVTGCRLLKKP...,PICI
1,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0002,565,907,-1,hypothetical protein,unknown function,,MDKKQIKDFVCDYHKRTISDVLIDDEINTDEFFSIGDENSNEWMAD...,PICI
2,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0003,909,1437,-1,spore coat protein,head and packaging,,MKLLKTKNCLYYRNGDNKLSEYQLLTQFNPAFINKKIKMCEFQIES...,PICI
3,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0004,1487,1706,-1,hypothetical protein,unknown function,,METKYELNNTKKVANAFGLNEADTNLLINAVDLDIKNNMQEISSEL...,PICI
4,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0005,1723,2302,-1,hypothetical protein,unknown function,,MKTESYFKEYNQFVIDQQKAIQELEQERNALESKIKIDKSTYKQLI...,PICI
5,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0006,2313,2655,-1,hypothetical protein,unknown function,,MQSIAEKETYHLPTEHLQVFNVIKNTSNKYITKTKILNQLGYEYNS...,PICI
6,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0007,3104,3746,-1,hypothetical protein,unknown function,,MNLETIVNQFETRAGTLLRYYTGLLEHSKVQPCCFKLYNDPFDMVY...,PICI
7,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0008,3742,4123,-1,hypothetical protein,unknown function,,MDKEQLKKYIYEYVKEYKEIPIYQLEDLFKEINHDYIGRTSITHDK...,PICI
8,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0009,4432,6142,-1,DNA primase/helicase,"DNA, RNA and nucleotide metabolism",,METGKSDVLDKIEKINKKDSALQEIIPKGYEIEHHQCGVALYQLIP...,PICI
9,STAU002_0321_00536_C001_PICI_TypeB_variant0002...,PGUTJQZF_CDS_0010,6155,7025,-1,DNA polymerase/primase,"DNA, RNA and nucleotide metabolism",,MNEIKLEYDTHVSVVHYESLDSRSFKSFSKPEWSKLVNKLSVPIEA...,PICI


In [4]:
pici_seqs = list(SeqIO.parse("../dataset/PICI_NucSequences_060723.fst", "fasta"))
len(pici_seqs)

1435

In [10]:
# Take the first sequence whose ID contains STAU002.0321.00536.C001, print its
# ID and length, and keep its sequence in target_seq.
# Nothing happens when no ID matches.
matching_record = next(
    (record for record in pici_seqs if "STAU002.0321.00536.C001" in record.id), None
)
if matching_record is not None:
    print(matching_record.id)
    target_seq = matching_record.seq
    print(len(target_seq))

STAU002.0321.00536.C001.PICI.TypeB.variant0002.Set2
10337


In [38]:
pici_metadata = pd.read_csv("../dataset/PICI_GenomicNucInfo_060723.csv")
# regex=False searches for the accession as a literal substring; by default
# str.contains treats the pattern as a regular expression, in which every "."
# of the accession would match any character.
pici_metadata[
    pici_metadata["PICI_ID"].str.contains("STAU002.0321.00536.C001", regex=False)
]

Unnamed: 0.1,Unnamed: 0,PICI_ID,Host_ID,Host_strain,Host_Ref,Genomic_region
1304,1304,STAU002.0321.00536.C001.PICI.TypeB.variant0002...,STAU002.0321.00536.C001,"Staphylococcus aureus subsp. aureus N315, comp...",GCF_000009645.1,"(2061935, 2072272)"


In [39]:
pici_metadata

Unnamed: 0.1,Unnamed: 0,PICI_ID,Host_ID,Host_strain,Host_Ref,Genomic_region
0,0,STAU002.0321.00050.C001.PICI.TypeB.variant0002...,STAU002.0321.00050.C001,Staphylococcus aureus strain HOU1444-VR chromo...,GCF_001278745.1,"(401962, 412834)"
1,1,POSP014.0321.00001.C001.PICI.TypeB.variant0001...,POSP014.0321.00001.C001,"Polaromonas sp. JS666, complete sequence.",GCF_000013865.1,"(3126147, 3132812)"
2,2,STAU002.0321.00250.C001.PICI.TypeB.variant0002...,STAU002.0321.00250.C001,Staphylococcus aureus subsp. aureus strain 80w...,GCF_003944865.1,"(814785, 826925)"
3,3,STAU002.0321.00374.C001.PICI.TypeB.variant0002...,STAU002.0321.00374.C001,Staphylococcus aureus strain UP_1150 chromosom...,GCF_009912535.1,"(2745923, 2756343)"
4,4,MYTU002.0321.00238.C001.PICI.TypeB.variant0004...,MYTU002.0321.00238.C001,Mycobacterium tuberculosis strain DKC2 isolate...,GCF_900520315.1,"(105183, 113045)"
...,...,...,...,...,...,...
1430,1430,STSA003.0321.00008.C001.PICI.TypeB.variant0002...,STSA003.0321.00008.C001,Staphylococcus saprophyticus strain UTI-056 ch...,GCF_013341435.1,"(2129, 11546)"
1431,1431,STSI001.0321.00003.C001.PICI.TypeB.variant0001...,STSI001.0321.00003.C001,"Staphylococcus simulans strain MR3, complete s...",GCF_003006055.1,"(1027781, 1037385)"
1432,1432,STSI001.0321.00004.C001.PICI.TypeB.variant0001...,STSI001.0321.00004.C001,"Staphylococcus simulans strain MR4, complete s...",GCF_003006075.1,"(1027781, 1037385)"
1433,1433,STSI001.0321.00005.C001.PICI.TypeB.variant0001...,STSI001.0321.00005.C001,"Staphylococcus simulans strain MR2, complete s...",GCF_003076375.1,"(1027882, 1037486)"


In [15]:
genome_seqs = list(
    SeqIO.parse(
        "../dataset/demonstration_samples/GCF_000009645.1/GCF_000009645.1_ASM964v1_genomic.fna",
        "fasta",
    )
)
len(genome_seqs[0].seq)

2814816

In [36]:
# CDS table of the host genome, restricted to contig NC_002745.2.
gff_df = pd.read_csv("../dataset/demonstration_samples/GCF_000009645.1/gff_df.csv")
on_chromosome = gff_df["contig"] == "NC_002745.2"
gff_df_chromosome = gff_df[on_chromosome]
# CDS whose start lies inside the PICI's Genomic_region (2061935, 2072272),
# both bounds included.
starts_in_region = gff_df_chromosome["start"].between(2061935, 2072272)
gff_df_pici = gff_df_chromosome[starts_in_region]
gff_df_pici

Unnamed: 0,id,contig,start,end,strand
1960,WP_001293088.1,NC_002745.2,2061935,2062504,-
1961,WP_000358774.1,NC_002745.2,2062501,2062842,-
1962,WP_000771368.1,NC_002745.2,2062845,2063372,-
1963,WP_000448770.1,NC_002745.2,2063423,2063641,-
1964,WP_000846280.1,NC_002745.2,2063659,2064237,-
1965,WP_001190615.1,NC_002745.2,2064249,2064590,-
1966,WP_001047698.1,NC_002745.2,2065040,2065681,-
1967,WP_000356942.1,NC_002745.2,2065678,2066058,-
1968,WP_000447473.1,NC_002745.2,2066368,2068077,-
1969,WP_001002689.1,NC_002745.2,2068091,2068960,-


In [22]:
gff_df["contig"].unique()

array(['NC_002745.2', 'NC_003140.1'], dtype=object)

In [29]:
gff_df[gff_df["contig"] == "NC_002745.2"]["end"].max()

np.int64(2814602)

In [17]:
# # add up all the lengths of contigs; where the total length is the largest end
# contig_length_sum = 0
# for contig in gff_df["contig"].unique():
#     contig_df = gff_df[gff_df["contig"] == contig]
#     contig_length = contig_df["end"].max()
#     contig_length_sum += contig_length
# print(contig_length_sum)


2839061
