# Imports

In [22]:
import pandas as pd
from pathlib import Path

In [23]:
# Root of the P2CS data tree; every directory below is derived from it.
base_dir = Path("/zdata/user-data/noam/data/p2cs/")

# Sub-directories that sit directly under the root.
full_genomes_dir = base_dir.joinpath("full_genomes_new")
organism_merged_genomes_dir = base_dir.joinpath("organism_merged_genomes_new")
genome_analysis_dir = base_dir.joinpath("genome_analysis_results")
mash_analysis_dir = base_dir.joinpath("mash_analysis")
taxonomy_dir = base_dir.joinpath("taxonomy")

# Genome-level distance matrices live inside the mash analysis directory.
data_dir = mash_analysis_dir.joinpath("distance_matrices")

# NOTE(review): presumably the k-mer sizes used by the mash analysis — confirm.
k_values = [11, 15, 19, 21]

# Load data

In [24]:
# Pickled table of filtered P2CS groups, stored under base_dir/merged_p2cs_data.
p2cs_filtered_groups_path = base_dir / "merged_p2cs_data" / "p2cs_filtered_groups.pkl"

# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
p2cs_filtered_groups = pd.read_pickle(p2cs_filtered_groups_path)

In [25]:
# Display the first row as a quick check of the loaded columns.
p2cs_filtered_groups.head(1)

Unnamed: 0,Gene,Start,Stop,Strand,Original description,class,type,P2CS description,tm,file_name,...,organism,Gene_num,frame,proximity_group,tcs_organization,tcs_organization_int,proximity_group_size,num_cognates,aa_sequence,nt_sequence
0,Asuc_0781,841792,842496,+,two component transcriptional regulator,RR,OmpR,"Response regulator, OmpR family contains 1 Res...",0,ActsuDB_Actinobacillus_succinogenes_130Z,...,Actinobacillus succinogenes 130Z,781,1,2,Pair,2.0,2,1.0,MTKILLVDDDIELTDLLGELLSLEGFDVVTAQNGLEALEKLDDGIC...,ATGACGAAAATTTTATTAGTCGATGACGATATCGAGTTAACTGATC...


# Create & Save FASTA Files

**Test amino-acid FASTA files (small random sample per class)**

In [36]:
# Columns whose values, joined by single spaces, form each FASTA header line.
header_cols = ['Gene', 'organism', 'type', 'proximity_group_size', 'num_cognates']
# Column holding the sequence written under each header.
data_col = 'aa_sequence'

# Per-class sample size, plus a fixed seed so that re-running this cell
# writes the same sequences every time.
sample_size = 200
sample_seed = 42

# Create the output directory up front so open() does not fail on a fresh data tree.
fasta_dir = base_dir / "merged_p2cs_data" / "fasta"
fasta_dir.mkdir(parents=True, exist_ok=True)

# Create fasta files for hk's and for rr's
for class_type in ['hk', 'rr']:
    # The 'class' column is matched against the upper-cased label ('HK' / 'RR').
    class_rows = p2cs_filtered_groups[p2cs_filtered_groups['class'] == class_type.upper()]
    # Cap the request at the rows available: sampling without replacement
    # raises ValueError when asked for more rows than exist.
    class_df = class_rows.sample(n=min(sample_size, len(class_rows)), random_state=sample_seed)

    fasta_path = fasta_dir / f"mini_sample_{class_type}_filtered_groups.faa"
    with open(fasta_path, 'w') as f:
        # Iterate plain tuples of (header fields..., sequence) rather than
        # building a Series per row with iterrows().
        records = class_df[header_cols + [data_col]].itertuples(index=False, name=None)
        for *header_values, sequence in records:
            header_str = " ".join(map(str, header_values))
            f.write(f">{header_str}\n{sequence}\n")

    print(f"Fasta file for {class_type} filtered groups saved to {fasta_path}")


Fasta file for hk filtered groups saved to /zdata/user-data/noam/data/p2cs/merged_p2cs_data/fasta/mini_sample_hk_filtered_groups.faa
Fasta file for rr filtered groups saved to /zdata/user-data/noam/data/p2cs/merged_p2cs_data/fasta/mini_sample_rr_filtered_groups.faa


**Amino acid fasta files**

In [35]:
# Columns whose values, joined by single spaces, form each FASTA header line.
header_cols = ['Gene', 'organism', 'type', 'proximity_group_size', 'num_cognates']
# Column holding the amino-acid sequence written under each header.
data_col = 'aa_sequence'

# Create the output directory up front so open() does not fail on a fresh data tree.
fasta_dir = base_dir / "merged_p2cs_data" / "fasta"
fasta_dir.mkdir(parents=True, exist_ok=True)

# Create fasta files for hk's and for rr's
for class_type in ['hk', 'rr']:
    # The 'class' column is matched against the upper-cased label ('HK' / 'RR').
    class_df = p2cs_filtered_groups[p2cs_filtered_groups['class'] == class_type.upper()]

    fasta_path = fasta_dir / f"{class_type}_filtered_groups.faa"
    with open(fasta_path, 'w') as f:
        # Iterate plain tuples of (header fields..., sequence) rather than
        # building a Series per row with iterrows().
        # NOTE(review): a missing value in data_col would be written as literal
        # text (e.g. "nan"); confirm the column has no nulls.
        records = class_df[header_cols + [data_col]].itertuples(index=False, name=None)
        for *header_values, sequence in records:
            header_str = " ".join(map(str, header_values))
            f.write(f">{header_str}\n{sequence}\n")

    print(f"Fasta file for {class_type} filtered groups saved to {fasta_path}")


Fasta file for hk filtered groups saved to /zdata/user-data/noam/data/p2cs/merged_p2cs_data/fasta/hk_filtered_groups.faa
Fasta file for rr filtered groups saved to /zdata/user-data/noam/data/p2cs/merged_p2cs_data/fasta/rr_filtered_groups.faa


**Nucleotide fasta files**

In [34]:
# Columns whose values, joined by single spaces, form each FASTA header line.
header_cols = ['Gene', 'organism', 'type', 'proximity_group_size', 'num_cognates']
# Column holding the nucleotide sequence written under each header.
data_col = 'nt_sequence'

# Create the output directory up front so open() does not fail on a fresh data tree.
fasta_dir = base_dir / "merged_p2cs_data" / "fasta"
fasta_dir.mkdir(parents=True, exist_ok=True)

# Create fasta files for hk's and for rr's
for class_type in ['hk', 'rr']:
    # The 'class' column is matched against the upper-cased label ('HK' / 'RR').
    class_df = p2cs_filtered_groups[p2cs_filtered_groups['class'] == class_type.upper()]

    fasta_path = fasta_dir / f"{class_type}_filtered_groups.fna"
    with open(fasta_path, 'w') as f:
        # Iterate plain tuples of (header fields..., sequence) rather than
        # building a Series per row with iterrows().
        # NOTE(review): a missing value in data_col would be written as literal
        # text (e.g. "nan"); confirm the column has no nulls.
        records = class_df[header_cols + [data_col]].itertuples(index=False, name=None)
        for *header_values, sequence in records:
            header_str = " ".join(map(str, header_values))
            f.write(f">{header_str}\n{sequence}\n")

    print(f"Fasta file for {class_type} filtered groups saved to {fasta_path}")


Fasta file for hk filtered groups saved to /zdata/user-data/noam/data/p2cs/merged_p2cs_data/fasta/hk_filtered_groups.fna
Fasta file for rr filtered groups saved to /zdata/user-data/noam/data/p2cs/merged_p2cs_data/fasta/rr_filtered_groups.fna
