# Imports

In [1]:
import numpy as np
import os
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import wandb
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm



In [2]:
# Root of the P2CS data tree. The default is the original location; set the
# P2CS_DATA_DIR environment variable to run the notebook against another copy
# of the data without editing this cell.
base_dir = Path(os.environ.get("P2CS_DATA_DIR", "/zdata/user-data/noam/data/p2cs/"))

# Merged P2CS tables (CSV / pickle) read by the loading cells below.
merged_p2cs_data_dir = base_dir / "merged_p2cs_data"

# Mean-embedding arrays (.npy) and the combined data+embedding pickles.
embeddings_dir = base_dir / "embeddings" / "esm3-medium" / "all_embeddings"

# Load, Merge, and Save Dataframe

## Protein Pairs

In [None]:
# Read the embedding array first, then the table of protein ids it is attached to.
protein_embeddings = np.load(
    embeddings_dir / "p2cs_filtered_groups_mean_embeddings_esm3-medium-2024-08.npy"
)

# Attach the embeddings as one Python list per row in a new 'embeddings' column.
# pandas requires the number of embedding rows to equal the number of table rows.
protein_data_df = pd.read_csv(
    merged_p2cs_data_dir / "_p2cs_filtered_groups.csv"
).assign(embeddings=protein_embeddings.tolist())

# Disabled: per-class exports with columns 'Gene' and 'embeddings'.
# NOTE(review): `data_path` is not defined in the notebook cells shown here;
# define it (or switch to one of the Path variables above) before re-enabling.
# # save HK dataset with columns: 'Gene', 'embeddings'
# hk_dataset = protein_data_df.loc[protein_data_df['class'] == 'HK', ['Gene', 'embeddings']]
# hk_dataset.to_pickle(os.path.join(data_path, "p2cs_hk_embeddings.pkl"))
# # save RR dataset with columns: 'Gene', 'embeddings'
# rr_dataset = protein_data_df.loc[protein_data_df['class'] == 'RR', ['Gene', 'embeddings']]
# rr_dataset.to_pickle(os.path.join(data_path, "p2cs_rr_embeddings.pkl"))

In [7]:
# Flag every row whose 'Gene' value occurs more than once (keep=False marks all
# members of a repeated set, so the reported count includes the rows that stay),
# then keep only the first occurrence of each gene.
gene_is_repeated = protein_data_df.duplicated(subset=['Gene'], keep=False)
duplicates = protein_data_df[gene_is_repeated]

if duplicates.empty:
    print("No duplicate rows found in the dataframe.")
else:
    print(f"Found {len(duplicates)} duplicate rows in the dataframe. Removing duplicates...")
    protein_data_df = protein_data_df.drop_duplicates(subset=['Gene'], keep='first')

Found 2 duplicate rows in the dataframe. Removing duplicates...


In [11]:
# Create the paired HK-RR embeddings dataset: within every proximity group,
# each row of class 'HK' is paired with each row of class 'RR'.
paired_data = []

# groupby yields one (key, sub-frame) per distinct proximity_group value;
# ngroups gives the progress-bar total and the count reported at the end.
groups_by_proximity = protein_data_df.groupby('proximity_group')
n_groups = groups_by_proximity.ngroups

for group_id, group_df in tqdm(groups_by_proximity, total=n_groups, desc="Pairing groups"):
    # Get HK and RR rows in this group
    hk_rows = group_df[group_df['class'] == 'HK']
    rr_rows = group_df[group_df['class'] == 'RR']

    # Create all possible HK-RR pairs within this group; a group that lacks
    # either class contributes no pairs.
    for _, hk_row in hk_rows.iterrows():
        for _, rr_row in rr_rows.iterrows():
            paired_data.append({
                'proximity_group': group_id,
                # The organism is taken from the HK row of the pair.
                'organism': hk_row['organism'],
                'hk_gene': hk_row['Gene'],
                'rr_gene': rr_row['Gene'],
                'hk_embedding': hk_row['embeddings'],
                'rr_embedding': rr_row['embeddings']
            })

# Convert to DataFrame (one row per HK-RR pair)
paired_df = pd.DataFrame(paired_data)

# Save the paired dataset.
# NOTE(review): this previously joined onto `data_path`, a name that is never
# defined in the visible notebook cells, so a fresh-kernel run failed with
# NameError only after the whole pairing loop had finished. `embeddings_dir`
# is used here to match where the orphan data+embeddings pickle is written;
# confirm this is the intended destination.
paired_df.to_pickle(embeddings_dir / "p2cs_hk_rr_pairs_esm3-medium-2024-08.pkl")

print(f"Created {len(paired_df)} HK-RR pairs from {n_groups} proximity groups")
print(f"Columns: {paired_df.columns.tolist()}")

Pairing groups: 100%|██████████| 40177/40177 [00:48<00:00, 829.19it/s] 


Created 42167 HK-RR pairs from 40177 proximity groups
Columns: ['proximity_group', 'organism', 'hk_gene', 'rr_gene', 'hk_embedding', 'rr_embedding']


In [12]:
# Preview the first six HK-RR pairs.
paired_df.head(n=6)

Unnamed: 0,proximity_group,organism,hk_gene,rr_gene,hk_embedding,rr_embedding
0,2,Actinobacillus succinogenes 130Z,Asuc_0782,Asuc_0781,"[0.0011588528286665678, 0.01809774339199066, -...","[0.007380485534667969, 0.094380684196949, -0.1..."
1,5,Actinobacillus succinogenes 130Z,Asuc_1364,Asuc_1363,"[0.021759066730737686, 0.012713533826172352, -...","[0.020951153710484505, 0.0931614488363266, -0...."
2,6,Actinobacillus succinogenes 130Z,Asuc_1720,Asuc_1721,"[-0.001734129968099296, -0.011823820881545544,...","[-0.008336775936186314, 0.03139081597328186, -..."
3,7,Burkholderia ambifaria MC40-6,BamMC406_0032,BamMC406_0031,"[0.06412381678819656, 0.0014582494040951133, -...","[-0.0009646646794863045, 0.10316631942987442, ..."
4,10,Burkholderia ambifaria MC40-6,BamMC406_0182,BamMC406_0181,"[0.08152365684509277, -0.013532615266740322, -...","[0.027939323335886, 0.09411260485649109, -0.08..."
5,14,Burkholderia ambifaria MC40-6,BamMC406_1195,BamMC406_1194,"[0.16415660083293915, 0.09197816997766495, -0....","[-0.05334537848830223, 0.07757166028022766, -0..."


In [13]:
def _hk_rr_dot(pair_row):
    """Return the dot product of one pair's HK and RR embeddings."""
    return np.dot(pair_row['hk_embedding'], pair_row['rr_embedding'])


# One dot product per pair (row-wise apply), then the mean over all pairs.
# The embeddings are used as stored; nothing here normalises them.
dot_products = paired_df.apply(_hk_rr_dot, axis=1)
average_dot_product = dot_products.mean()
print(f"Average dot product: {average_dot_product}")


Average dot product: 750.5228609429898


## Orphans

In [None]:
# Embedding array for the orphan table; attached below as one list per row.
orphan_embeddings = np.load(embeddings_dir / "p2cs_orphans_mean_embeddings_esm3-medium-2024-08.npy")

# Load the orphan table, attach the embeddings, drop rows that repeat the same
# (Gene, organism, nt_sequence) combination, and renumber the index from 0.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
orphans_df = (
    pd.read_pickle(merged_p2cs_data_dir / "_p2cs_orphan_data.pkl")
    .assign(embeddings=orphan_embeddings.tolist())
    .drop_duplicates(subset=['Gene', 'organism', 'nt_sequence'])
    .reset_index(drop=True)
)

# Persist the combined table + embeddings next to the embedding arrays.
orphans_df.to_pickle(embeddings_dir / "p2cs_orphans_data_and_mean_embeddings_esm3-medium-2024-08.pkl")

In [19]:
# Display the de-duplicated orphan table with its 'embeddings' column
# (the rendered output is truncated to the leading and trailing rows).
orphans_df

Unnamed: 0,Gene,Start,Stop,Strand,Original description,class,type,P2CS description,tm,file_name,...,organism,Gene_num,frame,aa_sequence,nt_sequence,proximity_group,tcs_organization,tcs_organization_int,proximity_group_size,embeddings
0,Asuc_0162,191189,192874,-,hypothetical protein,HK,Classic,"Histidine kinase, Classic contains 1 HAMP,1 Hi...",2,ActsuDB_Actinobacillus_succinogenes_130Z,...,Actinobacillus succinogenes 130Z,162,-1,VNVKKSVTTRIARYLITVIIFAGIITTFALGIMVSNRSDAEQINVS...,GTGAACGTTAAAAAATCGGTGACTACGCGCATTGCCCGCTATCTGA...,0,,,,"[0.0660170167684555, 0.019658083096146584, -0...."
1,Asuc_0362,412438,413067,+,two component LuxR family transcriptional ...,RR,NarL,"Response regulator, NarL family contains 1 Res...",0,ActsuDB_Actinobacillus_succinogenes_130Z,...,Actinobacillus succinogenes 130Z,362,1,MTEKTKVLLIDDHPLMRRGIKQLIELDEIFEVVGDAGNGNDGISLA...,ATGACAGAAAAAACGAAAGTTCTATTAATTGATGATCATCCGTTAA...,1,,,,"[0.006351524963974953, 0.1104298084974289, -0...."
2,Asuc_0860,924497,926332,+,hypothetical protein,HK,Unorthodox,"Histidine kinase, Unorthodox contains 1 HisKA,...",3,ActsuDB_Actinobacillus_succinogenes_130Z,...,Actinobacillus succinogenes 130Z,860,2,MKNVRHFTQRYIDWVIKLGRIKFSVLGFLVLAAFALLTHIILSFIV...,ATGAAAAACGTCCGACATTTTACACAACGTTATATTGACTGGGTAA...,3,,,,"[-0.0009081686730496585, -0.04025004431605339,..."
3,Asuc_0879,941325,942032,-,two-component response regulator,RR,OmpR,"Response regulator, OmpR family contains 1 Res...",0,ActsuDB_Actinobacillus_succinogenes_130Z,...,Actinobacillus succinogenes 130Z,879,-2,MTTPHILVVEDETITRNTLKSIFEAEGYHVFEATDGAQMHRVLAEH...,ATGACAACGCCTCATATTCTTGTGGTTGAAGACGAAACTATTACGC...,4,,,,"[-0.011519435793161392, 0.07895831018686295, -..."
4,BamMC406_0073,83160,84644,-,integral membrane sensor signal transduction ...,HK,Classic,"Histidine kinase, Classic contains 1 2CSK_N,1 ...",2,Bura4DB_Burkholderia_ambifaria_MC40-6,...,Burkholderia ambifaria MC40-6,73,-2,MSSDPAVTTSLRRSLLRRLAAPLSMLALMSGLIAYWLAWQYTQHVI...,ATGTCTTCTGATCCGGCTGTCACCACCAGCCTGCGCCGCTCGCTGC...,8,,,,"[0.03134315088391304, 0.0006745323189534247, -..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66180,Glov_1117,1170338,1172650,+,integral membrane sensor hybrid histidine ...,HK,Hybrid,"Histidine kinase, Hybrid contains 1 HAMP,1 His...",3,GeoloDB_Geobacter_lovleyi_SZ,...,Geobacter lovleyi SZ,1117,2,MKNTDMHSNQLVPEEQQQGLSGRLAALVPLTFRGRAMLFLFPMIVI...,ATGAAAAATACTGATATGCATAGCAACCAGTTGGTGCCGGAGGAAC...,103530,Orphan,1.0,2.0,"[-0.006636046338826418, 0.00786154717206955, -..."
66181,Glov_1118,1172647,1174005,+,response regulator receiver sensor signal ...,HK,Hybrid,"Histidine kinase, Hybrid contains 1 Response_r...",0,GeoloDB_Geobacter_lovleyi_SZ,...,Geobacter lovleyi SZ,1118,1,MISEGQQVTSSGERRPVILVVDDDANNLAVVRDCLVAFNYTILVAE...,ATGATTTCTGAGGGGCAGCAGGTGACTTCATCCGGTGAACGGCGGC...,103530,Orphan,1.0,2.0,"[0.028614362701773643, 0.07497210800647736, -0..."
66182,Glov_1548,1635208,1637997,+,Hpt sensor hybrid histidine kinase,HK,Unorthodox,"Histidine kinase, Unorthodox contains 1 HisKA,...",1,GeoloDB_Geobacter_lovleyi_SZ,...,Geobacter lovleyi SZ,1548,1,MQATSGAARASRLLFLLLGLFVVGFVALAAVNFGIGSLMDELEQRG...,ATGCAGGCAACTTCAGGGGCTGCACGAGCCTCCCGTTTGCTGTTCT...,103539,Orphan,1.0,3.0,"[-0.018043627962470055, 0.006508492399007082, ..."
66183,Glov_1549,1637994,1641593,+,multi-sensor hybrid histidine kinase,HK,Unorthodox,"Histidine kinase, Unorthodox contains 1 PAS_4,...",5,GeoloDB_Geobacter_lovleyi_SZ,...,Geobacter lovleyi SZ,1549,3,MTGQRLTALRSLMTGLLFGLAGAVLNWFKLELFFNVDFLFGSIATM...,ATGACTGGTCAACGGTTGACTGCACTGCGTAGCCTCATGACAGGCC...,103539,Orphan,1.0,3.0,"[0.015644753351807594, -0.004670244175940752, ..."
