In [1]:
import os
import pandas as pd
import numpy as np
import re

# Function to parse a line with PDB codes and chains
def parse_pdb_line(line):
    parts = line.split()
    try:
        code_1, code_2 = parts[1], parts[4]
        chain_1, chain_2 = 'Unknown', 'Unknown'  # Default values
        if '_' in code_1:
            chain_1 = code_1.split('_')[1].replace('.pdb', '')
            code_1 = code_1.split('_')[0]
        if '_' in code_2:
            chain_2 = code_2.split('_')[1].replace('.pdb', '')
            code_2 = code_2.split('_')[0]
        return code_1, chain_1, code_2, chain_2
    except IndexError as e:
        print(f"Error processing line: {line}")
        raise e

# Function to extract values from the line with P-value, Afp-num, etc.
def parse_values_line(line):
    p_value = float(re.search(r'P-value (\S+)', line).group(1))
    afp_num = int(re.search(r'Afp-num (\d+)', line).group(1))
    identity = float(re.search(r'Identity (\S+%)', line).group(1).strip('%'))
    similarity = float(re.search(r'Similarity (\S+%)', line).group(1).strip('%'))
    return p_value, afp_num, identity, similarity

# Read the file and process it
data = []
with open('../data/allpair.aln', 'r') as file:
    for line in file:
        if line.startswith('Align'):
            #print(line)
            code_1, chain_1, code_2, chain_2 = parse_pdb_line(line)
        if 'P-value' in line:
            p_value, afp_num, identity, similarity = parse_values_line(line)
            data.append([code_1, chain_1, code_2, chain_2, p_value, afp_num, identity, similarity])

# Create a DataFrame
df = pd.DataFrame(data, columns=['code_1', 'chain_1', 'code_2', 'chain_2', 'P-value', 'Afp-num', 'Identity (%)', 'Similarity (%)'])

# Display the DataFrame
print(df)

      code_1 chain_1 code_2 chain_2  P-value  Afp-num  Identity (%)  \
0       1A0F       A   1A23       A   0.1720    13385          4.48   
1       1A0F       A   1A43       A   0.1640     6081          1.09   
2       1A0F       A   1A5E       A   0.5270    10403          7.25   
3       1A0F       A   1A7V       A   0.0110    11388          7.08   
4       1A0F       A   1AAR       A   0.8280     3669         13.11   
...      ...     ...    ...     ...      ...      ...           ...   
54278   6G4B       A   6TQ3       A   0.3680    35101          4.04   
54279   6G4B       A   8TIM       A   0.2710    35907          5.88   
54280   6JHM       A   6TQ3       A   0.2810    41764          3.87   
54281   6JHM       A   8TIM       A   0.6040    43159          3.34   
54282   6TQ3       A   8TIM       A   0.0586    19171          4.71   

       Similarity (%)  
0               10.45  
1                6.52  
2               13.99  
3               18.58  
4               29.51  
...

In [2]:
fireprot = list(pd.read_csv('../data/fireprot_mapped_preds.csv')['code'].unique())
s461 = list(pd.read_csv('../data/s461_mapped_preds.csv')['code'].unique())
s669 = list(pd.read_csv('../data/s669_mapped_preds.csv')['code'].unique())
q3421 = list(pd.read_csv('../data/Q3421_mapped_preds.csv')['code'].unique())
ssym = list(pd.read_csv('../data/Ssym_mapped_preds.csv')['wt_code'].unique())

  fireprot = list(pd.read_csv('../data/fireprot_mapped_preds.csv')['code'].unique())


In [3]:
datasets = ['fireprot', 's461', 's669', 'Q3421', 'Ssym'] #'s669', 
df['datasets_1'] = [[] for _ in range(len(df))]
df['datasets_2'] = [[] for _ in range(len(df))]

# Iterate over each dataset and update the DataFrame
for name, codes in zip(datasets, [fireprot, s461, s669, q3421, ssym]): #s669,
    for i in df.index:
        if df.at[i, 'code_1'] in codes:
            df.at[i, 'datasets_1'].append(name)
        if df.at[i, 'code_2'] in codes:
            df.at[i, 'datasets_2'].append(name)

df['datasets_1'] = df['datasets_1'].astype(str)
df['datasets_2'] = df['datasets_2'].astype(str)
df = df.loc[(df['datasets_1'].astype(str)!='[]') & (df['datasets_2'].astype(str)!='[]')]
#df.sort_values('Similarity (%)', ascending=False).head(50)

df['code_1'] = df['code_1'] + '_' + df['chain_1']
df['code_2'] = df['code_2'] + '_' + df['chain_2']
df

Unnamed: 0,code_1,chain_1,code_2,chain_2,P-value,Afp-num,Identity (%),Similarity (%),datasets_1,datasets_2
0,1A0F_A,A,1A23_A,A,0.1720,13385,4.48,10.45,"['s461', 's669']","['fireprot', 'Q3421']"
1,1A0F_A,A,1A43_A,A,0.1640,6081,1.09,6.52,"['s461', 's669']","['fireprot', 'Q3421']"
2,1A0F_A,A,1A5E_A,A,0.5270,10403,7.25,13.99,"['s461', 's669']","['fireprot', 'Q3421']"
3,1A0F_A,A,1A7V_A,A,0.0110,11388,7.08,18.58,"['s461', 's669']",['s669']
4,1A0F_A,A,1AAR_A,A,0.8280,3669,13.11,29.51,"['s461', 's669']","['fireprot', 'Q3421']"
...,...,...,...,...,...,...,...,...,...,...
54278,6G4B_A,A,6TQ3_A,A,0.3680,35101,4.04,12.11,['fireprot'],['fireprot']
54279,6G4B_A,A,8TIM_A,A,0.2710,35907,5.88,16.08,['fireprot'],['fireprot']
54280,6JHM_A,A,6TQ3_A,A,0.2810,41764,3.87,9.82,['fireprot'],['fireprot']
54281,6JHM_A,A,8TIM_A,A,0.6040,43159,3.34,7.90,['fireprot'],['fireprot']


In [4]:
from collections import defaultdict

def find_cluster(protein, assigned_clusters, threshold=0.01):
    for cluster in assigned_clusters:
        if all(similarity_matrix.at[protein, member] <= threshold for member in cluster):
            return cluster
    return None

for name, codes in zip(datasets, [fireprot, s461, s669, q3421, ssym]):
    df_cur = df.copy(deep=True).loc[df['datasets_1'].astype(str).str.contains(name)]
    df_cur = df_cur.loc[df['datasets_2'].astype(str).str.contains(name)]
    #df_cur = df_cur.loc[df['Similarity (%)']>50]
    # Create a list of all unique codes
    all_codes = set(df_cur['code_1']).union(set(df_cur['code_2']))

    # Pivot to create a similarity matrix
    similarity_matrix = df_cur.pivot(index='code_1', columns='code_2', values='P-value')

    # Reindex the DataFrame to include all codes in both rows and columns
    similarity_matrix = similarity_matrix.reindex(index=all_codes, columns=all_codes)

    # Fill NaN values with 0 and make the matrix symmetric
    similarity_matrix = similarity_matrix.fillna(0)
    similarity_matrix = similarity_matrix + similarity_matrix.T - similarity_matrix.multiply(similarity_matrix.T.gt(0))

    # Assign proteins to clusters
    clusters = defaultdict(list)
    for protein in similarity_matrix.index:
        cluster = find_cluster(protein, clusters.values())
        if cluster is not None:
            cluster.append(protein)
        else:
            clusters[len(clusters)].append(protein)

    # Convert the clusters dictionary to a list for better readability
    cluster_list = list(clusters.values())

    print(name, "protein clusters based on similarity:")
    print(len(cluster_list))
    print(cluster_list)

    data = pd.read_csv(f'../data/{name}_mapped_preds.csv', index_col=0)
    if name == 's461':
        data['code'] = data.index.str[:4]
        
    data['cluster'] = 0
    i = 0
    for clus in cluster_list:
        i += 1
        for code_ in clus:
            code = code_[:4]
            chain = code_[-1]
            data.loc[(data['code']==code)&(data['chain']==chain), 'cluster'] = i
    data.to_csv(f'../data/{name}_mapped_preds_clusters.csv')
            

fireprot protein clusters based on similarity:
122
[['1IET_A', '1B5M_A', '1CYO_A'], ['1LRP_A', '1BVC_A', '1AJ3_A'], ['1LS4_A', '1ROP_A', '1HME_A', '1YYX_A'], ['1ADO_A', '1LUC_A', '1I4N_A', '1BTM_A', '1WQ5_A'], ['1IO2_A'], ['1TIN_A', '2CI2_I', '1ACB_I'], ['1RTB_A', '1E21_A', '1ONC_A'], ['1BCX_A', '1OLR_A', '1H8V_A', '3WP4_A'], ['2TRX_A', '1H7M_A', '1QGV_A'], ['3PG0_A', '1IOB_A', '2AFG_A', '1BFG_A'], ['1HYN_P'], ['1DPM_A', '2O9P_A', '1TPE_A', '1TUX_A', '2WSY_A'], ['1LVE_A', '1TEN_A', '2IMM_A', '1WIT_A', '1TIT_A', '1HNG_A'], ['1YCC_A', '1C52_A', '451C_A', '1YEA_A', '1AKK_A', '1YNR_A'], ['2CBR_A', '1IFB_A', '2HMB_A', '1QJP_A', '1RBP_A', '1B0O_A'], ['1KCQ_A'], ['2CHF_A', '1GUY_C', '6TQ3_A'], ['1SHF_A', '1FMK_A'], ['1CLW_A'], ['1AYE_A'], ['1AQH_A'], ['1KA6_A'], ['1RRO_A', '1IGV_A', '1UWO_A', '1RTP_1'], ['1CYC_A', '1C2R_A'], ['1RIS_A', '1HZ6_A', '2HPR_A', '1APS_A'], ['3SSI_A'], ['1ARR_A', '1P2P_A', '1BP2_A'], ['1EL1_A', '1HFZ_A', '1HFY_A', '4LYZ_A', '1LZ1_A'], ['1BAH_A'], ['1SCE_A', '1DKT_A']

  data = pd.read_csv(f'../data/{name}_mapped_preds.csv', index_col=0)


s461 protein clusters based on similarity:
38
[['1DIV_A', '1BFM_A', '2ZTA_A'], ['3DV0_I'], ['1O6X_A'], ['1BNL_A'], ['2C9Q_A'], ['1BA3_A'], ['1JL9_A'], ['1G3P_A'], ['1R2Y_A'], ['2N7Z_A'], ['2NTE_A'], ['5OAQ_A', '3L15_A'], ['1XXN_A'], ['5JXB_A'], ['1FT8_A'], ['1LVM_A'], ['1GUA_B', '1FRD_A', '1FXA_A'], ['1H0X_A'], ['1J8I_A'], ['2WQG_A'], ['2PTL_A'], ['1L6H_A'], ['1JLV_A', '1A0F_A'], ['2LTB_A'], ['1ITM_A', '3D3B_A'], ['2HBB_A'], ['2ARF_A'], ['1N88_A'], ['3C2I_A'], ['1IV7_A', '3MON_B', '1IV9_A'], ['4BUQ_A'], ['2M5S_A'], ['4HE7_A'], ['3BN0_A'], ['3S4M_A', '1EKG_A'], ['2H3F_A'], ['1NM1_A'], ['1IOJ_A']]
s669 protein clusters based on similarity:
63
[['1OSI_A'], ['1BNL_A'], ['1JL9_A'], ['4YEE_A', '4YEF_A'], ['1R2Y_A'], ['2N7Z_A', '2MPC_A'], ['1HCQ_A', '1GLU_A'], ['2NTE_A'], ['5VP3_A', '3S92_A', '2DVV_A'], ['1FH5_H', '2CLR_B'], ['5JXB_A', '1D5G_A', '3K82_A'], ['1FT8_A'], ['1LVM_A'], ['1BFM_A', '3O39_A', '2ZTA_A'], ['1PFL_A'], ['1J8I_A'], ['1PRG_A', '3D3B_A', '1A7V_A'], ['2PTL_A', '2BJD_A'], ['1L

In [5]:
for name1 in datasets:
    for name2 in datasets:
        if name1 != name2:
            ds1 = pd.read_csv(f'../data/{name1}_mapped_preds_clusters.csv', index_col=0)
            ds2 = pd.read_csv(f'../data/{name2}_mapped_preds_clusters.csv', index_col=0)
            overlap = df.loc[((df['datasets_1'].str.contains(name1)) & (df['datasets_2'].str.contains(name2)) | (df['datasets_1'].str.contains(name2)) & (df['datasets_2'].str.contains(name1))) & (df['Identity (%)']>25)]
            overlapping_codes = list(overlap['code_1'].str[:4].unique()) + list(overlap['code_2'].str[:4].unique())
            overlapping_codes += list(set(ds1['code'].unique()).intersection(set(ds2['code'].unique())))
            ds1[f'{name2}_cluster'] = False
            ds2[f'{name1}_cluster'] = False
            ds1.loc[ds1['code'].isin(overlapping_codes), f'{name2}_cluster'] = True
            ds2.loc[ds2['code'].isin(overlapping_codes), f'{name1}_cluster'] = True
            ds1.to_csv(f'../data/{name1}_mapped_preds_clusters.csv')
            ds2.to_csv(f'../data/{name2}_mapped_preds_clusters.csv')

  ds1 = pd.read_csv(f'../data/{name1}_mapped_preds_clusters.csv', index_col=0)
  ds1 = pd.read_csv(f'../data/{name1}_mapped_preds_clusters.csv', index_col=0)
  ds1 = pd.read_csv(f'../data/{name1}_mapped_preds_clusters.csv', index_col=0)
  ds1 = pd.read_csv(f'../data/{name1}_mapped_preds_clusters.csv', index_col=0)
  ds2 = pd.read_csv(f'../data/{name2}_mapped_preds_clusters.csv', index_col=0)
  ds2 = pd.read_csv(f'../data/{name2}_mapped_preds_clusters.csv', index_col=0)
  ds2 = pd.read_csv(f'../data/{name2}_mapped_preds_clusters.csv', index_col=0)
  ds2 = pd.read_csv(f'../data/{name2}_mapped_preds_clusters.csv', index_col=0)


In [6]:
data = pd.read_csv('../data/all_seq_ids.tsv', sep='\t', header=None)
data = data.iloc[:, :3]
data.columns = ['source', 'target', 'identity']
data = data.loc[data['identity']>0.25]
data = data.loc[data['source']!=data['target']]
all_codes = set(data['source']).union(set(data['target']))

tmp = pd.read_csv(f'../data/Q3421_mapped.csv'.replace('.csv', '_preds_clusters.csv'), index_col=0)
tmp = tmp[[c for c in tmp.columns if not 'overlaps' in c]]
tmp.to_csv(f'../data/Q3421_mapped.csv'.replace('.csv', '_preds_clusters.csv'))

tmp = pd.read_csv(f'../data/fireprot_mapped.csv'.replace('.csv', '_preds_clusters.csv'), index_col=0)
tmp = tmp[[c for c in tmp.columns if not 'overlaps' in c]]
tmp.to_csv(f'../data/fireprot_mapped.csv'.replace('.csv', '_preds_clusters.csv'))

id_table = pd.DataFrame()
homo_table = pd.DataFrame()

for file1 in ['korpm_mapped.csv', 'rosetta_mapped.csv', 'tsuboyama_mapped.csv', 'fireprot_mapped.csv', 'Q3421_mapped.csv']: #'fireprot_mapped.csv', 
    df_train = pd.read_csv(f'../data/{file1}', index_col=0)
    if file1 == 'fireprot_mapped.csv':
        df_train['position'] = df_train['position'].fillna(-100000).astype(int)
        df_train['uid2'] = df_train['code'] + '_' + df_train['position'].astype(str) + df_train['mutation']
        df_train = df_train.reset_index()
        df_train = df_train.groupby('uid2').first()
    train_codes = set(df_train['code'])
    name1 = file1.split('_')[0]
    for file2 in ['fireprot_mapped.csv', 'Q3421_mapped.csv', 's461_mapped_preds.csv', 'Ssym_mapped.csv']:
        c = 'code' if not file2 == 'Ssym_mapped.csv' else 'wt_code'
        name2 = file2.split('_')[0]
        if file1 != file2:
            overlap = set()
            df_test = pd.read_csv(f'../data/{file2}', index_col=0)
            if file2 == 'fireprot_mapped.csv':
                df_test['position'] = df_test['position'].fillna(-100000).astype(int)
                df_test['uid2'] = df_test['code'] + '_' + df_test['position'].astype(str) + df_test['mutation']
                df_test = df_test.reset_index()
                df_test = df_test.groupby('uid2').first()
            #print(len(df_train), len(df_test))
            id_table.at[name1, name2] = len(df_train.join(df_test[[]], how='inner'))
            test_codes = set(df_test['code'])
            #cc_test = df_test.loc[df_test['code'].isin(overlap_codes)]
            for code in train_codes:
                if code in test_codes:
                    overlap.add(code)
                else:
                    odf = data.loc[(data['source']==code)|(data['target']==code)]
                    odf = odf.loc[data['source'].isin(test_codes)|data['target'].isin(test_codes)]
                    #print(odf)
                    if len(odf) > 0:
                        overlap.add(code)
            #print(name1, name2, overlap)
            homo_table.at[name1, name2] = len(df_test.loc[df_test[c].isin(overlap)])
            #print(len(df_test.loc[df_test['code'].isin(overlap)]))

            if name1 in ['fireprot', 'Q3421']:
            #if False:
                df_train[f'overlaps_{name2}'] = False
                df_train.loc[df_train['code'].isin(overlap), f'overlaps_{name2}'] = True
                tmp = pd.read_csv(f'../data/{file1}'.replace('.csv', '_preds_clusters.csv'), index_col=0)
                if name1 == 'fireprot':
                    df_train = df_train.reset_index().set_index('uid')
                tmp = tmp.join(df_train[[f'overlaps_{name2}']])

                tmp.to_csv(f'../data/{file1}'.replace('.csv', '_preds_clusters.csv'))

FileNotFoundError: [Errno 2] No such file or directory: '../data/all_seq_ids.tsv'

In [None]:
from Bio import pairwise2
from Bio.Seq import Seq

# Example sequences - replace these with your actual sequences
seq1 = Seq('GDVAKGKKTFVQKCAQCHTVENGGKHKVGPNLWGLFGRKTGQAEGYSYTDANKSKGIVWNENTLMEYLENPKKYIPGTKMIFAGIKKKGERQDLVAYLKSATS')
seq2 = Seq('AKESTGFKPGSAKKGATLFKTRCQQCHTIEEGGPNKVGPNLHGIFGRHSGQVKGYSYTDANINKNVKWDEDSMSEYLTNPKKYIPGTKMAFAGLKKEKDRNDLITYMTKAAK')

# Perform a global alignment
alignments = pairwise2.align.globalms(seq1, seq2, 2, 0.5, -0.5, -0.1)

# Take the first alignment
alignment = alignments[0]
aligned_seq1, aligned_seq2, score, begin, end = alignment

# Calculate percent identity
matches = sum(res1 == res2 for res1, res2 in zip(aligned_seq1, aligned_seq2))
percent_identity = (matches / len(aligned_seq1)) * 100

# Print the alignment and percent identity
print("Alignment:")
print(aligned_seq1)
print(aligned_seq2)
print(f"Percent Identity: {percent_identity:.2f}%")

CYC = 'GDVAKGKKTFVQKCAQCHTVENGGKHKVGPNLWGLFGRKTGQAEGYSYTDANKSKGIVWNENTLMEYLENPKKYIPGTKMIFAGIKKKGERQDLVAYLKSATS'
YEA = 'AKESTGFKPGSAKKGATLFKTRCQQCHTIEEGGPNKVGPNLHGIFGRHSGQVKGYSYTDANINKNVKWDEDSMSEYLTNPKKYIPGTKMAFAGLKKEKDRNDLITYMTKAAK'

Alignment:
---GDVAK-G--KK--T-FVQKCAQCHTVENGGKHKVGPNLWGLFGRKTGQAEGYSYTDANKSKGIV-WNENTLM-EYLENPKKYIPGTKMIFAGIKKKGERQDLVAYLKSATS
AKESTGFKPGSAKKGATLFKTRCQQCHTIEEGGPNKVGPNLHGIFGRHSGQVKGYSYTDANINK-NVKWDE-DSMSEYLTNPKKYIPGTKMAFAGLKKEKDRNDLITYMTKAAK
Percent Identity: 55.26%




In [None]:
    remap_direction = {'dir': 'Direct Mutations', 'inv': 'Inverse Mutations', 'combined': 'Both Directions'}
    remap_names = {'esmif_monomer': 'ESM-IF(M)', 'esmif_multimer': 'ESM-IF',
                   'mpnn_mean': 'ProteinMPNN_mean', 'msa_transformer_mean': 'MSA-T_mean', 'tranception': 'Tranception', 'esm1v_mean': 'ESM-1V_mean',
                   'mif': 'MIF', 'mifst': 'MIF-ST', 'monomer_ddg': 'Ros_ddG_monomer', 'cartesian_ddg': 'Ros_Cart_ddG', 
                   'delta_kdh': 'Δ hydrophobicity', 'delta_vol': 'Δ volume', 'SOL_ACC': 'SASA', 'korpm': 'KORPM'}

In [None]:
import os
import pandas as pd
import numpy as np
import re
import networkx as nx
import community as community_louvain

In [None]:
ros_df = pd.read_csv('../data/ES_PSTAB_SV_D22_F1_kellogg_mutations.csv', sep='\t')
# Splitting the 'Mutations' column
mutations_split = ros_df['Mutations'].str.split(' ', expand=True).iloc[:, :4]
print(mutations_split)

# Renaming the columns
mutations_split.columns = ['chain', 'wild_type', 'position', 'mutation']

# Concatenating with the original DataFrame
ros_df = pd.concat([ros_df.drop('Mutations', axis=1), mutations_split], axis=1)
ros_df = ros_df.rename({'PDBFileID': 'code'}, axis=1)
ros_df['mutation'] = ros_df['mutation'].str[0]
ros_df.to_csv('../data/rosetta.csv')
ros_df

      0  1    2  3
0     A  G   44  S
1     A  A  120  M
2     A  A  116  N
3     A  A  122  Q
4     A  A  123  Q
...  .. ..  ... ..
1205  A  I   81  T
1206  A  I   81  V
1207  A  A   82  G
1208  A  V   95  A
1209  A  F   97  A

[1210 rows x 4 columns]


Unnamed: 0,code,DDG,chain,wild_type,position,mutation
0,107L,-0.530,A,G,44,S
1,160L,-0.200,A,A,120,M
2,161L,0.170,A,A,116,N
3,162L,-0.240,A,A,122,Q
4,163L,-0.220,A,A,123,Q
...,...,...,...,...,...,...
1205,5AZU,2.510,A,I,81,T
1206,5AZU,0.155,A,I,81,V
1207,5AZU,1.673,A,A,82,G
1208,5AZU,0.956,A,V,95,A


In [None]:
korpm_df = pd.read_csv('../data/Id25c03_1merNCLB.txt', sep=' ', header=None)
korpm_df = korpm_df.rename({0: 'code', 1:'mutant', 2:'ddG', 3:'pos2'}, axis=1)
korpm_df['wild_type'] = korpm_df['mutant'].str[0]
korpm_df['chain'] = korpm_df['mutant'].str[1]
korpm_df['position'] = korpm_df['mutant'].str[2:-1]
korpm_df['mutation'] = korpm_df['mutant'].str[-1]
korpm_df.to_csv('../data/korpm.csv')

In [None]:
all_seqs = set()
for file in ['korpm_mapped.csv', 'rosetta_mapped.csv', 'tsuboyama_mapped.csv', 'fireprot_mapped.csv', 'Q3421_mapped.csv', 's461_mapped_preds.csv', 'Ssym_mapped.csv']:
    print(file)
    df = pd.read_csv(f'../data/{file}', index_col=0)
    code_ = 'code' if not file == 'Ssym_mapped.csv' else 'wt_code'
    for code, group in df.groupby(code_):
        row = group.head(1).iloc[0]
        wt = row['wild_type']
        try:
            pos = int(row['position'])
        except:
            continue
        mut = row['mutation']
        assert row['pdb_ungapped'][pos-1] == mut, uid
        wt_seq = list(row['pdb_ungapped'])
        wt_seq[pos-1] = wt
        wt_seq = ''.join(wt_seq)
        all_seqs.add((code, wt_seq))

print(len(all_seqs))

korpm_mapped.csv
rosetta_mapped.csv
tsuboyama_mapped.csv
fireprot_mapped.csv
Q3421_mapped.csv
s461_mapped_preds.csv
Ssym_mapped.csv
608


In [None]:
with open('../data/all_seqs.fasta', 'w') as f:
    for seq in list(all_seqs):
        code = seq[0]
        seq = seq[1]
        f.write(f'>{code}\n')
        f.write(seq + '\n')

In [None]:
ids = pd.read_csv('../data/all_seq_ids.tsv', sep='\t', header=None)
ids.sort_values(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
946,2O9P,4ZLU,0.009,13,12,0,133,145,294,306,3.095000e+02,17
5569,6BQG,1JU3,0.010,12,11,0,7,18,225,236,1.127000e+02,18
23,1H0C,1QGD,0.010,11,10,0,334,344,143,153,2.011000e+02,17
6133,1AMQ,6BQG,0.010,7,6,0,12,18,145,151,8.298000e+02,15
3802,1TDJ,1FEP,0.012,7,6,0,449,455,343,349,1.507000e+02,18
...,...,...,...,...,...,...,...,...,...,...,...,...
3668,2HBB,1DIV,1.000,51,0,0,1,51,1,51,8.372000e-30,103
3669,1ANK,1ANK,1.000,214,0,0,1,214,1,214,2.998000e-140,430
3678,1E6H,1E6H,1.000,61,0,0,1,61,1,61,1.245000e-39,132
1872,1BFM,1BFM,1.000,69,0,0,1,69,1,69,1.362000e-39,132


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import community as community_louvain

# Load MMSeqs2 output data
data = pd.read_csv('../data/all_seq_ids.tsv', sep='\t', header=None)
data = data.iloc[:, :3]
data.columns = ['Seq1', 'Seq2', 'Identity']

# Create a weighted graph
G = nx.Graph()

# Add all unique sequences as nodes
all_sequences = set(data['Seq1']).union(set(data['Seq2']))
G.add_nodes_from(all_sequences)

# Define your similarity threshold
similarity_threshold = 0.5  # Adjust this as needed

# Add edges for pairs above the similarity threshold
for _, row in data.iterrows():
    if row['Identity'] > similarity_threshold and row['Identity'] < 0.99:
        G.add_edge(row['Seq1'], row['Seq2'], weight=row['Identity'])

print(G.number_of_nodes())

# Use a spring layout to reflect closeness based on weights
pos = nx.spring_layout(G, weight='weight', k=0.001)

# Draw the graph
plt.figure(figsize=(50, 40))
nx.draw(G, pos, with_labels=False, node_size=2, font_size=8)
plt.title('Network Graph Reflecting Sequence Identity Closeness')
plt.show()

607


AttributeError: module 'scipy.sparse' has no attribute 'coo_array'

In [None]:
data.loc[data['identity']>0.75]

Unnamed: 0,source,target,identity
0,8TIM,8TIM,1.000
1,8TIM,1HTI,0.886
12,5T43,5T43,1.000
17,5OAO,5OAO,1.000
18,1H0C,1H0C,1.000
...,...,...,...
7791,2PR5,2PR5,1.000
7798,1QLP,1QLP,1.000
7808,2CPP,2CPP,1.000
7832,2JUC,2JUC,1.000


In [None]:
data = pd.read_csv('../data/all_seq_ids.tsv', sep='\t', header=None)
data.loc[data[0]==data[1]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,8TIM,8TIM,1.0,247,0,0,1,247,1,247,4.664000e-168,512
12,5T43,5T43,1.0,158,0,0,1,158,1,158,4.664000e-106,329
17,5OAO,5OAO,1.0,73,0,0,1,73,1,73,7.853000e-46,150
18,1H0C,1H0C,1.0,386,0,0,1,386,1,386,3.477000e-264,798
29,5AZU,5AZU,1.0,128,0,0,1,128,1,128,2.952000e-86,270
...,...,...,...,...,...,...,...,...,...,...,...,...
7791,2PR5,2PR5,1.0,127,0,0,1,127,1,127,1.245000e-84,265
7798,1QLP,1QLP,1.0,372,0,0,1,372,1,372,8.949000e-248,750
7808,2CPP,2CPP,1.0,405,0,0,1,405,1,405,4.401000e-280,845
7832,2JUC,2JUC,1.0,55,0,0,1,55,1,55,1.231000e-35,120


In [None]:
data = pd.read_csv('../data/all_seq_ids.tsv', sep='\t', header=None)
data = data.iloc[:, :3]
data.columns = ['source', 'target', 'identity']
data = data.loc[data['identity']>0.75]
data = data.loc[data['source']!=data['target']]
data.set_index('source').to_csv('../data/cytoscape_edges.csv')
nodes = set(data['source']).union(set(data['target']))
nodes = pd.DataFrame(data=list(nodes))
nodes.columns = ['code']
nodes[['fireprot', 'q3421', 'ssym', 's461', 'korpm', 'rosetta', 'tsuboyama']] = 0
nodes

Unnamed: 0,code,fireprot,q3421,ssym,s461,korpm,rosetta,tsuboyama
0,2WSY,0,0,0,0,0,0,0
1,2K28,0,0,0,0,0,0,0
2,5Z2S,0,0,0,0,0,0,0
3,4BLM,0,0,0,0,0,0,0
4,1EL1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
393,2LUM,0,0,0,0,0,0,0
394,2M0Y,0,0,0,0,0,0,0
395,2LJ3,0,0,0,0,0,0,0
396,1SCE,0,0,0,0,0,0,0


In [None]:
for file1 in ['korpm_mapped.csv', 'rosetta_mapped.csv', 'tsuboyama_mapped.csv', 'fireprot_mapped.csv', 'Q3421_mapped.csv', 's461_mapped_preds.csv', 'Ssym_mapped.csv']:
    col = file1.split('_')[0].lower()
    df = pd.read_csv(f'../data/{file1}', index_col=0)
    codes = set(df['code'])
    for i, row in nodes.iterrows():
        if row['code'] in codes:
            nodes.at[i, col] = 1
nodes.to_csv('../data/cytoscape_nodes.csv')

In [None]:
data = pd.read_csv('../data/all_seq_ids.tsv', sep='\t', header=None)
data = data.iloc[:, :3]
data.columns = ['source', 'target', 'identity']
data = data.loc[data['identity']>0.25]
data = data.loc[data['source']!=data['target']]
all_codes = set(data['source']).union(set(data['target']))

tmp = pd.read_csv(f'../data/Q3421_mapped.csv'.replace('.csv', '_preds_clusters.csv'), index_col=0)
tmp = tmp[[c for c in tmp.columns if not 'overlaps' in c]]
tmp.to_csv(f'../data/Q3421_mapped.csv'.replace('.csv', '_preds_clusters.csv'))

tmp = pd.read_csv(f'../data/fireprot_mapped.csv'.replace('.csv', '_preds_clusters.csv'), index_col=0)
tmp = tmp[[c for c in tmp.columns if not 'overlaps' in c]]
tmp.to_csv(f'../data/fireprot_mapped.csv'.replace('.csv', '_preds_clusters.csv'))

id_table = pd.DataFrame()
homo_table = pd.DataFrame()

for file1 in ['korpm_mapped.csv', 'rosetta_mapped.csv', 'tsuboyama_mapped.csv', 'fireprot_mapped.csv', 'Q3421_mapped.csv']: #'fireprot_mapped.csv', 
    df_train = pd.read_csv(f'../data/{file1}', index_col=0)
    if file1 == 'fireprot_mapped.csv':
        df_train['position'] = df_train['position'].fillna(-100000).astype(int)
        df_train['uid2'] = df_train['code'] + '_' + df_train['position'].astype(str) + df_train['mutation']
        df_train = df_train.reset_index()
        df_train = df_train.groupby('uid2').first()
    train_codes = set(df_train['code'])
    name1 = file1.split('_')[0]
    for file2 in ['fireprot_mapped.csv', 'Q3421_mapped.csv', 's461_mapped_preds.csv', 'Ssym_mapped.csv']:
        c = 'code' if not file2 == 'Ssym_mapped.csv' else 'wt_code'
        name2 = file2.split('_')[0]
        if file1 != file2:
            overlap = set()
            df_test = pd.read_csv(f'../data/{file2}', index_col=0)
            if file2 == 'fireprot_mapped.csv':
                df_test['position'] = df_test['position'].fillna(-100000).astype(int)
                df_test['uid2'] = df_test['code'] + '_' + df_test['position'].astype(str) + df_test['mutation']
                df_test = df_test.reset_index()
                df_test = df_test.groupby('uid2').first()
            #print(len(df_train), len(df_test))
            id_table.at[name1, name2] = len(df_train.join(df_test[[]], how='inner'))
            test_codes = set(df_test['code'])
            #cc_test = df_test.loc[df_test['code'].isin(overlap_codes)]
            for code in train_codes:
                if code in test_codes:
                    overlap.add(code)
                else:
                    odf = data.loc[(data['source']==code)|(data['target']==code)]
                    odf = odf.loc[data['source'].isin(test_codes)|data['target'].isin(test_codes)]
                    #print(odf)
                    if len(odf) > 0:
                        overlap.add(code)
            #print(name1, name2, overlap)
            homo_table.at[name1, name2] = len(df_test.loc[df_test[c].isin(overlap)])
            #print(len(df_test.loc[df_test['code'].isin(overlap)]))

            if name1 in ['fireprot', 'Q3421']:
            #if False:
                df_train[f'overlaps_{name2}'] = False
                df_train.loc[df_train['code'].isin(overlap), f'overlaps_{name2}'] = True
                tmp = pd.read_csv(f'../data/{file1}'.replace('.csv', '_preds_clusters.csv'), index_col=0)
                if name1 == 'fireprot':
                    df_train = df_train.reset_index().set_index('uid')
                tmp = tmp.join(df_train[[f'overlaps_{name2}']])

                tmp.to_csv(f'../data/{file1}'.replace('.csv', '_preds_clusters.csv'))

  tmp = pd.read_csv(f'../data/fireprot_mapped.csv'.replace('.csv', '_preds_clusters.csv'), index_col=0)
  tmp = pd.read_csv(f'../data/{file1}'.replace('.csv', '_preds_clusters.csv'), index_col=0)
  tmp = pd.read_csv(f'../data/{file1}'.replace('.csv', '_preds_clusters.csv'), index_col=0)


In [None]:
id_table

Unnamed: 0,fireprot,Q3421,s461,Ssym
korpm,1394.0,1050.0,141.0,0.0
rosetta,619.0,908.0,4.0,124.0
tsuboyama,1022.0,106.0,135.0,0.0
fireprot,,2244.0,30.0,112.0
Q3421,2244.0,,50.0,205.0


In [None]:
homo_table

Unnamed: 0,fireprot,Q3421,s461,Ssym
korpm,4245.0,2897.0,360.0,596.0
rosetta,3152.0,1928.0,11.0,554.0
tsuboyama,1031.0,130.0,163.0,0.0
fireprot,,3310.0,71.0,382.0
Q3421,4896.0,,71.0,618.0
