### Deduplicate the CIRPIN/TM-align pipeline outputs

In [78]:
def count_unique_pairs(filepath):
    """Count the distinct unordered pairs listed in a tab-separated file.

    Only lines beginning with 'd' are treated as data rows; every other line
    (for example a header) is skipped.  The first two tab-separated fields of
    a data row form the pair, and (A, B) and (B, A) count as the same pair.

    Args:
        filepath: Path of the tab-separated file to read.

    Returns:
        The number of distinct unordered pairs found in the file.

    Raises:
        IndexError: If a data row has fewer than two tab-separated fields.
    """
    unique_pairs = set()
    with open(filepath, 'r') as f:
        for line in f:
            if not line.startswith('d'):
                continue
            # Drop the line terminator before splitting.  Otherwise, in a
            # two-column row, the second ID keeps its trailing newline and
            # (A, B) / (B, A) no longer compare equal.
            parts = line.rstrip('\r\n').split('\t')
            # Use frozenset so (A,B) and (B,A) are treated as same
            unique_pairs.add(frozenset((parts[0], parts[1])))
    return len(unique_pairs)

# Number of putative pairs the three bins are expected to add up to
# (value taken from the original "Should equal putative pairs" printout).
EXPECTED_PUTATIVE_PAIRS = 18326

# NOTE(review): the output recorded for this cell (16249 false positives,
# 109 other homologs) equals the post-deduplication counts reported later in
# the notebook, not the counts recorded for these same raw files (16583 / 150).
# Confirm whether this cell was meant to read the *_dedup_asym.tsv files.
num_cp = count_unique_pairs('cp_pairs_scope40.tsv')
num_false = count_unique_pairs('false_pos_pairs_scope40.tsv')
num_homolog = count_unique_pairs('other_homologous_pairs_scope40.tsv')

total = num_cp + num_false + num_homolog
print(f"CP pairs: {num_cp}")
print(f"False positives: {num_false}")
print(f"Other homologs: {num_homolog}")
print(f"Total: {total}")
print(f"Should equal putative pairs: {EXPECTED_PUTATIVE_PAIRS}")
# Make the comparison explicit instead of leaving it to the reader's eye.
print(f"Matches expected total: {total == EXPECTED_PUTATIVE_PAIRS}")

CP pairs: 1968
False positives: 16249
Other homologs: 109
Total: 18326
Should equal putative pairs: 18326


In [76]:
def get_pairs_from_file(filepath):
    """Map every unordered pair in a tab-separated file to its row text.

    Only lines beginning with 'd' are treated as data rows; every other line
    (for example a header) is skipped.  The first two tab-separated fields of
    a data row form the pair, and (A, B) and (B, A) share one key.

    Args:
        filepath: Path of the tab-separated file to read.

    Returns:
        A dict mapping frozenset({first_id, second_id}) to the row text with
        surrounding whitespace stripped.  When an unordered pair occurs more
        than once, the text of its last occurrence is kept.

    Raises:
        IndexError: If a data row has fewer than two tab-separated fields.
    """
    pairs = {}
    with open(filepath, 'r') as f:
        for line in f:
            if not line.startswith('d'):
                continue
            # Split on the line without its terminator.  Otherwise, in a
            # two-column row, the second ID keeps its trailing newline and
            # ends up in a different key than the same pair written reversed.
            parts = line.rstrip('\r\n').split('\t')
            pairs[frozenset((parts[0], parts[1]))] = line.strip()
    return pairs


In [80]:
# Load the pair -> row table for each of the three pipeline output files
cp_pairs = get_pairs_from_file('cp_pairs_scope40.tsv')
false_pairs = get_pairs_from_file('false_pos_pairs_scope40.tsv')
homolog_pairs = get_pairs_from_file('other_homologous_pairs_scope40.tsv')

# Report how many distinct pairs each file contributed
print(f"CP pairs: {len(cp_pairs)}")
print(f"False positives: {len(false_pairs)}")
print(f"Other homologs: {len(homolog_pairs)}")

# Pairs listed both as false positives and as other homologs;
# left as the cell's last expression so the notebook displays the set.
false_and_homolog = set(false_pairs).intersection(homolog_pairs)
false_and_homolog

CP pairs: 1968
False positives: 16583
Other homologs: 150


{frozenset({'d1uc6a1', 'd5m0yb1'}),
 frozenset({'d2illa2', 'd6udjb1'}),
 frozenset({'d1zela1', 'd2obpa1'})}

In [82]:
# Get pairs from each file
cp_pairs = get_pairs_from_file('cp_pairs_scope40.tsv')
false_pairs = get_pairs_from_file('false_pos_pairs_scope40.tsv')
homolog_pairs = get_pairs_from_file('other_homologous_pairs_scope40.tsv')
print(f"CP pairs: {len(cp_pairs)}")
print(f"False positives: {len(false_pairs)}")
print(f"Other homologs: {len(homolog_pairs)}")

# Pairs present in all three files.  This must be measured on the freshly
# loaded tables: once the CP overlaps are removed below, no pair can be in
# the CP table and another table at the same time, so computing it afterwards
# would always report 0.
all_three = set(cp_pairs) & set(false_pairs) & set(homolog_pairs)

# A pair listed as CP stays CP: remove it from the false-positive table ...
cp_and_false = set(cp_pairs) & set(false_pairs)
for pair in cp_and_false:
    del false_pairs[pair]

# ... and from the other-homolog table.
cp_and_homolog = set(cp_pairs) & set(homolog_pairs)
for pair in cp_and_homolog:
    del homolog_pairs[pair]

# Of what is left, a pair in both the false-positive and the other-homolog
# tables is kept only as an other homolog.
false_and_homolog = set(false_pairs) & set(homolog_pairs)
for pair in false_and_homolog:
    del false_pairs[pair]

# Header lines written at the top of the deduplicated files.
# NOTE(review): the false-positive header names 5 columns while the other two
# name 7 -- confirm this matches the rows of false_pos_pairs_scope40.tsv.
scored_header = "query\ttarget\tprog_score\tcirpin_score\ttm_score\ttm_score_cp\ttm_diff\n"
false_pos_header = "query\ttarget\tprog_score\tcirpin_score\ttm_score_cp\n"

# Save each deduplicated table: header first, then one stored row per line.
for out_path, header, table in (
    ('cp_pairs_scope40_dedup_asym.tsv', scored_header, cp_pairs),
    ('false_pos_pairs_scope40_dedup_asym.tsv', false_pos_header, false_pairs),
    ('other_homologous_pairs_scope40_dedup_asym.tsv', scored_header, homolog_pairs),
):
    with open(out_path, 'w') as f:
        f.write(header)
        f.writelines(row + "\n" for row in table.values())

print("\n=== OVERLAPS ===")
print(f"In both CP and False Pos: {len(cp_and_false)}")
print(f"In both CP and Other Homolog: {len(cp_and_homolog)}")
print(f"In both False Pos and Other Homolog: {len(false_and_homolog)}")
print(f"In all three files: {len(all_three)}")

print("\n=== AFTER DEDUPLICATION ===")
print(f"CP pairs (kept): {len(cp_pairs)}")
print(f"False positives (after removal): {len(false_pairs)}")
print(f"Other homologs (after removal): {len(homolog_pairs)}")
print(f"Total unique: {len(cp_pairs) + len(false_pairs) + len(homolog_pairs)}")



CP pairs: 1968
False positives: 16583
Other homologs: 150

=== OVERLAPS ===
In both CP and False Pos: 331
In both CP and Other Homolog: 41
In both False Pos and Other Homolog: 3
In all three files: 0

=== AFTER DEDUPLICATION ===
CP pairs (kept): 1968
False positives (after removal): 16249
Other homologs (after removal): 109
Total unique: 18326


In [None]:
## Some pairs binned as false positives are really true CPs: the TM-align score is asymmetric (it depends on which structure is given first), which put them in the wrong bin.
## Some pairs binned as "Other homologs" are really true CPs.
## Pairs that appear in both the false-positive and the other-homolog files should be kept only in other homologs.

In [72]:
import os

q = '/home/ubuntu/scope40/pdbstyle-2.08/d2illa2.pdb'
t = '/home/ubuntu/scope40/pdbstyle-2.08/d6udjb1.pdb'

# Run TMalign in both argument orders and print only the TM-score lines.
# Each os.popen pipe is opened in a `with` block so it is closed as soon as
# its output has been read, instead of being left open until it is rebound.
print("=== d2illa2 vs d6udjb1 ===")
with os.popen(f'/home/ubuntu/TM_tools/TMalign {q} {t}') as output:
    for line in output:
        if 'TM-score' in line:
            print(line.strip())

print("\n=== d6udjb1 vs d2illa2 (reversed) ===")
with os.popen(f'/home/ubuntu/TM_tools/TMalign {t} {q}') as output:
    for line in output:
        if 'TM-score' in line:
            print(line.strip())

=== d2illa2 vs d6udjb1 ===
TM-score= 0.62104 (if normalized by length of Chain_1, i.e., LN=95, d0=3.54)
TM-score= 0.65586 (if normalized by length of Chain_2, i.e., LN=89, d0=3.41)
(You should use TM-score normalized by length of the reference structure)

=== d6udjb1 vs d2illa2 (reversed) ===
TM-score= 0.65586 (if normalized by length of Chain_1, i.e., LN=89, d0=3.41)
TM-score= 0.62104 (if normalized by length of Chain_2, i.e., LN=95, d0=3.54)
(You should use TM-score normalized by length of the reference structure)


In [68]:
import os

q = '/home/ubuntu/scope40/pdbstyle-2.08/d2illa2.pdb'
t = '/home/ubuntu/scope40/pdbstyle-2.08/d6udjb1.pdb'

# Same comparison as without the flag, but with `-cp` passed to TMalign
# (presumably its circular-permutation alignment mode -- confirm against the
# TMalign usage text).  Both argument orders are run and only the TM-score
# lines are printed.
# Each os.popen pipe is opened in a `with` block so it is closed as soon as
# its output has been read, instead of being left open until it is rebound.
print("=== d2illa2 vs d6udjb1 ===")
with os.popen(f'/home/ubuntu/TM_tools/TMalign {q} {t} -cp') as output:
    for line in output:
        if 'TM-score' in line:
            print(line.strip())

print("\n=== d6udjb1 vs d2illa2 (reversed) ===")
with os.popen(f'/home/ubuntu/TM_tools/TMalign {t} {q} -cp') as output:
    for line in output:
        if 'TM-score' in line:
            print(line.strip())

=== d2illa2 vs d6udjb1 ===
TM-score= 0.42754 (if normalized by length of Chain_1, i.e., LN=95, d0=3.54)
TM-score= 0.44359 (if normalized by length of Chain_2, i.e., LN=89, d0=3.41)
(You should use TM-score normalized by length of the reference structure)

=== d6udjb1 vs d2illa2 (reversed) ===
TM-score= 0.65586 (if normalized by length of Chain_1, i.e., LN=89, d0=3.41)
TM-score= 0.62104 (if normalized by length of Chain_2, i.e., LN=95, d0=3.54)
(You should use TM-score normalized by length of the reference structure)


In [70]:
import os

# Compare d5lxea_ and d4cnna_ with `TMalign -cp`, in both argument orders.
# NOTE(review): the comment previously here read "d2ccva1 d2illa2", which does
# not match the structures loaded below -- presumably left over from another
# comparison.

q = '/home/ubuntu/scope40/pdbstyle-2.08/d5lxea_.pdb'
t = '/home/ubuntu/scope40/pdbstyle-2.08/d4cnna_.pdb'

# Each os.popen pipe is opened in a `with` block so it is closed as soon as
# its output has been read, instead of being left open until it is rebound.
with os.popen(f'/home/ubuntu/TM_tools/TMalign {q} {t} -cp') as output:
    for line in output:
        if 'TM-score' in line:
            print(line.strip())

with os.popen(f'/home/ubuntu/TM_tools/TMalign {t} {q} -cp') as output:
    for line in output:
        if 'TM-score' in line:
            print(line.strip())

TM-score= 0.56233 (if normalized by length of Chain_1, i.e., LN=324, d0=6.58)
TM-score= 0.71730 (if normalized by length of Chain_2, i.e., LN=245, d0=5.80)
(You should use TM-score normalized by length of the reference structure)
TM-score= 0.62526 (if normalized by length of Chain_1, i.e., LN=245, d0=5.80)
TM-score= 0.49541 (if normalized by length of Chain_2, i.e., LN=324, d0=6.58)
(You should use TM-score normalized by length of the reference structure)


In [71]:
import os

# Compare d1qzya1 and d2dk8a1 with `TMalign -cp`, in both argument orders.
# NOTE(review): the comment previously here read "d2ccva1 d2illa2", which does
# not match the structures loaded below -- presumably left over from another
# comparison.

q = '/home/ubuntu/scope40/pdbstyle-2.08/d1qzya1.pdb'
t = '/home/ubuntu/scope40/pdbstyle-2.08/d2dk8a1.pdb'

# Each os.popen pipe is opened in a `with` block so it is closed as soon as
# its output has been read, instead of being left open until it is rebound.
with os.popen(f'/home/ubuntu/TM_tools/TMalign {q} {t} -cp') as output:
    for line in output:
        if 'TM-score' in line:
            print(line.strip())

with os.popen(f'/home/ubuntu/TM_tools/TMalign {t} {q} -cp') as output:
    for line in output:
        if 'TM-score' in line:
            print(line.strip())

TM-score= 0.61022 (if normalized by length of Chain_1, i.e., LN=74, d0=3.03)
TM-score= 0.65178 (if normalized by length of Chain_2, i.e., LN=68, d0=2.86)
(You should use TM-score normalized by length of the reference structure)
TM-score= 0.63612 (if normalized by length of Chain_1, i.e., LN=68, d0=2.86)
TM-score= 0.59555 (if normalized by length of Chain_2, i.e., LN=74, d0=3.03)
(You should use TM-score normalized by length of the reference structure)
