In [1]:
# load all the packages and functions 
import pandas as pd
import os
from get_integrated_homologs import get_required_df
from process_tmalign_data import get_tm_aln_df
from process_data_tmaln_duplicates import ProcessData

'''
1. get_required_df:
takes two arguments:  
    1. reference list 
    2. all paralogs from integrated homology search found in data
Outputs: dataframe of homolog pairs for the reference list (columns: reference, target, methods)

2. get_tm_aln_df 
    Input: tm_align file path 

    Output:TM align information as dataframe 

3. ProcessData
    Input: integrated homolog dataframe from 1, tm-aligned dataframe from 2, and reference list from 1
    Output: merge_data --> merges tm-align information to the original data,
        filter_duplicated_pairs --> removes duplicates, treating (A, B) and (B, A) pairs as identical 
        filter_data --> filters pairs with tm-scores < 0.5 bidirectionally 
        get_novel_list --> gets novel paralogs list

'''

In [2]:
# Load the integrated paralog table (tab-separated) into a dataframe.
all_paralogs_data = pd.read_csv("../data/integrated_paralogs_methods.tsv", sep="\t")

# Reference set: the four Tissue Inhibitors of Metalloproteinases (TIMP1-4),
# given as protein accession IDs (presumably UniProt — TODO confirm).
ref_list = ['P01033', 'P16035', 'P35625',  'Q99727']
# Subset of the integrated table for the reference list; the preview below shows
# the columns `reference`, `target` and `methods`.
TIMPs_homologs = get_required_df(ref_list, all_paralogs_data)
# Row count followed by the first rows, as a quick sanity check of the selection.
print(len(TIMPs_homologs))
print(TIMPs_homologs.head())


111
  reference      target   methods
0    P01033  A0A1W2PP97  foldseek
1    P01033      O14603  foldseek
2    P01033      O95972  foldseek
3    P01033      P01031  foldseek
4    P01033      P02753  foldseek


In [3]:
def get_qt_pairs(paralogs_df, save_dir, file_name):
    """Write the reference/target pairs of a paralog table to disk.

    Parameters
    ----------
    paralogs_df : pandas.DataFrame
        Table containing at least the ``reference`` and ``target`` columns.
    save_dir : str
        Destination directory; created (with parents) when it is missing.
    file_name : str
        Base name of the output file, without extension.

    Returns
    -------
    str
        The output file name (``<file_name>.csv``) — the bare name, not the
        full path inside ``save_dir``.

    Notes
    -----
    Despite the ``.csv`` extension the file is tab-delimited, and it starts
    with a ``reference<TAB>target`` header row.
    """
    # Ensure the destination exists before pandas opens the output file.
    os.makedirs(save_dir, exist_ok=True)

    pair_columns = ['reference', 'target']
    pairs_only = paralogs_df[pair_columns]

    out_name = f'{file_name}.csv'
    out_path = os.path.join(save_dir, out_name)
    pairs_only.to_csv(out_path, sep="\t", index=False)

    print(f'query-target pairs saved as {out_name}')
    return out_name
    

In [13]:
# Export the TIMP reference/target pairs to ../data/TIMPs.csv. The value returned
# (and printed) is the bare file name, not the full path.
TIMPS_qt_pairs = get_qt_pairs(TIMPs_homologs, "../data/", "TIMPs")
print(TIMPS_qt_pairs)

query-target pairs saved as TIMPs.csv
TIMPs.csv


# Load the TM-align build used for the structural alignments.
module load tmalign/20220412-xq6w5va

# Run TM-align on every query/target pair listed in ../data/TIMPs.csv and
# append each report to ../data/TIMPs_tmaln.txt.
#
# The pairs file is tab-delimited and begins with a "reference<TAB>target"
# header row (pandas to_csv writes the header by default). That first line is
# read and discarded before the loop; otherwise TM-align would be invoked on
# files named reference.pdb and target.pdb and its failure output would end up
# in the results file.
#
# NOTE: results are appended (>>), so delete ../data/TIMPs_tmaln.txt before a
# re-run to avoid duplicated alignment records.
{
    read -r _header
    while IFS=$'\t' read -r query target; do
        echo "Running TM-align for: $query vs $target"
        TMalign "../../../data/HS_AF_PDB/${query}.pdb" "../../../data/HS_AF_PDB/${target}.pdb" >> ../data/TIMPs_tmaln.txt
        # Blank separator lines between consecutive TM-align reports.
        echo -e "\\n" >> ../data/TIMPs_tmaln.txt
    done
} < "../data/TIMPs.csv"

In [3]:
# Parse the TM-align output file into a dataframe.
# NOTE(review): ../data/TIMPs_tmaln.txt is produced by the shell loop above, outside
# the notebook kernel, so this cell fails on a fresh checkout until that loop has run.
tm_data = get_tm_aln_df("../data/TIMPs_tmaln.txt")

In [None]:
# Combine the homolog table with the TM-align results and derive the novel paralogs.
# Whether filter_data / get_novel_list rely on state set by the preceding calls is not
# visible from this notebook — TODO confirm before reordering or dropping a step.
# NOTE(review): the notes in the import cell also list filter_duplicated_pairs, which
# is never called here — confirm whether that is intentional.
proc = ProcessData(TIMPs_homologs, tm_data, ref_list)
tm_scores_merged = proc.merge_data()
filtered_data = proc.filter_data()
novel_paralogs = proc.get_novel_list()
# Number of novel paralogs found for the TIMP reference set.
print(len(novel_paralogs))

# print(filtered_data)

In [6]:
# Show the novel paralog identifiers returned by get_novel_list in the previous cell.
print(novel_paralogs)

['Q9BQI4', 'P62857', 'Q86WV5']


Get the novel discoveries for each protein family and examine how they are 
distributed across the different methods.
--> reference data 
--> integrated homologs + tm_align data 
--> filter by reference data 
--> filter the data by bidirectional TM-scores 
--> get the novel paralogs set 


In [3]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell
# at the top so all dependencies are visible in one place.
from get_novel_distribution_methods import get_novel_distribution
# returns the distribution of novel paralogs by methods 

# NOTE(review): `ref_list` held the list of TIMP accessions in the earlier cells and is
# rebound here to a file path. Re-running the ProcessData cell after this one would pass
# a path string instead of that list — consider a distinct name for the path.
ref_list = "../data/TF_1373_genes.txt"
# Paralog table for the transcription-factor set; judging by the file name it already
# includes the TM-align results — TODO confirm.
data = "../data/TFs_paralogs_tmaln_final.tsv"

# Both arguments are file paths here. Per the captured output the call prints the
# novel pair / paralog totals, and the returned value prints as a dict of counts per method.
methods_dist = get_novel_distribution(ref_list, data)
print(methods_dist)

Total novel paralogs pair is: 162
Total novel paralogs is 38
{'prost': 133, 'mmseqs2': 85, 'foldseek': 132, 'blast': 132}
