In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [18]:
# Load the Pfam -> TC mapping table.
# Mapping file URL: https://tcdb.org/cgi-bin/projectv/public/pfam.py
# Only the first two tab-separated columns are kept: the Pfam ID and the TC ID.
# NOTE(review): the displayed output ends with a row whose pfam_ID is 'TesT';
# confirm whether such non-Pfam rows should be filtered out before mapping.
tc2p_df = pd.read_csv(
    '../data/pfam.py',
    sep='\t',
    engine='python',
    names=['pfam_ID', 'TC'],
    usecols=[0, 1],
)
tc2p_df

Unnamed: 0,pfam_ID,TC
0,PF07885,1.A.1.1.1
1,PF07885,1.A.1.29.1
2,PF00520,1.A.1.2.2
3,PF02214,1.A.1.2.2
4,PF03493,1.A.1.3.1
...,...,...
8862,PF02462,1.B.6.2.7
8863,PF02530,1.B.70.1.6
8864,PF02537,1.A.43.1.2
8865,TesT,3.A.1.1.1


In [19]:
# Read the tcDoms HMM name file.
# Obtained by downloading the tcDoms HMM archive and going into the tcDomsGlobal folder.
# URL: https://tcdb.org/public/tcDoms.tar.gz
tchmms_df = pd.read_csv(
    '../data/tcDoms/tcDomsGlobal/hmmsName.txt',
    engine='python',
    # Raw string: '\s' in a plain literal is an invalid escape sequence
    # (DeprecationWarning, and SyntaxWarning from Python 3.12). The regex is unchanged.
    sep=r'\s+',
    skiprows=[0],      # skip the first line of the file
    usecols=[1],       # keep only the second whitespace-separated column
    names=['hmm_id'],
)
tchmms_df

Unnamed: 0,hmm_id
0,1.A.1.TD000002.0
1,1.A.2.TD000001.0
2,1.A.2.TD000002.0
3,1.A.3.TD000001.0
4,1.A.4.TD000001.0
...,...
356,9.B.330.TD000001.0
357,9.B.353.TD000001.0
358,9.B.358.TD000001.0
359,9.B.358.TD000002.0


In [20]:
# Format tchmms_df: split every tcDoms HMM id (e.g. '1.A.1.TD000002.0') into
# TC fields. Only the first three dot-separated components are used (class,
# subclass, family); subfamily and substrate are filled with the '-' placeholder.
#
# Rows are collected in a plain list and the DataFrame is built once, instead
# of growing it with .loc[len(...)] on every iteration (which is quadratic).
tc_mapping_rows = []
for hmm_id in tchmms_df.hmm_id:
    parts = hmm_id.split(".")
    # Explicit indexing keeps the IndexError on ids with fewer than 3 components.
    tc_mapping_rows.append([hmm_id, parts[0], parts[1], parts[2], '-', '-'])

tc_mappings = pd.DataFrame(
    tc_mapping_rows,
    columns=['hmm_id', 'class_', 'subclass', 'family', 'subfamily', 'substrate'],
)
# show mappings
tc_mappings

Unnamed: 0,hmm_id,class_,subclass,family,subfamily,substrate
0,1.A.1.TD000002.0,1,A,1,-,-
1,1.A.2.TD000001.0,1,A,2,-,-
2,1.A.2.TD000002.0,1,A,2,-,-
3,1.A.3.TD000001.0,1,A,3,-,-
4,1.A.4.TD000001.0,1,A,4,-,-
...,...,...,...,...,...,...
356,9.B.330.TD000001.0,9,B,330,-,-
357,9.B.353.TD000001.0,9,B,353,-,-
358,9.B.358.TD000001.0,9,B,358,-,-
359,9.B.358.TD000002.0,9,B,358,-,-


In [21]:
# Create a dataframe of 0/1 flags denoting, for each Pfam ID, whether each TC
# field (class, subclass, family, subfamily, substrate) takes exactly one
# value across all TC IDs mapped to that Pfam ID.
#   1 = only one distinct field value, 0 = multiple field values
# Only Pfam IDs whose class is unique are kept.
# Note: "family" can actually be a superfamily at times.

# Group data by Pfam HMM id (this groupby object is reused by the next cell).
groupby_tc2pf = tc2p_df.groupby('pfam_ID')

uniqueness_rows = []
# Iterate in order of first appearance in the mapping file.
for pfam_id in tc2p_df['pfam_ID'].unique():
    # One set of observed values per TC field position.
    # A TC id splits as e.g. '1.A.1.5.11' -> ['1', 'A', '1', '5', '11'].
    seen_values = [set() for _ in range(5)]
    for tc_id in groupby_tc2pf.get_group(pfam_id).TC:
        parts = tc_id.split('.')
        for pos in range(5):
            # Raises IndexError if a TC id has fewer than 5 components.
            seen_values[pos].add(parts[pos])
    flags = [int(len(values) == 1) for values in seen_values]
    # Keep the Pfam ID only when its class (first field) is unique.
    if flags[0] == 1:
        uniqueness_rows.append([pfam_id, *flags])

# Build the frame once rather than appending row by row (and np.append-ing
# arrays inside the loop), both of which are quadratic.
pfams_to_map_df = pd.DataFrame(
    uniqueness_rows,
    columns=['hmm_id', 'class_', 'subclass', 'family', 'subfamily', 'substrate'],
)
pfams_to_map_df

Unnamed: 0,hmm_id,class_,subclass,family,subfamily,substrate
0,PF03493,1,1,1,1,0
1,PF11834,1,1,1,1,0
2,PF08412,1,1,1,1,0
3,PF08763,1,1,1,1,0
4,PF00060,1,1,1,0,0
...,...,...,...,...,...,...
1227,PF08627,1,1,1,1,1
1228,PF13953,1,1,1,0,0
1229,PF13954,1,1,1,0,0
1230,PF02462,1,1,1,1,0


In [22]:
# Map the uniqueness flags back to concrete TC field values.
#
# For each Pfam ID, take the first TC ID mapped to it and copy its field values
# left to right (class, subclass, family, subfamily, substrate) for as long as
# the fields are flagged unique (1). From the first non-unique field onward the
# remaining fields keep the '-' placeholder.

tc_field_names = ['class_', 'subclass', 'family', 'subfamily', 'substrate']

final_mapping_rows = []
# One pass over the flag rows; avoids re-scanning pfams_to_map_df with a boolean
# mask for every field, and avoids shadowing the builtin `id`.
for pfam_id, *unique_flags in pfams_to_map_df[['hmm_id', *tc_field_names]].itertuples(index=False, name=None):
    # First TC ID corresponding to this Pfam ID, split into its fields.
    first_tc_parts = groupby_tc2pf.get_group(pfam_id).iloc[0]['TC'].split(".")
    field_values = ['-'] * len(tc_field_names)
    for pos, is_unique in enumerate(unique_flags):
        if is_unique != 1:
            # Stop at the first non-unique field: deeper levels are not well defined.
            break
        field_values[pos] = first_tc_parts[pos]
    final_mapping_rows.append([pfam_id, *field_values])

# Build the frame once instead of appending with .loc[len(...)] per row.
final_mapping_df = pd.DataFrame(final_mapping_rows, columns=['hmm_id', *tc_field_names])

final_mapping_df


Unnamed: 0,hmm_id,class_,subclass,family,subfamily,substrate
0,PF03493,1,A,1,3,-
1,PF11834,1,A,1,4,-
2,PF08412,1,A,1,5,-
3,PF08763,1,A,1,11,-
4,PF00060,1,A,10,-,-
...,...,...,...,...,...,...
1227,PF08627,2,A,7,20,2
1228,PF13953,1,B,11,-,-
1229,PF13954,1,B,11,-,-
1230,PF02462,1,B,6,2,-


In [23]:
# Add the tcDoms HMM-id -> TC mappings to the Pfam-derived mappings and save.
#
# This cell reassigns final_mapping_df from itself, so a plain concat would
# append tc_mappings again every time the cell is re-run. Rows whose hmm_id
# already comes from tc_mappings are therefore dropped first, which makes the
# cell idempotent; on a first run nothing is dropped as long as the Pfam ids
# and tcDoms ids do not overlap.
pfam_only_rows = final_mapping_df[~final_mapping_df['hmm_id'].isin(tc_mappings['hmm_id'])]
# ignore_index=True renumbers the rows 0..n-1 (same as reset_index(drop=True)).
final_mapping_df = pd.concat([pfam_only_rows, tc_mappings], ignore_index=True)
final_mapping_df.to_csv('../data/custom_mapping.csv', index=False)
final_mapping_df


Unnamed: 0,hmm_id,class_,subclass,family,subfamily,substrate
0,PF03493,1,A,1,3,-
1,PF11834,1,A,1,4,-
2,PF08412,1,A,1,5,-
3,PF08763,1,A,1,11,-
4,PF00060,1,A,10,-,-
...,...,...,...,...,...,...
1588,9.B.330.TD000001.0,9,B,330,-,-
1589,9.B.353.TD000001.0,9,B,353,-,-
1590,9.B.358.TD000001.0,9,B,358,-,-
1591,9.B.358.TD000002.0,9,B,358,-,-
