In [5]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from goatools.obo_parser import GODag
from goatools.rpt.rpt_lev_depth import RptLevDepth
from goatools.associations import dnld_assc
from goatools.semantic import TermCounts, get_info_content
from goatools.semantic import lin_sim
from goatools.semantic import semantic_distance
import pixiedust

In [6]:
save_path = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/preprocess/go_related_data/"

In [7]:
path_of_tsv = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/preprocess/go_related_data/manual_GOA_20191024_propagated.tsv"
propagated_go_terms = pd.read_csv(path_of_tsv, sep='\t')

In [8]:
len(propagated_go_terms)

12539927

Look for duplicated rows

In [9]:
duplicateRowsDF = propagated_go_terms[propagated_go_terms.duplicated()]
 
print("Duplicate Rows except first occurrence based on all columns are :")
print(duplicateRowsDF)

Duplicate Rows except first occurrence based on all columns are :
Empty DataFrame
Columns: [GO_ID, DB_OBJECT_ID, GPAD_QUALIFIER, ECO_ID, GO_EVIDENCE, TAX_ID, ASSIGNED_BY]
Index: []


Propagation control

In [10]:
propagated_go_terms.loc[(propagated_go_terms['GO_ID'] == 'GO:0042802') & (propagated_go_terms['DB_OBJECT_ID'] == 'P02767')]

Unnamed: 0,GO_ID,DB_OBJECT_ID,GPAD_QUALIFIER,ECO_ID,GO_EVIDENCE,TAX_ID,ASSIGNED_BY
1755773,GO:0042802,P02767,enables,ECO:0000266,ISO,10116,RGD


In [11]:
propagated_go_terms.loc[(propagated_go_terms['DB_OBJECT_ID'] == 'P02767')\
                       & (propagated_go_terms['GO_ID'] == 'GO:0003674')]

Unnamed: 0,GO_ID,DB_OBJECT_ID,GPAD_QUALIFIER,ECO_ID,GO_EVIDENCE,TAX_ID,ASSIGNED_BY
195958,GO:0003674,P02767,enables,ECO:0000266,ISO,10116,RGD
316688,GO:0003674,P02767,enables,ECO:0000318,IBA,10116,REFG
373544,GO:0003674,P02767,enables,ECO:0000314,IDA,10116,RGD


In [12]:
propagated_go_terms[0:9]

Unnamed: 0,GO_ID,DB_OBJECT_ID,GPAD_QUALIFIER,ECO_ID,GO_EVIDENCE,TAX_ID,ASSIGNED_BY
0,GO:0071617,P0A257,enables,ECO:0000318,IBA,99287,REFG
1,GO:0071617,Q3UN02,enables,ECO:0000250,ISS,10090,SWIS
2,GO:0071617,Q8K3K7,enables,ECO:0000266,ISO,10090,MGI
3,GO:0071617,O35083,enables,ECO:0000266,ISO,10090,MGI
4,GO:0071617,Q924S1,enables,ECO:0000266,ISO,10116,RGD
5,GO:0071617,Q16635,enables,ECO:0000304,TAS,9606,REAC
6,GO:0071617,Q5E9R2,enables,ECO:0000250,ISS,9913,SWIS
7,GO:0071617,Q6QA69,enables,ECO:0000266,ISO,10116,RGD
8,GO:0071617,Q9D517,enables,ECO:0000250,ISS,10090,SWIS


Get list of human proteins

In [13]:
uniprot_metadata_directory = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/Uniprot/"
uniprot_metadata_file_path = uniprot_metadata_directory + "uniprot_human_all.tab"
uniprot_vars = ['Entry','Entry name','Status','Protein names','Gene names','Organism','Length','Annotation' ]
uniprot_df = pd.read_csv(uniprot_metadata_file_path, sep='\t', header=None, names=uniprot_vars)
human_proteins = set(uniprot_df['Entry'])

## 1- Exclude all IEA annotations and select human proteins.

In [14]:
propagated_go_terms = propagated_go_terms.query('GO_EVIDENCE != "IEA"')
propagated_go_terms = propagated_go_terms[propagated_go_terms['DB_OBJECT_ID'].isin(human_proteins)]
propagated_go_terms.reset_index(drop=True, inplace=True)

In [15]:
len(propagated_go_terms)

2450885

In [16]:
propagated_go_terms_human = propagated_go_terms[propagated_go_terms['TAX_ID'] == 9606]

In [17]:
len(propagated_go_terms_human)

2450885

In [20]:
len(set(propagated_go_terms_human['GO_ID']))

20364

In [18]:
len(set(propagated_go_terms_human['DB_OBJECT_ID']))

17928

## 2- For each term get annotated protein list. 

For each protein check if there are multiple proteins (in the annotated protein list for each GO ID), exists in same UNIREF cluster, if so remove the proteins from the annotated list, so that, only 1 protein from an individual UniRef cluster will exist in the annotated list.

In [None]:
path_of_uniref_tab = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/preprocess/go_related_data/uniref50-swissprot_human.tab"
uniref_human = pd.read_csv(path_of_uniref_tab, sep='\t')
uniref_human[1:5]

In [None]:
GOIDs = list(set(propagated_go_terms['GO_ID']))

In [None]:
def get_cluster_members(protein_id):
    cluster_members_list = []
    cluster_name = "UniRef50_" + protein_id
    cluster_members_list = uniref_human[uniref_human['Cluster ID'] == cluster_name]["Cluster members"].tolist()
    if len(cluster_members_list) > 0:
        cluster_members_list = cluster_members_list[0].split(";")
        cluster_members_list = list(map(str.strip, cluster_members_list))
    return cluster_members_list

In [None]:
def remove_common_proteins(common_proteins,protein_id,goid):
    for common_protein in common_proteins:
        if common_protein != protein_id:
            is_common_protein = propagated_go_terms["DB_OBJECT_ID"] == common_protein
            goid_control = propagated_go_terms['GO_ID']  == goid
            if len(propagated_go_terms[is_common_protein & goid_control]) > 0:
                drop_index = int(propagated_go_terms[is_common_protein & goid_control].index[0])
                propagated_go_terms.drop(propagated_go_terms.index[drop_index], inplace=True)
                propagated_go_terms.reset_index(drop=True, inplace=True)

In [None]:
len(propagated_go_terms)

In [None]:
for goid in tqdm_notebook(GOIDs):
    annotated_proteins = list(propagated_go_terms[propagated_go_terms['GO_ID']  == goid]['DB_OBJECT_ID'])
    # Do not remove same protein multiple times for same annotation
    removed_proteins = []
    for protein_id in annotated_proteins:
        removal_candidates = []
        cluster_members_list = get_cluster_members(protein_id)
        if len(cluster_members_list) > 0:
            common_proteins = []
            common_proteins = np.intersect1d(annotated_proteins, cluster_members_list)
            if len(common_proteins) > 0:
                removal_candidates = []
                removal_candidates = np.setdiff1d(common_proteins, removed_proteins)
                if len(removal_candidates) > 0:
                    remove_common_proteins(removal_candidates,protein_id,goid)

In [22]:
propagated_go_terms_csv_save_path = save_path + 'propagated_go_terms.csv'

In [None]:
#propagated_go_terms.to_csv(propagated_go_terms_csv_save_path)

In [23]:
propagated_go_terms = pd.read_csv(propagated_go_terms_csv_save_path)

In [24]:
len(propagated_go_terms)

2389966

## 3- Save terms with filtered list and save number of annotated proteins with term depth.

In [8]:
def calculate_gene_number_for_goterm(goid):
    annotated_proteins = list(propagated_go_terms[propagated_go_terms['GO_ID']  == goid]['DB_OBJECT_ID'])
    number_of_proteins = len(annotated_proteins)
    return number_of_proteins

## 4- Terms are categorized as low, middle, high in terms of the number of annotated proteins.

 If term annotates 5 - 30 proteins the term is considered in low category. If term annotates 30 - 100 proteins the term is considered in middle category. If term annotates more than 100 proteins the term is considered in high category.

In [9]:
def determine_number_category(term_number):
    if term_number < 5:
        return "insufficient"
    if term_number >= 5 and term_number <= 30:
        return "Low"
    if term_number > 100 and term_number <= 500:
        return "Middle"
    if term_number >= 1000:
        return "High"

## 5- Term’s specificity is determined and terms are categorized as shallow, normal, specific. 

First ⅓ of the max depth of the aspect (MF, BP or CC) is considered as shallow, second ⅓ of the aspect is considered as normal, deepest ⅓ is considered as specific. Be careful here about the fact that branch lengths are different from each other, as a result, while calculating a term’s specificity, you should check the lineage of that term, on many steps we can reach the root term from the corresponding term (in minimum number of steps, because there are usually multiple routes), and how many descendant terms (child-grandchild-grandgrandchild) terms it has...

In [12]:
obo_path = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/preprocess/go_related_data/go.obo"
obodag = GODag(obo_path)

#Reports the number of GO terms at each level and depth.
# Level refers to the length of the shortest path from the top.
# Depth refers to the length of the longest path from the top.
rptobj = RptLevDepth(obodag)
rptobj.write_summary_cnts_all()

/media/DATA/serbulent/DATA/Thesis/ReviewPaper/preprocess/go_related_data/go.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms
Dep <-Depth Counts->  <-Level Counts->
Lev   BP    MF    CC    BP    MF    CC
--- ----  ----  ----  ----  ----  ----
00     1     1     1     1     1     1
01    30    18     7    30    18     7
02   262   127   927   425   152   929
03  1271   569   686  2220   850   780
04  2381  1517   844  4812  2040  1019
05  3670  4789   707  7258  5023   705
06  4469  1856   474  7277  1958   404
07  4676  1107   303  4591   758   205
08  4230   530   156  1969   197   106
09  3510   320    59   599    67    25
10  2383   149    18   237    10     1
11  1440    98     1    38    19     1
12   789    12     0     0     0     0
13   271     0     0     0     0     0
14    59     0     0     0     0     0
15    11     0     0     0     0     0
16     4     0     0     0     0     0


In [11]:
def calculate_term_depth(goid):
    return obodag[goid].level

In [12]:
def calculate_term_specificty(go_id_level):
    if go_id_level >= 8:
        return "Specific"
    if go_id_level >= 5 and go_id_level < 8:
        return "Normal"
    if go_id_level < 5:
        return "Shallow"

In [73]:
#GOIDs_Human = GOIDs = list(set(propagated_go_terms_human['GO_ID']))
GOIDs_Human = list(set(propagated_go_terms_human['GO_ID']))
len(GOIDs_Human)

20364

In [None]:
go_dataframe = pd.DataFrame(columns = ['GO_ID', 'Term_Level',\
                                       'Protein_Number', 'Number_Category',\
                                       'Term_Specificity','Aspect'])

for go_id in tqdm_notebook(GOIDs_Human):
    go_id_level = obodag[go_id].level
    aspect = obodag[go_id].namespace
    number_of_proteins = calculate_gene_number_for_goterm(go_id)
    go_id_number_category = determine_number_category(number_of_proteins)
    term_specificity = calculate_term_specificty(go_id_level)
    go_dataframe = go_dataframe.append({'GO_ID': go_id, \
                                        'Term_Level': go_id_level, \
                                       'Protein_Number': number_of_proteins,\
                                       'Number_Category': go_id_number_category,\
                                       'Term_Specificity': term_specificity,\
                                       'Aspect': aspect}
                                       ,ignore_index=True)


len(go_dataframe)
    

In [4]:
csv_save_path = save_path + 'go_dataframe.csv'
#go_dataframe.to_csv(csv_save_path)
#go_dataframe.to_pickle(save_path + 'go_dataframe.pkl')

In [5]:
#go_dataframe = pd.read_csv(csv_save_path)
go_dataframe = pd.read_pickle(save_path + 'go_dataframe.pkl')

## 6- Term selection will be according to 4 and 5 as below with 9 groups 5 terms from each group will be selected randomly

For each group Lin similarity is calculated and 5 samples will be chosen  from most dissimilar terms which is calculated based on mean similarity of each term.

In [6]:
go_category_dataframe = pd.DataFrame(columns = ['Aspect','Number_Category', 'Term_Specificity',\
                                       'GO_IDs'])#,'Similarity_Matrix','Dissimilar_Terms'])

aspects = ('cellular_component','biological_process','molecular_function')
number_categories = ('Low','Middle','High')
term_specificities = ('Shallow','Normal','Specific')

for aspect in tqdm_notebook(aspects):
    go_dataframeAspect = go_dataframe.loc[(go_dataframe['Aspect'] == aspect)]
    for number_category in number_categories:
        for term_specificity in term_specificities:
            go_ids = list(set(go_dataframeAspect.loc[(go_dataframe['Number_Category'] == number_category) \
                                   & (go_dataframe['Term_Specificity'] == term_specificity)]['GO_ID']))
            
            go_category_dataframe = go_category_dataframe.append({'Aspect': aspect, \
                                        'Number_Category': number_category, \
                                       'Term_Specificity': term_specificity,\
                                       'GO_IDs': go_ids}
                                       ,ignore_index=True)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




In [7]:
go_category_dataframe

Unnamed: 0,Aspect,Number_Category,Term_Specificity,GO_IDs
0,cellular_component,Low,Shallow,"[GO:0000347, GO:0034719, GO:0008541, GO:000595..."
1,cellular_component,Low,Normal,"[GO:0098536, GO:0036156, GO:1990316, GO:000838..."
2,cellular_component,Low,Specific,"[GO:0005828, GO:0042583, GO:0070776, GO:009883..."
3,cellular_component,Middle,Shallow,"[GO:0097525, GO:0030017, GO:0043296, GO:009879..."
4,cellular_component,Middle,Normal,"[GO:0045111, GO:0030016, GO:0072686, GO:000581..."
5,cellular_component,Middle,Specific,"[GO:0016591, GO:0008021, GO:0101002, GO:000576..."
6,cellular_component,High,Shallow,"[GO:0016020, GO:0016021, GO:1990904, GO:000078..."
7,cellular_component,High,Normal,"[GO:0031981, GO:0005634, GO:0000228, GO:007006..."
8,cellular_component,High,Specific,[]
9,biological_process,Low,Shallow,"[GO:0007379, GO:0061900, GO:0070358, GO:001593..."


For each group all pairwise similarities will be checked. 

If any pair have similarity score over 0.25 then the second element of the pair will be deleted from the list and a next element in the sorted mean similarity list will be added. This loop will be applied until no pair have similarity score over 0.25.

In [44]:
def check_similarity_threshold(dissimilar_go_terms,go_id_of_min_similarity,similarity_matrix):
    for go_term in dissimilar_go_terms:
        if similarity_matrix.at[str(go_term), str(go_id_of_min_similarity)] > 0.25:
            return False
    return True
        

In [45]:
def get_n_minimums(d, n,similarity_matrix):
    dissimilar_go_terms = set()
    i=0
    similarity_check = True
    while(i<n):
        if len(d.keys()) > 0:
            min_key = min(d.keys(), key=lambda k: d[k])   
            similarity_check = check_similarity_threshold(dissimilar_go_terms,min_key,similarity_matrix)
            if similarity_check:
                dissimilar_go_terms.add(min_key)
                del d[min_key]
                i = i+1
            else:
                del d[min_key]
        else:
            return dissimilar_go_terms
    return dissimilar_go_terms

In [46]:
def get_most_dissimilar_terms(similarity_matrix,n):
    mean_similarity_dict = {}
    for go_id in list(set(similarity_matrix.columns)):
        mean_similarity_dict[go_id] = similarity_matrix[go_id].mean(axis=0)
    dissimilar_go_terms = get_n_minimums(mean_similarity_dict,n,similarity_matrix)
    return dissimilar_go_terms

In [47]:
fin_gaf = os.path.join(os.getcwd(), "goa_human.gaf")
associationsBP = dnld_assc(fin_gaf, obodag, namespace='BP')
associationsMF = dnld_assc(fin_gaf, obodag, namespace='MF')
associationsCC = dnld_assc(fin_gaf, obodag, namespace='CC')
# First get the counts of each GO term.
termcountsBP = TermCounts(obodag, associationsBP)
termcountsMF = TermCounts(obodag, associationsMF)
termcountsCC = TermCounts(obodag, associationsCC)

HMS:0:00:08.403387 487,062 annotations READ: /media/DATA/serbulent/Code/Thesis/ReviewPaper/GO_Prediction/goa_human.gaf 
17542 IDs in loaded association branch, BP
HMS:0:00:08.222395 487,062 annotations READ: /media/DATA/serbulent/Code/Thesis/ReviewPaper/GO_Prediction/goa_human.gaf 
17380 IDs in loaded association branch, MF
HMS:0:00:06.960854 487,062 annotations READ: /media/DATA/serbulent/Code/Thesis/ReviewPaper/GO_Prediction/goa_human.gaf 
18754 IDs in loaded association branch, CC


In [48]:
def semantic_similarity_srb(go_id1, go_id2, godag, branch_dist=None):
    '''
        Finds the semantic similarity (inverse of the semantic distance)
        between two GO terms.
    '''
    dist = semantic_distance(go_id1, go_id2, godag, branch_dist)
    if dist == 0:
        return 1
    if dist is not None:
        return 1.0 / float(dist)
    return None

In [49]:
def calculate_similarity_matrix(go_ids,aspect):
    similarity_matrix = pd.DataFrame(index = go_ids, columns = go_ids)
    if aspect == 'biological_process':
        termcounts = termcountsBP
    if aspect == 'molecular_function':
        termcounts = termcountsMF
    if aspect == 'cellular_component':
        termcounts = termcountsCC
    
    for i in tqdm_notebook(range(len(go_ids))):
        for j in range(len(go_ids)):
            term1 = str(go_ids[i])
            term2 = str(go_ids[j])
            sim_l = semantic_similarity_srb(term1,term2, obodag, termcounts)
            #print('Lin similarity score ({}, {}) = {}'.format(go_ids[i], go_ids[j], sim_l))
            similarity_matrix.at[term1, term2] = sim_l
    return similarity_matrix

In [50]:
#%%pixie_debugger
calculate_similarity_matrix(['GO:0101002','GO:0042581'],'cellular_component')

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




Unnamed: 0,GO:0101002,GO:0042581
GO:0101002,1.0,0.5
GO:0042581,0.5,1.0


In [51]:
# find_dissimilar_samples
go_category_dataframe["Similarity_Matrix"] = ""
go_category_dataframe["Dissimilar_Terms"] = ""
for index, row in tqdm_notebook(go_category_dataframe.iterrows(), total=go_category_dataframe.shape[0]):
#for index, row in go_category_dataframe.iterrows():
    #if len(row.GO_IDs) > 5:
    similarity_matrix = calculate_similarity_matrix(row.GO_IDs,row.Aspect)
    go_category_dataframe.at[index,"Similarity_Matrix"] = similarity_matrix
    go_category_dataframe.at[index,"Dissimilar_Terms"] = get_most_dissimilar_terms(similarity_matrix,5)

HBox(children=(IntProgress(value=0, max=27), HTML(value='')))

HBox(children=(IntProgress(value=0, max=431), HTML(value='')))

HBox(children=(IntProgress(value=0, max=201), HTML(value='')))

HBox(children=(IntProgress(value=0, max=33), HTML(value='')))

HBox(children=(IntProgress(value=0, max=118), HTML(value='')))

HBox(children=(IntProgress(value=0, max=62), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=49), HTML(value='')))

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1144), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3179), HTML(value='')))

HBox(children=(IntProgress(value=0, max=470), HTML(value='')))

HBox(children=(IntProgress(value=0, max=462), HTML(value='')))

HBox(children=(IntProgress(value=0, max=579), HTML(value='')))

HBox(children=(IntProgress(value=0, max=46), HTML(value='')))

HBox(children=(IntProgress(value=0, max=186), HTML(value='')))

HBox(children=(IntProgress(value=0, max=67), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=574), HTML(value='')))

HBox(children=(IntProgress(value=0, max=820), HTML(value='')))

HBox(children=(IntProgress(value=0, max=54), HTML(value='')))

HBox(children=(IntProgress(value=0, max=143), HTML(value='')))

HBox(children=(IntProgress(value=0, max=81), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=48), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [54]:
go_category_dataframe.to_pickle(save_path + 'go_category_dataframe.pkl')

In [4]:
go_category_dataframe = pd.read_pickle(save_path + 'go_category_dataframe.pkl')

In [61]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
go_category_dataframe[['Aspect','Number_Category','Term_Specificity','Dissimilar_Terms']]

Unnamed: 0,Aspect,Number_Category,Term_Specificity,Dissimilar_Terms
0,cellular_component,Low,Shallow,"{GO:0000124, GO:0097165, GO:0008024, GO:0032590, GO:0033162}"
1,cellular_component,Low,Normal,"{GO:0036020, GO:0070081, GO:0044754, GO:1990454, GO:0031089}"
2,cellular_component,Low,Specific,"{GO:0019908, GO:0005767, GO:0032009, GO:0005736, GO:0099061}"
3,cellular_component,Middle,Shallow,"{GO:0008076, GO:0005762, GO:0005747, GO:0036464, GO:0044853}"
4,cellular_component,Middle,Normal,"{GO:0000502, GO:0005925, GO:0022627, GO:0030665, GO:0034705}"
5,cellular_component,Middle,Specific,"{GO:0005766, GO:0016591, GO:0101002, GO:0008021}"
6,cellular_component,High,Shallow,"{GO:0005789, GO:1990234, GO:1903561, GO:0005740, GO:0070013}"
7,cellular_component,High,Normal,"{GO:0070062, GO:0000228, GO:0031981, GO:0015630, GO:0005768}"
8,cellular_component,High,Specific,{}
9,biological_process,Low,Shallow,"{GO:0032933, GO:0006348, GO:0042989, GO:0000054, GO:0039529}"


check if is there any value greater than 1 to see any bug exits

In [6]:
for sim_matrix in go_category_dataframe['Similarity_Matrix']:
    isDF = type(sim_matrix) is pd.core.frame.DataFrame
    if (isDF and True in set(sim_matrix.gt(1).any())):
        print(sim_matrix)

In [25]:
dissimilar_terms_dataframe = pd.DataFrame(columns = ['GO_ID', 'Number_of_Proteins', 'Proteins'])

for go_ids in list(go_category_dataframe["Dissimilar_Terms"]):
    for go_id in go_ids:
        annotated_proteins = list(set(propagated_go_terms \
                                      .loc[(propagated_go_terms['GO_ID'] == go_id)]['DB_OBJECT_ID']))
        number_of_proteins = len(annotated_proteins)
        dissimilar_terms_dataframe = dissimilar_terms_dataframe.append({'GO_ID': go_id,'Number_of_Proteins':number_of_proteins, 'Proteins': annotated_proteins},ignore_index=True)

In [26]:
dissimilar_terms_dataframe

Unnamed: 0,GO_ID,Number_of_Proteins,Proteins
0,GO:0000124,12,"[Q9HBM6, Q16514, Q9UPT9, P0C7V6, Q8NEM7, Q1659..."
1,GO:0097165,5,"[Q92797, P31483, Q08211, Q00613, P63165]"
2,GO:0008024,7,"[Q14004, P50750, O60563, O60583, O75909, Q1357..."
3,GO:0032590,25,"[P24588, Q99527, P04216, O43556, Q14003, P0621..."
4,GO:0033162,10,"[P51159, P07101, Q13637, P51810, P17643, P4012..."
5,GO:0036020,14,"[Q9NR96, O94973, Q2KHT3, Q00610, Q8NBP7, Q9NYK..."
6,GO:0070081,5,"[P54219, P20336, P21579, Q05940, P63027]"
7,GO:0044754,8,"[Q13501, P02794, Q9H492, P02792, Q8NEB9, Q5S00..."
8,GO:1990454,12,"[Q08289, Q01668, Q9UBN1, Q13698, P54284, P6295..."
9,GO:0031089,14,"[Q14624, Q7L3B6, P35625, P05452, P29622, Q9996..."


In [4]:
dissimilar_terms_dataframe_save_path = save_path + 'dissimilar_terms_dataframe.pkl'
#dissimilar_terms_dataframe.to_pickle(dissimilar_terms_dataframe_save_path)

In [5]:
dissimilar_terms_dataframe = pd.read_pickle(dissimilar_terms_dataframe_save_path)

In [30]:
save_path + 'go_category_dataframe.pkl'

'/media/DATA/serbulent/DATA/Thesis/ReviewPaper/preprocess/go_related_data/go_category_dataframe.pkl'

In [31]:
dissimilar_terms_dataframe_save_path

'/media/DATA/serbulent/DATA/Thesis/ReviewPaper/preprocess/go_related_data/dissimilar_terms_dataframe.pkl'

In [55]:
go_stats_dataframe = pd.DataFrame(columns = ['Aspect', 'Number_Category', 'Term_Specificity', 'Annotated_Protein_Number'])

for index,row in go_category_dataframe.iterrows():
    annotated_protein_number = 0
    for go_term in row['Dissimilar_Terms']:
        annotated_protein_number += int(dissimilar_terms_dataframe\
                  .loc[(dissimilar_terms_dataframe['GO_ID'] == go_term)]['Number_of_Proteins'])
    
    aspect = row['Aspect']
    number_category = row['Number_Category']
    term_specificity = row['Term_Specificity'] 
    go_stats_dataframe = go_stats_dataframe\
    .append({'Aspect': aspect,'Number_Category':number_category,'Term_Specificity': term_specificity,\
             'Annotated_Protein_Number':annotated_protein_number},ignore_index=True)
        
    

In [56]:
go_stats_dataframe

Unnamed: 0,Aspect,Number_Category,Term_Specificity,Annotated_Protein_Number
0,cellular_component,Low,Shallow,59
1,cellular_component,Low,Normal,53
2,cellular_component,Low,Specific,57
3,cellular_component,Middle,Shallow,466
4,cellular_component,Middle,Normal,650
5,cellular_component,Middle,Specific,475
6,cellular_component,High,Shallow,9589
7,cellular_component,High,Normal,9376
8,cellular_component,High,Specific,0
9,biological_process,Low,Shallow,33
