In [1]:
from Similarity_organize import *
import pickle
import os
import re
import pyreadr
import pandas as pd
import numpy as np
import requests, sys
from itertools import combinations
import seaborn as sns
from copy import deepcopy
import matplotlib.pyplot as plt

from goatools.semantic import TermCounts, get_info_content, common_parent_go_ids, min_branch_length, semantic_distance, semantic_similarity, deepest_common_ancestor
from goatools.semantic import resnik_sim, lin_sim
from goatools.associations import read_associations, dnld_assc
from goatools.base import get_godag
from goatools.obo_parser import GODag

## Load data

In [1]:
#### Setting
exp_name = 'PXD002892'

# Directory containing the cluster text files. The same (absolute) path is used
# both for listing the directory and for building the file paths; previously the
# listing used the absolute path while the built paths were relative, so opening
# the files only worked when the working directory happened to be '/'.
cluster_dir = '/FREEPII_github/code/FREEPII/Cluster/'
exp_path = [cluster_dir + fname for fname in os.listdir(cluster_dir)]
exp_path = [p for p in exp_path if ('txt' in p)]
# os.listdir returns entries in arbitrary order. Sort by condition prefix (the
# part of the file name before the first '_') so that exp_path[k] lines up with
# exp_cond[k] below -- assumes one cluster file per condition; TODO confirm.
exp_path = sorted(exp_path, key=lambda p: (p.split('/')[-1].split('_')[0], p))
print(*exp_path, sep='\n')

# Unique condition labels, taken from the file-name prefix of every cluster file.
exp_cond = sorted(set([p.split('/')[-1].split('_')[0] for p in exp_path]))
print(exp_cond)

# Pickled index dictionaries for this experiment: keep the 'idx_dict' files whose
# path contains 'SEC2' and whose trailing file-name token (without the '.pickle'
# extension) is one of the conditions found above.
load_dir1 = '/FREEPII_github/input/' + exp_name
dict_path = ['/'.join([load_dir1, i]) for i in os.listdir(load_dir1) if 'idx_dict' in i]
dict_path = [i for i in dict_path if 'SEC2' in i]
# The dot is escaped and the pattern anchored so only a literal '.pickle'
# extension is stripped (an unescaped '.' matches any character).
dict_path = [i for i in dict_path if (re.sub(r'\.pickle$', '', i.split('/')[-1].split('_')[-1]) in exp_cond)]
print(*dict_path, sep='\n')

# Directory with the GO ontology (.obo) and association files.
go_dir = '/FREEPII_github/GO-data'

### Exp-name-idx_dictionary

In [2]:
# For every dictionary file in dict_path, load the pickled mapping, build its
# inverse, and record the condition label taken from the file name.
#   name_idx_dict[k] : mapping exactly as stored in the pickle
#   idx_name_dict[k] : inverse mapping (stored values -> stored keys)
#   dict_idx_list[k] : trailing '_' token of the file name without '.pickle'
# NOTE(security): pickle.load can execute arbitrary code; only load files that
# come from a trusted source.
name_idx_dict = []
idx_name_dict = []
dict_idx_list = []
for cur_dict_path in dict_path:
    with open(cur_dict_path, 'rb') as f:
        forward_map = pickle.load(f)
    name_idx_dict.append(forward_map)
    # Invert key/value; if a value occurs more than once the last key wins.
    idx_name_dict.append({value: key for key, value in forward_map.items()})
    # Escape the dot and anchor at the end so only a literal '.pickle'
    # extension is removed (an unescaped '.' would match any character).
    dict_idx_list.append(re.sub(r'\.pickle$', '', cur_dict_path.split('/')[-1].split('_')[-1]))

print(len(name_idx_dict))
print([len(i) for i in name_idx_dict])
print('*'*50)

print(len(idx_name_dict))
print([len(i) for i in idx_name_dict])
print('*'*50)

print(dict_idx_list)

### Clusters

In [3]:
# Read the cluster file of every condition. idx_clusters[k] is the list of
# non-empty, stripped lines of the file belonging to exp_cond[k]; each line is
# one cluster.
# The file is looked up by its condition prefix instead of by list position,
# because exp_path (directory listing order) is not guaranteed to be ordered
# like the sorted exp_cond list.
idx_clusters = []
for cond in exp_cond:
    cond_paths = [p for p in exp_path if p.split('/')[-1].split('_')[0] == cond]
    if len(cond_paths) != 1:
        # Zero or several candidate files: the condition -> file mapping is
        # ambiguous, so fail loudly rather than load the wrong clusters.
        raise ValueError(
            'Expected exactly one cluster file for condition %r, found %d: %s'
            % (cond, len(cond_paths), cond_paths)
        )
    with open(cond_paths[0], 'r') as fh:
        # Skip blank lines (e.g. a trailing newline) so they do not become
        # empty-string clusters that cannot be parsed as indices later.
        idx_clusters.append([line.strip() for line in fh if line.strip()])
print(len(idx_clusters))
print([len(idx_clusters[i]) for i in range(len(idx_clusters))])
print(*idx_clusters[0][:5], sep='\n')

### GO

In [5]:
# Load the GO DAG from the local 'go-basic.obo' file in go_dir. The
# 'relationship' optional attribute is requested in addition to the defaults
# (presumably so non-is_a relationships are available to the similarity code
# -- TODO confirm against Similarity_organize).
go = get_godag(go_dir + '/go-basic.obo', optional_attrs={'relationship'})

### Association

In [6]:
# Read the symbol-keyed human association files, one per GO namespace
# (BP, MF, CC), in that order.
assocs_symbol_BP, assocs_symbol_MF, assocs_symbol_CC = (
    read_associations(go_dir + '/association_symbol_' + namespace + '_Human.txt')
    for namespace in ('BP', 'MF', 'CC')
)

In [7]:
# Read the synonym-keyed human association files, one per GO namespace
# (BP, MF, CC), in that order.
assocs_synonym_BP, assocs_synonym_MF, assocs_synonym_CC = (
    read_associations(go_dir + '/association_synonym_' + namespace + '_Human.txt')
    for namespace in ('BP', 'MF', 'CC')
)

## Map cluster member indices to exp-names

In [8]:
# Translate every cluster from a whitespace-separated string of integer indices
# into a list of names, using the index -> name dictionary of the same
# condition. name_clusters[k][j] is the list of names in cluster j of
# condition exp_cond[k].
name_clusters = []
for cond, cond_clusters in zip(exp_cond, idx_clusters):
    # Select the dictionary whose label equals the condition exactly. A
    # substring test could pick the dictionary of a different condition whose
    # label merely contains this one.
    matching = [pos for pos, label in enumerate(dict_idx_list) if label == cond]
    if not matching:
        raise KeyError('No index dictionary found for condition %r' % (cond,))
    cur_dict = idx_name_dict[matching[0]]
    print(len(cur_dict))
    # str.split() without arguments tolerates tabs and repeated spaces.
    name_clusters.append(
        [[cur_dict[int(token)] for token in cluster.split()] for cluster in cond_clusters]
    )
print(len(name_clusters))
print([len(name_clusters[i]) for i in range(len(name_clusters))])
print(*name_clusters[0][:5], sep='\n')

## Calculate semantic similarity

### Convert each complex into the GO ID sets of its members

In [9]:
def _go_ids_per_member(assocs, clusters_by_cond):
    """Replace every member name by the list of GO IDs associated with it.

    Keeps the nesting condition -> cluster -> member; a name that is missing
    from ``assocs`` becomes an empty list.
    """
    annotated = []
    for clusters in clusters_by_cond:
        annotated.append(
            [[list(assocs.get(member, {})) for member in cluster] for cluster in clusters]
        )
    return annotated


go_cluster_BP = _go_ids_per_member(assocs_synonym_BP, name_clusters)
go_cluster_MF = _go_ids_per_member(assocs_synonym_MF, name_clusters)
go_cluster_CC = _go_ids_per_member(assocs_synonym_CC, name_clusters)

# Number of conditions, then number of clusters per condition, per namespace.
_go_clusters_all = (go_cluster_BP, go_cluster_MF, go_cluster_CC)
print(*map(len, _go_clusters_all))
print(*(list(map(len, by_cond)) for by_cond in _go_clusters_all))

### Filter out complex members without GO terms

In [10]:
def _drop_members_without_go(clusters_by_cond):
    """Remove members whose GO ID list is empty, keeping the nesting intact."""
    return [
        [[go_ids for go_ids in cluster if go_ids != []] for cluster in clusters]
        for clusters in clusters_by_cond
    ]


go_cluster_BP = _drop_members_without_go(go_cluster_BP)
go_cluster_MF = _drop_members_without_go(go_cluster_MF)
go_cluster_CC = _drop_members_without_go(go_cluster_CC)

_filtered_all = (go_cluster_BP, go_cluster_MF, go_cluster_CC)
print(*map(len, _filtered_all))
print(*(list(map(len, by_cond)) for by_cond in _filtered_all))

# Per namespace (BP, MF, CC) and per condition, count the clusters by how many
# annotated members remain after filtering.
_size_checks = (
    lambda n: n == 0,  # all empty complex
    lambda n: n == 1,  # complex with single member
    lambda n: n == 2,  # edge
    lambda n: n > 2,
)
for position, by_cond in enumerate(_filtered_all):
    # Long separator before the first namespace, short one between namespaces.
    print('*' * (50 if position == 0 else 25))
    for size_check in _size_checks:
        print([sum(size_check(len(cluster)) for cluster in clusters) for clusters in by_cond])

### Calculate pairwise GO ID set similarity within each cluster

In [12]:
# Build one TermCounts object per namespace (BP, MF, CC) from the GO DAG and
# the corresponding synonym-keyed associations.
termcounts_BP, termcounts_MF, termcounts_CC = (
    TermCounts(go, namespace_assocs)
    for namespace_assocs in (assocs_synonym_BP, assocs_synonym_MF, assocs_synonym_CC)
)

In [11]:
def _mean_pairwise_gogo(member_go_sets, godag):
    """Score one cluster by the GOGO similarity between its members' GO ID sets.

    Parameters
    ----------
    member_go_sets : list of list
        One GO ID list per annotated cluster member.
    godag : GO DAG object passed through to ``Similarity_of_Set_of_GOTerms``.

    Returns
    -------
    0 when fewer than two annotated members are present (no pair can be
    formed), the single pair's score when there are exactly two members,
    otherwise the ``np.mean`` over all unordered member pairs.
    """
    n_members = len(member_go_sets)
    if n_members < 2:
        return 0
    pair_scores = [
        Similarity_of_Set_of_GOTerms(first, second, godag, 'GOGO', silent=True)
        for first, second in combinations(member_go_sets, 2)
    ]
    if n_members == 2:
        # Exactly one pair: return its score as-is instead of averaging.
        return pair_scores[0]
    return np.mean(pair_scores)


# GOGO_com_score_XX[l][idx] is the score of cluster idx in condition l for
# namespace XX. The result lists are built directly instead of deep-copying the
# (large) nested GO ID structure only to overwrite every entry.
GOGO_com_score_BP = []
GOGO_com_score_MF = []
GOGO_com_score_CC = []

for l in range(len(go_cluster_BP)):
    n_clusters = len(go_cluster_BP[l])
    cond_scores_BP, cond_scores_MF, cond_scores_CC = [], [], []
    for idx in range(n_clusters):
        cond_scores_BP.append(_mean_pairwise_gogo(go_cluster_BP[l][idx], go))
        cond_scores_MF.append(_mean_pairwise_gogo(go_cluster_MF[l][idx], go))
        cond_scores_CC.append(_mean_pairwise_gogo(go_cluster_CC[l][idx], go))

        # Progress report: condition index and fraction of clusters processed.
        if idx % 5 == 0:
            print(l, np.round(idx / n_clusters, 3))

    GOGO_com_score_BP.append(cond_scores_BP)
    GOGO_com_score_MF.append(cond_scores_MF)
    GOGO_com_score_CC.append(cond_scores_CC)

## Plot

In [12]:
# Box plot of the per-cluster BP scores, one box per condition.
char = 'Sementic similarity (Complex, GOGO, BP)'
# Build one column per condition. pd.concat(axis=1) aligns on the union of the
# row indices, so conditions with different numbers of clusters are all kept
# (shorter columns are padded with NaN). The previous DataFrame.join used a
# left join on the first condition's index and silently dropped the extra
# clusters of any condition that had more clusters than the first one.
df = pd.concat(
    [pd.Series(GOGO_com_score_BP[i], name=exp_cond[i]) for i in range(len(GOGO_com_score_BP))],
    axis=1,
)
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=df, width = 0.5, ax=ax).set(title=char)
plt.show()

In [13]:
# Box plot of the per-cluster MF scores, one box per condition.
char = 'Sementic similarity (Complex, GOGO, MF)'
# Build one column per condition. pd.concat(axis=1) aligns on the union of the
# row indices, so conditions with different numbers of clusters are all kept
# (shorter columns are padded with NaN). The previous DataFrame.join used a
# left join on the first condition's index and silently dropped the extra
# clusters of any condition that had more clusters than the first one.
df = pd.concat(
    [pd.Series(GOGO_com_score_MF[i], name=exp_cond[i]) for i in range(len(GOGO_com_score_MF))],
    axis=1,
)
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=df, width = 0.5, ax=ax).set(title=char)
plt.show()

In [14]:
# Box plot of the per-cluster CC scores, one box per condition.
char = 'Sementic similarity (Complex, GOGO, CC)'
# Build one column per condition. pd.concat(axis=1) aligns on the union of the
# row indices, so conditions with different numbers of clusters are all kept
# (shorter columns are padded with NaN). The previous DataFrame.join used a
# left join on the first condition's index and silently dropped the extra
# clusters of any condition that had more clusters than the first one.
df = pd.concat(
    [pd.Series(GOGO_com_score_CC[i], name=exp_cond[i]) for i in range(len(GOGO_com_score_CC))],
    axis=1,
)
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=df, width = 0.5, ax=ax).set(title=char)
plt.show()

## Save scores

In [17]:
# Directory (relative to the working directory) that receives the score files.
out_path = './Go_score'
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without a separate check-then-create step.
os.makedirs(out_path, exist_ok=True)

In [18]:
# Save one array per condition and namespace as
# '<out_path>/<condition>_GOGO_com_score_<namespace>' (np.save appends '.npy').
# The target directory comes from out_path, so the location is defined in one
# place only instead of being repeated as a literal.
for l, cond in enumerate(exp_cond):
    for namespace, scores_by_cond in (
        ('BP', GOGO_com_score_BP),
        ('MF', GOGO_com_score_MF),
        ('CC', GOGO_com_score_CC),
    ):
        np.save('/'.join([out_path, '_'.join([cond, 'GOGO_com_score_' + namespace])]), scores_by_cond[l])