In [1]:
import json
from collections import defaultdict
import pandas as pd
from pathlib import Path
# Two-letter abbreviations for the GO type names; the keys are looked up from
# the first element of each term description (see add_go_description).
SHORT_TYPES = dict(
    Biological_process='BP',
    Molecular_function='MF',
    Cellular_component='CC',
)
# Base directory for the GO data files read and written by this notebook.
DIR = Path('/home/adam/Workspace/C#/SecStrAnnot2_data/SecStrAPI/CytochromesP450-20200707-full/GO')

In [2]:
# READ GO ANNOTATIONS
# The JSON document is read once; four of its top-level keys are used below.
GO_TERMS_FILE = DIR / 'SetNR-GO.json'
with GO_TERMS_FILE.open() as r:
    js = json.load(r)
# 'terms' maps each domain to its GO terms; 'descriptions' maps each GO term
# to an indexable record (type, name, definition are read from positions 0-2 later).
domain_to_terms, term_to_description = js['terms'], js['descriptions']
domain_to_uniprot, domain_to_uniprot_name = js['uniprot'], js['uniprot_name']
# Sorted key lists give the tables built later a stable row order.
domains = sorted(domain_to_terms)
terms = sorted(term_to_description)
len(domains), len(terms)

(183, 360)

In [3]:
# Invert the domain -> terms mapping into term -> list of domains.
# Kept as a defaultdict so that looking up a term with no domains yields [].
term_to_domains = defaultdict(list)
for domain in domain_to_terms:
    for term in domain_to_terms[domain]:
        term_to_domains[term].append(domain)

In [4]:
# SELECT TAXON GROUP ASSIGNMENTS FOR RELEVANT DOMAINS
# Copy only those rows of the parent-directory table whose domain has GO terms.
DOMAIN_TAXON_GROUPS_ORIG = DIR / '..' / 'domains_taxons_groups.tsv'
DOMAIN_TAXON_GROUPS = DIR / 'domains_taxons_groups.tsv'
with open(DOMAIN_TAXON_GROUPS_ORIG) as src, open(DOMAIN_TAXON_GROUPS, 'w') as dst:
    for row in src:
        fields = row.strip().split('\t')
        # Every row must have exactly three tab-separated columns.
        dom, tax, group = fields
        if dom not in domain_to_terms:
            continue
        dst.write('\t'.join(fields) + '\n')

In [5]:
# READ MANUALLY MODIFIED TAXON GROUP ASSIGNMENTS (added Anomalous group)
DOMAIN_TAXON_GROUPS_WITH_ANOMALOUS = DIR / 'domains_taxons_groups_with_anomalous.tsv'

# Build both directions of the domain <-> group relation in one pass.
domain_to_group = {}
group_to_domains = {}
with open(DOMAIN_TAXON_GROUPS_WITH_ANOMALOUS) as r:
    for line in r:
        # Three tab-separated columns: domain, taxon, group (taxon is not used here).
        dom, tax, group = line.strip().split('\t')
        domain_to_group[dom] = group
        group_to_domains.setdefault(group, []).append(dom)
groups = sorted(group_to_domains)

In [6]:
def add_go_description(df, term_to_description, short_types=None):
    """Append 'Type', 'Name' and 'Definition' columns to a GO-term-indexed table.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index values are keys of ``term_to_description``.
    term_to_description : mapping
        Maps each term to an indexable record: position 0 is the type name,
        position 1 the term name, position 2 the definition.
    short_types : mapping, optional
        Maps a type name to its abbreviation. Defaults to the module-level
        ``SHORT_TYPES``.

    Raises
    ------
    KeyError
        If an index value is missing from ``term_to_description`` or a type
        name is missing from ``short_types``.
    """
    if short_types is None:
        short_types = SHORT_TYPES
    # Look each term up once and reuse the record for all three columns.
    records = [term_to_description[t] for t in df.index]
    df['Type'] = [short_types[rec[0]] for rec in records]
    df['Name'] = [rec[1] for rec in records]
    df['Definition'] = [rec[2] for rec in records]

In [13]:
# CREATE PANDAS DATAFRAME
# Rows are GO terms, columns are taxon groups; each cell counts the
# (domain, term) annotations whose domain belongs to that group.
freq_table = pd.DataFrame(index=terms, columns=groups, data=0)
for dom, ts in sorted(domain_to_terms.items()):
    for term in ts:
        group = domain_to_group[dom]
        # One .loc[row, col] call instead of chained `freq_table[group].loc[term]`:
        # the chained form assigns into an intermediate Series, which under
        # pandas Copy-on-Write never reaches freq_table (all counts stay 0).
        freq_table.loc[term, group] += 1
# Row sums are taken before the text columns are added, so only counts are summed.
totals = freq_table.sum(axis=1)
freq_table['Total'] = totals
add_go_description(freq_table, term_to_description)
# Most frequent terms first.
freq_table = freq_table.sort_values(by='Total', ascending=False)
freq_table.to_csv(DIR / 'GO_groups.csv')

In [14]:
# CREATE PANDAS DATAFRAME - ANOMALOUS
# Rows are the GO terms carried by at least one domain of the selected group,
# columns are that group's domains; each cell counts the domain's annotations with the term.
group_here = 'Anom'
# .get() rather than [] so that terms without any domain do not insert
# empty lists into the shared term_to_domains defaultdict.
terms_here = [
    term for term in terms
    if any(domain_to_group[dom] == group_here for dom in term_to_domains.get(term, ()))
]
overview_table = pd.DataFrame(index=terms_here, columns=group_to_domains[group_here], data=0)
for dom, ts in sorted(domain_to_terms.items()):
    if domain_to_group[dom] == group_here:
        for term in ts:
            overview_table.loc[term, dom] += 1
# Row sums are taken before the text columns are added, so only counts are summed.
overview_table['Total'] = overview_table.sum(axis=1)
add_go_description(overview_table, term_to_description)
overview_table = overview_table.sort_values(by='Total', ascending=False)
# Prefix domain columns with their UniProt name; other columns keep their label.
overview_table = overview_table.rename(
    columns=(lambda col: f'{domain_to_uniprot_name[col]}/{col}' if col in domain_to_uniprot_name else col)
)
overview_table.to_csv(DIR / 'GO_anomalous.csv')