In [7]:
# IMPORT PYTHON PACKAGES
# ----------------------

# makes the notebook cell print all outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
# path packages
import sys
from pathlib import Path
# data processing packages
import pandas as pd
# LOCAL PACKAGE SETUP
# -------------------
# danRerlib has not been released through pip, so its source folder is added
# to sys.path and its modules are imported straight from there.

cwd = Path().absolute()

package_folder = cwd / '..' / 'src' / 'danRerlib'
sys.path.append(str(package_folder))
# these imports only resolve after the sys.path entry above has been added
import mapping
import utils
import KEGG
import GO

# DATA DIRECTORIES
# ----------------
# Paths are assembled with pathlib so the separators are right on every
# operating system a reader of this tutorial may be using.
test_data_dir = cwd / 'data' / 'test' / 'data'
out_data_dir = cwd / 'data' / 'out_data'

In [8]:
# TODO: write a function that counts how many GO concepts (terms) there are.
# Note: the GO file does not appear to specify an organism.

# Open question: does the ZFIN file contain duplicate rows?


In [60]:
# Read the ZFIN GO annotation file directly from its URL; it is already
# tab-separated, and pandas is told to treat '!' as the comment character.
# NOTE(review): with `comment='!'` pandas also stops parsing a data line at
# any later '!' -- confirm no field in this file contains that character.
url = 'https://current.geneontology.org/annotations/zfin.gaf.gz'
column_names = [
    'Database Designation', 'Marker ID', 'Gene Symbol', 'Qualifiers',
    'GO Term ID', 'Reference ID', 'GO Evidence Code', 'Inferred From',
    'Ontology', 'Marker Name', 'Marker Synonyms', 'Marker Type',
    'Taxon', 'Modification Date', 'Assigned By', 'Annotation Extension',
    'Gene Product Form ID',
]
# only the gene id, the GO term and the ontology code are kept
desired_columns = ['Marker ID', 'GO Term ID', 'Ontology']
df = pd.read_csv(url, sep='\t', comment='!', names=column_names, low_memory=False)
df = df[desired_columns].drop_duplicates()
df.head(3)

Unnamed: 0,Database Designation,Marker ID,Gene Symbol,Qualifiers,GO Term ID,Reference ID,GO Evidence Code,Inferred From,Ontology,Marker Name,Marker Synonyms,Marker Type,Taxon,Modification Date,Assigned By,Annotation Extension,Gene Product Form ID
0,ZFIN,ZDB-GENE-080212-6,zgc:173705,enables,GO:0046872,ZFIN:ZDB-PUB-020723-1,IEA,UniProtKB-KW:KW-0479,F,zgc:173705,,protein_coding_gene,taxon:7955,20220830,ZFIN,,
1,ZFIN,ZDB-GENE-040801-101,zgc:101048,enables,GO:0046872,ZFIN:ZDB-PUB-020723-1,IEA,UniProtKB-KW:KW-0479,F,zgc:101048,,protein_coding_gene,taxon:7955,20220830,ZFIN,,
2,ZFIN,ZDB-GENE-081028-59,si:ch211-165e15.1,enables,GO:0003779,ZFIN:ZDB-PUB-020723-1,IEA,UniProtKB-KW:KW-0009,F,si:ch211-165e15.1,,protein_coding_gene,taxon:7955,20220830,ZFIN,,


In [56]:
# Count the distinct GO term identifiers present in the current `df`.
num_unique_terms = df.loc[:, 'GO Term ID'].nunique()
print("Number of unique GO Term IDs:", num_unique_terms)

Number of unique GO Term IDs: 10420


In [72]:
def _get_ontology(namespace):
    if (namespace == 'biological_process'
        or namespace == 'Process') :
        return 'P'
    elif (namespace == 'molecular_function'
          or namespace == 'Function'):
        return 'F'
    elif (namespace == 'cellular_component'
          or namespace == 'Component'):
        return 'C'
    else:
        return None

In [77]:
# Build the human gene -> GO term table from the NCBI gene2go file.
url = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz'
df = pd.read_csv(url, sep='\t', compression='gzip')
# keep only the rows for tax id 9606 (the rows labelled "Human" below)
df = df[df['#tax_id']==9606]
desired_columns = ['GeneID', 'GO_ID', 'Category']
# The category column holds the GO ontology (converted to P / F / C below),
# so it is named 'Ontology'; the previous label 'Orthology' was a typo.
new_column_names = {'GeneID': 'Human NCBI Gene ID',
                    'GO_ID': 'GO ID',
                    'Category' : 'Ontology'}
# rename returns a new frame, so no in-place mutation is needed
df = df.loc[:, desired_columns].drop_duplicates().rename(columns=new_column_names)
# convert the category labels to one-letter codes
df['Ontology'] = df['Ontology'].apply(_get_ontology)
df.shape


(294559, 3)

In [62]:
# Show (rows, columns) of the current `df`.
# NOTE(review): the output recorded for this cell has 8 columns while the
# cell above leaves `df` with 3 -- it looks like this was executed against an
# earlier, unfiltered `df` (execution counts are out of order); confirm
# before relying on a top-to-bottom re-run.
df.shape

(526560, 8)

In [63]:
# Drop fully duplicated rows, then show the shape again so it can be compared
# with the shape displayed by the previous cell.
df = df.drop_duplicates()
df.shape

(526560, 8)

In [64]:
# Tally the rows per taxon id and report the two taxa of interest.
# NOTE(review): this needs the `#tax_id` column, which the human-table cell
# earlier in the notebook drops -- confirm which `df` is expected here.
counts = df['#tax_id'].value_counts()

# a taxon id that is absent from the tally is reported as 0
count_7955, count_9606 = (counts.get(tax_id, 0) for tax_id in (7955, 9606))

print("Count of #tax_id = 7955:", count_7955)
print("Count of #tax_id = 9606:", count_9606)

Count of #tax_id = 7955: 182632
Count of #tax_id = 9606: 343928


In [66]:

# Number of distinct GO ids among the rows whose #tax_id is 7955
# (`df` must still carry the `#tax_id` and `GO_ID` columns).
unique_go_terms_7955 = df.loc[df['#tax_id'] == 7955, 'GO_ID'].nunique()

print("Number of unique GO_ID terms where #tax_id = 7955:", unique_go_terms_7955)

Number of unique GO_ID terms where #tax_id = 7955: 10334


Split the annotations by taxon ID: zebrafish (7955) and human (9606).

In [None]:
# TODO: this cell was left unfinished -- `zfish = ` with no right-hand side is
# a SyntaxError and stops "Restart & Run All". Until the intended
# zebrafish subset is written, keep an explicit placeholder so the notebook
# still runs; `zfish` is reassigned further down before it is used.
zfish = None

In [None]:
# make zebrafish GO database:

# The ZFIN annotation file is read directly from its URL; it is already
# tab-separated, and '!' is passed to pandas as the comment character.
url = 'https://current.geneontology.org/annotations/zfin.gaf.gz'
column_names = [
    'Database Designation', 'Marker ID', 'Gene Symbol', 'Qualifiers',
    'GO Term ID', 'Reference ID', 'GO Evidence Code', 'Inferred From',
    'Ontology', 'Marker Name', 'Marker Synonyms', 'Marker Type',
    'Taxon', 'Modification Date', 'Assigned By', 'Annotation Extension',
    'Gene Product Form ID',
]
desired_columns = ['Marker ID', 'GO Term ID', 'Ontology']
new_column_names = {'Marker ID': 'ZFIN ID',
                    'GO Term ID': 'GO ID'}
# read -> keep the three columns of interest -> de-duplicate -> relabel
df = (
    pd.read_csv(url, sep='\t', comment='!', names=column_names, low_memory=False)
    [desired_columns]
    .drop_duplicates()
    .rename(columns=new_column_names)
)


In [79]:
# Load the main GO id table together with the zebrafish and human GO tables,
# then flag which GO ids of the main table occur in each of the other two.

main = '../src/danRerlib/database/GO/GO_ids_V1.txt'
zfish = '../src/danRerlib/database/GO/GO_dre_V1.txt'
human = '../src/danRerlib/database/GO/GO_hsa_V1.txt'

# all three files are tab-separated
df, df_zfish, df_human = (
    pd.read_csv(path, sep='\t') for path in (main, zfish, human)
)

# True where the GO id of a row is present in the corresponding table
df = df.assign(
    exists_hsa=df['GO ID'].isin(df_human['GO ID']),
    exists_dre=df['GO ID'].isin(df_zfish['GO ID']),
)
df


Unnamed: 0,GO ID,GO Name,Ontology,exists_hsa,exists_dre
0,GO:0000001,mitochondrion inheritance,P,False,False
1,GO:0000002,mitochondrial genome maintenance,P,True,True
2,GO:0000003,reproduction,P,True,True
3,GO:0000005,obsolete ribosomal chaperone activity,F,False,False
4,GO:0000006,high-affinity zinc transmembrane transporter a...,F,False,False
...,...,...,...,...,...
47552,GO:2001312,lysobisphosphatidic acid biosynthetic process,P,False,False
47553,GO:2001313,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,P,False,False
47554,GO:2001314,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,P,False,False
47555,GO:2001315,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,P,False,False
