In [1]:
# IMPORT PYTHON PACKAGES
# ----------------------

# makes the notebook cell print all outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
# path packages
import sys
from pathlib import Path
# data processing packages
import pandas as pd
# SET UP MY LOCAL PACKAGE
# -----------------------
# This step is only needed because the local package has not been released
# through pip: its source folder is appended to sys.path so that its modules
# (mapping, utils, KEGG, GO, settings) can be imported directly by name.

# Absolute path of the current working directory; the relative paths below
# are resolved against it.
cwd = Path().absolute()

# NOTE(review): the folder is spelled 'danRerLib' here, while the data paths in
# later cells spell it 'danRerlib'. On a case-sensitive filesystem only one of
# the two spellings can resolve -- confirm which one is correct.
package_folder = cwd / Path('../src/danRerLib')
sys.path.append(str(package_folder))
import mapping, utils, KEGG, GO, settings

# SET UP DATA DIRECTORY
# ---------------------
# Both directories are defined relative to the working directory.
test_data_dir = cwd / Path('data/test/data/')
out_data_dir = cwd / Path('data/out_data/')

# note: pathlib's Path is used to take care of any operating
#       system differences for users of this tutorial

In [11]:
# Look up the Ensembl IDs of ten ZFIN IDs directly in the raw ZFIN -> Ensembl table.

zfin_path = '../src/danRerlib/database/raw_data/zfin_to_ensembl_V1.txt'
df = pd.read_csv(zfin_path, sep='\t')

# A fresh random draw of ten ZFIN IDs. It is not used further in this cell;
# presumably the hard-coded list below is one such draw, frozen so that the
# example stays the same on every run.
trash = df['ZFIN ID'].sample(10).to_list()

random_entries = [
    'ZDB-GENE-040426-1218',
    'ZDB-GENE-030616-582',
    'ZDB-GENE-050327-38',
    'ZDB-GENE-050302-29',
    'ZDB-MIRNAG-090929-162',
    'ZDB-GENE-041111-305',
    'ZDB-GENE-040426-1159',
    'ZDB-GENE-050227-21',
    'ZDB-GENE-060526-320',
    'ZDB-GENE-040718-307',
]

# Keep only the rows for the chosen ZFIN IDs and only the Ensembl column.
filtered_df = df.loc[df['ZFIN ID'].isin(random_entries), ['Ensembl ID']]
filtered_df['Ensembl ID'].to_list()


['ENSDARG00000054292',
 'ENSDARG00000038780',
 'ENSDARG00000002406',
 'ENSDARG00000089310',
 'ENSDARG00000100596',
 'ENSDARG00000030881',
 'ENSDARG00000063570',
 'ENSDARG00000099828',
 'ENSDARG00000094489',
 'ENSDARG00000080838']

In [2]:
# Ten ZFIN IDs used as input for the ID conversion.
zfin_ids = ['ZDB-GENE-040426-1218',
    'ZDB-GENE-030616-582',
    'ZDB-GENE-050327-38',
    'ZDB-GENE-050302-29',
    'ZDB-MIRNAG-090929-162',
    'ZDB-GENE-041111-305',
    'ZDB-GENE-040426-1159',
    'ZDB-GENE-050227-21',
    'ZDB-GENE-060526-320',
    'ZDB-GENE-040718-307']

# Ensembl IDs expected for the ZFIN IDs above.
ens_ids = ['ENSDARG00000054292',
    'ENSDARG00000038780',
    'ENSDARG00000002406',
    'ENSDARG00000089310',
    'ENSDARG00000100596',
    'ENSDARG00000030881',
    'ENSDARG00000063570',
    'ENSDARG00000099828',
    'ENSDARG00000094489',
    'ENSDARG00000080838']

out_ids = mapping.convert_ids(zfin_ids, settings.ZFIN_ID, settings.ENS_ID, out_format=list)

# The expected IDs were defined but never compared with the result. Check them
# here so a regression in the mapping fails loudly instead of relying on an
# eyeball comparison of two lists. Sorted on both sides so the check does not
# depend on the order convert_ids returns.
assert sorted(out_ids) == sorted(ens_ids), 'convert_ids output does not match the expected Ensembl IDs'
out_ids

['ENSDARG00000054292',
 'ENSDARG00000038780',
 'ENSDARG00000002406',
 'ENSDARG00000089310',
 'ENSDARG00000100596',
 'ENSDARG00000030881',
 'ENSDARG00000063570',
 'ENSDARG00000099828',
 'ENSDARG00000094489',
 'ENSDARG00000080838']

In [8]:
# create a function to determine how many GO concepts there are
# It seems that the GO file does not have organism specified

# I am curious to know if the ZFIN file has duplicates


In [60]:
# download file into workspace...
# note this is in tsv format already
# The file has no header row, so the column names are supplied by hand below.
url = 'https://current.geneontology.org/annotations/zfin.gaf.gz'
column_names = ['Database Designation',
                'Marker ID',
                'Gene Symbol',
                'Qualifiers',
                'GO Term ID',
                'Reference ID',
                'GO Evidence Code',
                'Inferred From',
                'Ontology',
                'Marker Name',
                'Marker Synonyms',
                'Marker Type',
                'Taxon',
                'Modification Date',
                'Assigned By',
                'Annotation Extension', 
                'Gene Product Form ID']
# NOTE(review): comment='!' is used to skip the '!'-prefixed header lines, but
# pandas treats the character as a comment marker anywhere in a line, so a data
# field containing '!' would truncate that row -- confirm none does.
df = pd.read_csv(url, sep='\t', comment='!', names=column_names, low_memory=False)
# Keep only the gene ID, the GO term and the ontology letter, then drop
# repeated (gene, term, ontology) rows.
desired_columns = ['Marker ID', 'GO Term ID', 'Ontology']
df = df.loc[:, desired_columns].drop_duplicates()
# NOTE(review): the saved output under this cell lists all 17 columns, so it
# appears to predate the column selection above -- re-run to refresh it.
df.head(3)

Unnamed: 0,Database Designation,Marker ID,Gene Symbol,Qualifiers,GO Term ID,Reference ID,GO Evidence Code,Inferred From,Ontology,Marker Name,Marker Synonyms,Marker Type,Taxon,Modification Date,Assigned By,Annotation Extension,Gene Product Form ID
0,ZFIN,ZDB-GENE-080212-6,zgc:173705,enables,GO:0046872,ZFIN:ZDB-PUB-020723-1,IEA,UniProtKB-KW:KW-0479,F,zgc:173705,,protein_coding_gene,taxon:7955,20220830,ZFIN,,
1,ZFIN,ZDB-GENE-040801-101,zgc:101048,enables,GO:0046872,ZFIN:ZDB-PUB-020723-1,IEA,UniProtKB-KW:KW-0479,F,zgc:101048,,protein_coding_gene,taxon:7955,20220830,ZFIN,,
2,ZFIN,ZDB-GENE-081028-59,si:ch211-165e15.1,enables,GO:0003779,ZFIN:ZDB-PUB-020723-1,IEA,UniProtKB-KW:KW-0009,F,si:ch211-165e15.1,,protein_coding_gene,taxon:7955,20220830,ZFIN,,


In [56]:
# Count the distinct GO terms in the annotation table; df must still be the
# ZFIN table that has a 'GO Term ID' column.
num_unique_terms = df['GO Term ID'].nunique()
print("Number of unique GO Term IDs:", num_unique_terms)

Number of unique GO Term IDs: 10420


In [72]:
def _get_ontology(namespace):
    if (namespace == 'biological_process'
        or namespace == 'Process') :
        return 'P'
    elif (namespace == 'molecular_function'
          or namespace == 'Function'):
        return 'F'
    elif (namespace == 'cellular_component'
          or namespace == 'Component'):
        return 'C'
    else:
        return None

In [77]:
# Build the human GO annotation table from the NCBI gene2go file.
url = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz'
df = pd.read_csv(url, sep='\t', compression='gzip')

# Keep the human rows only (tax id 9606).
df = df[df['#tax_id']==9606]

# Reduce to gene ID, GO term and category, and drop repeated rows.
desired_columns = ['GeneID', 'GO_ID', 'Category']
df = df.loc[:, desired_columns].drop_duplicates()

# The category column holds the GO ontology, so it is named 'Ontology'
# (it was previously misspelled 'Orthology'), matching the column name used
# by the other GO tables in this notebook.
new_column_names = {'GeneID': 'Human NCBI Gene ID',
                    'GO_ID': 'GO ID',
                    'Category' : 'Ontology'}
df = df.rename(columns=new_column_names)

# Replace the category label with its one-letter code (P / F / C).
df['Ontology'] = df['Ontology'].apply(_get_ontology)
df.shape


(294559, 3)

In [62]:
df.shape

(526560, 8)

In [63]:
# Drop fully duplicated rows and show the shape again. The saved outputs show
# the same shape before and after, i.e. no duplicate rows were found.
df = df.drop_duplicates()
df.shape

(526560, 8)

In [64]:
# Count the rows per taxon ID in the gene2go table.
# NOTE(review): this needs a df that still has the '#tax_id' column. The NCBI
# cell above reduces df to three columns without it, so on a top-to-bottom run
# this cell would fail -- it appears to have been run against an earlier,
# unfiltered version of df.
counts = df['#tax_id'].value_counts()

# .get(..., 0) gives 0 rather than an error when a taxon ID is absent.
count_7955 = counts.get(7955, 0)
count_9606 = counts.get(9606, 0)

print("Count of #tax_id = 7955:", count_7955)
print("Count of #tax_id = 9606:", count_9606)

Count of #tax_id = 7955: 182632
Count of #tax_id = 9606: 343928


In [66]:

# Number of distinct GO IDs among the rows with tax id 7955.
# NOTE(review): like the previous cell, this needs df to still carry the
# '#tax_id' and 'GO_ID' columns of the raw gene2go table.
unique_go_terms_7955 = df[df['#tax_id'] == 7955]['GO_ID'].nunique()

print("Number of unique GO_ID terms where #tax_id = 7955:", unique_go_terms_7955)

Number of unique GO_ID terms where #tax_id = 7955: 10334


Split the annotations by taxon ID: zebrafish (7955) and human (9606).

In [None]:
# NOTE(review): this cell was left as an unfinished assignment (`zfish = `),
# which is a syntax error. It is completed here as the zebrafish (tax id 7955)
# subset of the gene2go table, which requires df to still have the '#tax_id'
# column -- confirm this is the intended split.
zfish = df[df['#tax_id'] == 7955]

In [None]:
# Build the zebrafish GO annotation table from the ZFIN GAF file.

# The file is read straight from the URL; it is already tab-separated and has
# no header row, so the column names are supplied here.
url = 'https://current.geneontology.org/annotations/zfin.gaf.gz'
column_names = [
    'Database Designation',
    'Marker ID',
    'Gene Symbol',
    'Qualifiers',
    'GO Term ID',
    'Reference ID',
    'GO Evidence Code',
    'Inferred From',
    'Ontology',
    'Marker Name',
    'Marker Synonyms',
    'Marker Type',
    'Taxon',
    'Modification Date',
    'Assigned By',
    'Annotation Extension',
    'Gene Product Form ID',
]
desired_columns = ['Marker ID', 'GO Term ID', 'Ontology']
new_column_names = {'Marker ID': 'ZFIN ID',
                    'GO Term ID': 'GO ID'}

# One pipeline: read -> keep the three columns -> drop repeated rows -> rename.
df = (
    pd.read_csv(url, sep='\t', comment='!', names=column_names, low_memory=False)
    .loc[:, desired_columns]
    .drop_duplicates()
    .rename(columns=new_column_names)
)


In [79]:
# Load the GO term table and the two per-organism annotation tables.

main = '../src/danRerlib/database/GO/GO_ids_V1.txt'
zfish = '../src/danRerlib/database/GO/GO_dre_V1.txt'
human = '../src/danRerlib/database/GO/GO_hsa_V1.txt'

df_zfish = pd.read_csv(zfish, sep='\t')
df_human = pd.read_csv(human, sep='\t')

# Flag each GO term by whether it occurs in the human / zebrafish annotations.
df = pd.read_csv(main, sep='\t').assign(
    exists_hsa=lambda terms: terms['GO ID'].isin(df_human['GO ID']),
    exists_dre=lambda terms: terms['GO ID'].isin(df_zfish['GO ID']),
)
df


Unnamed: 0,GO ID,GO Name,Ontology,exists_hsa,exists_dre
0,GO:0000001,mitochondrion inheritance,P,False,False
1,GO:0000002,mitochondrial genome maintenance,P,True,True
2,GO:0000003,reproduction,P,True,True
3,GO:0000005,obsolete ribosomal chaperone activity,F,False,False
4,GO:0000006,high-affinity zinc transmembrane transporter a...,F,False,False
...,...,...,...,...,...
47552,GO:2001312,lysobisphosphatidic acid biosynthetic process,P,False,False
47553,GO:2001313,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,P,False,False
47554,GO:2001314,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,P,False,False
47555,GO:2001315,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,P,False,False


In [3]:
# Load the zebrafish GO annotation table and replace its 'ZFIN ID' column with
# the mapped 'NCBI Gene ID' column. keep_old_ids=False and drop_na=True are
# passed through to the package helper; the displayed result has the columns
# NCBI Gene ID, GO ID and Ontology.
GO_PATH_dre = '../src/danRerlib/database/GO/GO_dre_V1.txt'
df = pd.read_csv(GO_PATH_dre, sep = '\t')
mapped = mapping.add_mapped_column(df, 'ZFIN ID', 'NCBI Gene ID', keep_old_ids=False, drop_na=True)
mapped

Unnamed: 0,NCBI Gene ID,GO ID,Ontology
0,103908622,GO:0046872,F
1,103908622,GO:0000981,F
2,103908622,GO:0000977,F
3,103908622,GO:0006357,P
4,103908622,GO:0005634,C
...,...,...,...
176125,100216528,GO:0048514,P
176130,100033591,GO:0048514,P
176149,100033691,GO:0035195,P
176150,100033691,GO:0033339,P


In [4]:
# Round trip: map the NCBI Gene IDs from the previous cell back to ZFIN IDs,
# using the same options as the forward mapping.
new_mapped = mapping.add_mapped_column(mapped, 'NCBI Gene ID', 'ZFIN ID', keep_old_ids=False, drop_na=True)
new_mapped

Unnamed: 0,ZFIN ID,GO ID,Ontology
0,ZDB-GENE-080212-6,GO:0046872,F
1,ZDB-GENE-080212-6,GO:0000981,F
2,ZDB-GENE-080212-6,GO:0000977,F
3,ZDB-GENE-080212-6,GO:0006357,P
4,ZDB-GENE-080212-6,GO:0005634,C
...,...,...,...
151184,ZDB-MIRNAG-081209-10,GO:0048514,P
151185,ZDB-MIRNAG-081209-8,GO:0048514,P
151186,ZDB-MIRNAG-081212-1,GO:0035195,P
151187,ZDB-MIRNAG-081212-1,GO:0033339,P


In [14]:
# GO concept used for the example calls in the following cells.
concept_id = 'GO:0033339'
# Local GO annotation files, one per organism key ('dre', 'dreM', 'hsa').
GO_PATH_dre = '../src/danRerlib/database/GO/GO_dre_V1.txt'
GO_PATH_dreM = '../src/danRerlib/database/GO/GO_dreM_V1.txt'
GO_PATH_hsa = '../src/danRerlib/database/GO/GO_hsa_V1.txt'

# Remote source files (not referenced by the function defined in this cell).
GO_BASIC_URL = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
GO_ZFIN_URL = 'https://current.geneontology.org/annotations/zfin.gaf.gz'
GO_NCBI_URL = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz'

# Names of the gene ID columns in the annotation files.
HUMAN_ID = 'Human NCBI Gene ID'
ZFIN_ID = 'ZFIN ID'

def get_genes_in_GO_concept(concept_id, organism, gene_id_type = None):
    """Return the gene IDs annotated to one GO concept for one organism.

    Parameters
    ----------
    concept_id : str
        GO ID to look up; compared for equality with the 'GO ID' column.
    organism : str
        'hsa', 'dre' or 'dreM'; selects the annotation file to read.
    gene_id_type : optional
        Target gene ID type. When given (truthy) and the organism is not
        'hsa', the IDs are passed through ``mapping.convert_ids``; for 'hsa'
        it is ignored.

    Returns
    -------
    The gene ID column of the matching rows as a pandas Series, or whatever
    ``mapping.convert_ids`` returns when a conversion is requested.

    Raises
    ------
    ValueError
        If ``organism`` is not one of the three supported keys.
    """
    # Choose the annotation file and the gene ID column stored in it.
    if organism == 'hsa':
        path, gene_id_type_from_db = GO_PATH_hsa, HUMAN_ID
    elif organism in ('dre', 'dreM'):
        gene_id_type_from_db = ZFIN_ID
        path = GO_PATH_dre if organism == 'dre' else GO_PATH_dreM
    else:
        raise ValueError('Invalid organism.')

    annotations = pd.read_csv(path, sep = '\t')

    # Gene ID column (ZFIN or human NCBI, depending on the organism) of the
    # rows annotated to the requested concept.
    gene_ids_series = annotations.loc[annotations['GO ID'] == concept_id,
                                      gene_id_type_from_db]

    # No conversion when none was requested, or for human IDs.
    if not gene_id_type or organism == 'hsa':
        return gene_ids_series
    return mapping.convert_ids(gene_ids_series,
                               gene_id_type_from_db, gene_id_type)

# Example: zebrafish genes in the concept. No gene_id_type is passed, so the
# IDs come back as stored in the file's 'ZFIN ID' column.
out = get_genes_in_GO_concept(concept_id, 'dre')
out.head(3)
out.shape

14893       ZDB-GENE-980526-72
154216    ZDB-GENE-030131-3197
154347      ZDB-GENE-060503-14
Name: ZFIN ID, dtype: object

(53,)

In [19]:
# check if GO ID exists
# Path to the GO term table; the function below reads the 'exists_hsa' /
# 'exists_dre' columns from it.
GO_IDS_PATH = '../src/danRerlib/database/GO/GO_ids_V1.txt'
# NOTE(review): check_if_id_exists re-reads the file itself, so this df is not
# used by it.
df = pd.read_csv(GO_IDS_PATH, sep = '\t')

def check_if_id_exists(concept_id, organism, ids_path=None):
    """Report whether a GO concept has annotations for the given organism.

    Parameters
    ----------
    concept_id : str
        GO ID to look up in the 'GO ID' column of the GO term table.
    organism : str
        'hsa', 'dre' or 'dreM'.
    ids_path : path-like, optional
        Tab-separated GO term table with 'GO ID', 'exists_hsa' and
        'exists_dre' columns. Defaults to the module-level ``GO_IDS_PATH``.

    Returns
    -------
    bool
        The organism's flag for the first matching row, or False when the
        GO ID is not in the table at all.

    Raises
    ------
    ValueError
        If ``organism`` is not one of the supported keys.
    """
    flag_column_by_organism = {
        'hsa': 'exists_hsa',
        'dre': 'exists_dre',
        # NOTE(review): 'dreM' reads the human flag, as in the original code;
        # presumably because those annotations are mapped from human -- confirm.
        'dreM': 'exists_hsa',
    }
    # Validate the organism before touching the file so a bad key fails fast.
    target_column = (flag_column_by_organism.get(organism)
                     if isinstance(organism, str) else None)
    if target_column is None:
        raise ValueError('Invalid Organism')

    if ids_path is None:
        ids_path = GO_IDS_PATH
    df = pd.read_csv(ids_path, sep = '\t')
    matches = df.loc[df['GO ID'] == concept_id, target_column]

    # An ID missing from the table does not exist for any organism; previously
    # this case raised IndexError from .iloc[0].
    if matches.empty:
        return False
    # Convert the numpy boolean to a plain Python bool.
    return bool(matches.iloc[0])

check_if_id_exists(concept_id, 'dreM')

False

In [27]:



def is_numeric(value):
    """Tell whether ``value`` is a number or a string made of plain digits.

    Parameters
    ----------
    value : object
        Value to inspect.

    Returns
    -------
    bool
        For strings: True only for a non-empty run of ASCII digits 0-9.
        For anything else: True when the value is an int or a float.
    """
    if isinstance(value, str):
        # str.isnumeric() also accepts characters such as superscripts and
        # fractions, which cannot be part of a GO ID; restrict to ASCII digits.
        return value.isascii() and value.isdigit()
    return isinstance(value, (int, float))

def check_id_format(id):
    """Normalise a Gene Ontology ID to the canonical 'GO:NNNNNNN' form.

    Parameters
    ----------
    id : str, int or float
        Either a full GO ID ('GO:0000337'), or just its numeric part as a
        digit string ('337', '0000337'), an int (337) or an integer-valued
        float (337.0).

    Returns
    -------
    str
        The ID as 'GO:' followed by exactly seven digits.

    Raises
    ------
    ValueError
        If the numeric part is negative, fractional, NaN/inf, a bool, or longer
        than seven digits; or if a string ID is not ten characters long, lacks
        the 'GO:' prefix, or does not end in seven digits.
    TypeError
        If ``id`` is neither a string nor a number.
    """
    # bool is a subclass of int; True/False are never valid GO IDs.
    if isinstance(id, bool):
        raise ValueError('Unknown Gene Ontology ID')

    if is_numeric(id):
        if isinstance(id, float):
            # Accept 337.0 but reject 3.5, NaN and inf (is_integer() is False
            # for all three).
            if not id.is_integer():
                raise ValueError('Unknown Gene Ontology ID')
            id = int(id)
        digits = str(id)
        # Negative numbers carry a '-' and fail isdigit(); more than seven
        # digits cannot be a GO ID.
        if not digits.isdigit() or len(digits) > 7:
            raise ValueError('Unknown Gene Ontology ID')
        return 'GO:' + digits.zfill(7)

    if not isinstance(id, str):
        raise TypeError('GO ID should be a string or a number')
    if len(id) != 10:
        raise ValueError('GO ID should be length 10')
    if id[0:3] != 'GO:':
        raise ValueError("The prefix should be 'GO:'")
    # The original accepted any seven characters after the prefix.
    if not is_numeric(id[3:]):
        raise ValueError('GO ID should end in 7 digits')
    return id


In [32]:
# Example: a bare 7-digit string is given the 'GO:' prefix.
# Named raw_go_id rather than `id` so the built-in id() is not shadowed in the
# notebook's global namespace.
raw_go_id = '0000337'
check_id_format(raw_go_id)

'GO:0000337'

In [None]:
GO:0000015