In [1]:
# IMPORT PYTHON PACKAGES
# ----------------------

# makes the notebook cell print all outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
# path packages
import sys
from pathlib import Path
# data processing packages
import pandas as pd
# SET UP MY LOCAL PACKAGE
# -----------------------
# this step is only needed because the local package has not been released through pip

cwd = Path().absolute()

package_folder = cwd / Path('../src/danRerlib')
sys.path.append(str(package_folder))
import mapping, utils

# SET UP DATA DIRECTORY
# ---------------------
test_data_dir = cwd / Path('data/test/data/')
out_data_dir = cwd / Path('data/out_data/')

# note: I am using the Path package to take care of any operating
#       system differences for users of this tutorial

In [2]:
database_dir = cwd / Path('../src/danRerlib/database/')
human_dir = database_dir / Path('KEGG/hsa/')

In [3]:
p = human_dir / Path('hsa01100.txt')
data = pd.read_csv(p, sep='\t')
data

Unnamed: 0,Human NCBI Gene ID
0,10
1,100
2,10005
3,10007
4,100137049
...,...
1537,98
1538,9869
1539,9896
1540,9942


In [4]:
out = mapping.convert_to_zebrafish(data, 'NCBI Gene ID', keep_mapping=True)

In [5]:
list = ['10135', '10390']
m = mapping.convert_to_zebrafish(list, 'ZFIN ID')
print(m)
l=mapping.convert_ids(m, 'ZFIN ID', 'NCBI Gene ID')

0    ZDB-GENE-030131-2931
1      ZDB-GENE-081031-55
2      ZDB-GENE-081031-61
3       ZDB-GENE-110420-1
4       ZDB-GENE-200107-1
5       ZDB-GENE-200107-2
6      ZDB-GENE-071004-33
7      ZDB-GENE-081105-47
Name: ZFIN ID, dtype: object


In [6]:
m.to_list()

['ZDB-GENE-030131-2931',
 'ZDB-GENE-081031-55',
 'ZDB-GENE-081031-61',
 'ZDB-GENE-110420-1',
 'ZDB-GENE-200107-1',
 'ZDB-GENE-200107-2',
 'ZDB-GENE-071004-33',
 'ZDB-GENE-081105-47']

In [7]:
l

0       324211
1       570720
2    100149018
3    100148280
4    100330160
5       557816
Name: NCBI Gene ID, dtype: object

In [8]:
l.dropna()

0       324211
1       570720
2    100149018
3    100148280
4    100330160
5       557816
Name: NCBI Gene ID, dtype: object

In [9]:
import numpy as np
s=l
s = s.replace('nan', np.nan)
s.dropna()

0       324211
1       570720
2    100149018
3    100148280
4    100330160
5       557816
Name: NCBI Gene ID, dtype: object

out

In [10]:
rows_with_nan_entry = out[out['NCBI Gene ID'] == 'nan']


In [11]:
rows_with_nan_entry

Unnamed: 0,Human NCBI Gene ID,NCBI Gene ID


In [12]:
out

Unnamed: 0,Human NCBI Gene ID,NCBI Gene ID
0,100,436919
1,10005,450052
2,10007,550565
3,10020,393857
4,10026,415239
...,...,...
1483,9869,768131
1484,9896,100334605
1485,9896,100329711
1486,9942,393350


In [13]:
indices_with_nan = out[out.isna()].index.tolist()
len(indices_with_nan)

1488

In [14]:
rows_with_nan = out[out.isna().any(axis=1)]


In [15]:
rows_with_nan

Unnamed: 0,Human NCBI Gene ID,NCBI Gene ID


In [16]:
def convert_to_str(value):
    if value == 'nan':
        return np.nan  # Preserve NaN values
    else:
        return str(value)

In [17]:
NCBI_ID = 'NCBI Gene ID'
file_path = database_dir / Path('master_gene_mapping_file_V1.txt')
master_mapping = pd.read_csv(file_path, sep='\t')
master_mapping

Unnamed: 0,ZFIN ID,Symbol,NCBI Gene ID,Ensembl ID
0,ZDB-BR-071004-117,br.line.zfl2-1,100126124,
1,ZDB-GENE-000112-47,ppardb,30754,ENSDARG00000009473
2,ZDB-GENE-000125-12,igfbp2a,794176,ENSDARG00000052470
3,ZDB-GENE-000125-4,dlc,30120,ENSDARG00000002336
4,ZDB-GENE-000128-11,dbx1b,30416,ENSDARG00000001859
...,...,...,...,...
30298,ZDB-SNORNAG-201111-1,snord118a,,ENSDARG00000082850
30299,ZDB-SNORNAG-201111-2,snord118b.1,,ENSDARG00000115663
30300,ZDB-SNORNAG-201111-3,snord118b.2,,ENSDARG00000109514
30301,ZDB-SNORNAG-201111-4,snord118b.3,,ENSDARG00000081379


In [18]:
type(master_mapping[NCBI_ID].values[0])

str

In [19]:
master_mapping[NCBI_ID] = master_mapping[NCBI_ID].astype(str)
master_mapping

Unnamed: 0,ZFIN ID,Symbol,NCBI Gene ID,Ensembl ID
0,ZDB-BR-071004-117,br.line.zfl2-1,100126124,
1,ZDB-GENE-000112-47,ppardb,30754,ENSDARG00000009473
2,ZDB-GENE-000125-12,igfbp2a,794176,ENSDARG00000052470
3,ZDB-GENE-000125-4,dlc,30120,ENSDARG00000002336
4,ZDB-GENE-000128-11,dbx1b,30416,ENSDARG00000001859
...,...,...,...,...
30298,ZDB-SNORNAG-201111-1,snord118a,,ENSDARG00000082850
30299,ZDB-SNORNAG-201111-2,snord118b.1,,ENSDARG00000115663
30300,ZDB-SNORNAG-201111-3,snord118b.2,,ENSDARG00000109514
30301,ZDB-SNORNAG-201111-4,snord118b.3,,ENSDARG00000081379
