In [6]:
from danrerlib import enrichment
from danrerlib import mapping
import pandas as pd

In [2]:
NCBI_ID = 'NCBI Gene ID'

In [3]:
gene_universe = pd.read_csv('data/test_data/example_diff_express_data.txt', sep = '\t')
gene_universe.head(5)

Unnamed: 0,NCBI Gene ID,PValue,logFC
0,100000006,0.792615,0.115009
1,100000044,0.015286,0.803879
2,100000085,0.264762,0.26736
3,100000101,0.279948,-0.281195
4,100000125,0.194151,0.154282


In [5]:
required_columns = ['NCBI Gene ID', 'PValue', 'log2FC']
existing_columns = gene_universe.columns
existing_columns[0]

'NCBI Gene ID'

In [8]:
new_df = mapping.add_mapped_column(gene_universe, 'NCBI Gene ID', 'ZFIN ID', keep_old_ids=False)

In [11]:
new_df = new_df.rename(columns={'PValue': 'p'})
new_df

Unnamed: 0,ZFIN ID,p,logFC
0,ZDB-GENE-030131-5654,7.926150e-01,0.115009
1,ZDB-GENE-121214-60,1.528565e-02,0.803879
2,ZDB-GENE-030131-1312,2.647625e-01,0.267360
3,ZDB-GENE-050208-155,2.799478e-01,-0.281195
4,ZDB-GENE-060503-666,1.941513e-01,0.154282
...,...,...,...
5460,ZDB-GENE-010412-1,5.799081e-01,-0.260927
5461,ZDB-GENE-010426-7,5.629701e-01,-0.122201
5462,ZDB-GENE-010501-5,8.448122e-01,-0.028498
5463,ZDB-GENE-010509-2,5.418643e-01,0.116959


In [12]:
gene_universe = new_df
specified_gene_id_type = 'ZFIN ID'

In [13]:
required_columns = ['NCBI Gene ID', 'PValue', 'log2FC']
existing_columns = gene_universe.columns

gene_id_type_from_data = existing_columns[0]
if specified_gene_id_type != gene_id_type_from_data:
    gene_universe = gene_universe.rename(columns={gene_id_type_from_data: specified_gene_id_type})

if specified_gene_id_type != required_columns[1]:
    gene_universe = mapping.add_mapped_column(gene_universe, specified_gene_id_type, required_columns[0], keep_old_ids=False)

missing_columns = [col for col in required_columns if col not in existing_columns]

# If any required columns are missing, rename existing columns to required names
# THIS ASSUMES THE FIRST COLUMN IS GENE ID, SECOND COLUMN PVAL, THIRD COLUMN LOG2FC
if missing_columns:
    # Create a dictionary to map existing column names to the required names
    column_mapping = {col: required_columns[i] for i, col in enumerate(existing_columns)}

    # Rename the columns based on the mapping
    gene_universe = gene_universe.rename(columns=column_mapping)
gene_universe

Unnamed: 0,NCBI Gene ID,PValue,log2FC
0,100000006,7.926150e-01,0.115009
1,100000044,1.528565e-02,0.803879
2,100000085,2.647625e-01,0.267360
3,100000101,2.799478e-01,-0.281195
4,100000125,1.941513e-01,0.154282
...,...,...,...
5460,83415,5.799081e-01,-0.260927
5461,83775,5.629701e-01,-0.122201
5462,83910,8.448122e-01,-0.028498
5463,84037,5.418643e-01,0.116959


In [5]:
background_gene_set = pd.read_csv('data/test_data/background_gene_list.txt', sep = '\t')
background_gene_set.head(5)

Unnamed: 0,NCBI Gene ID
0,100000006
1,100000044
2,100000085
3,100000101
4,100000125


In [7]:
gene_set = pd.read_csv('data/test_data/dre04910.txt', sep = '\t')
gene_set.head(5)

Unnamed: 0,NCBI Gene ID
0,100000252
1,100000750
2,100001198
3,100001260
4,100002225


In [14]:
database = 'KEGG Pathway'
database.split()[0]

'KEGG'

In [10]:
# Merge gene_universe and background_gene_set to get information on all genes in the universe
gene_universe_all = pd.merge(gene_universe, background_gene_set, on=NCBI_ID, how='right', suffixes=('_universe', '_background'))
gene_universe_all.head(5)

Unnamed: 0,NCBI Gene ID,PValue,logFC
0,100000006,0.792615,0.115009
1,100000044,0.015286,0.803879
2,100000085,0.264762,0.26736
3,100000101,0.279948,-0.281195
4,100000125,0.194151,0.154282


In [11]:
merged_df = pd.merge(gene_universe_all, gene_set, on=NCBI_ID, how='right', suffixes=('_universe', '_set'))
merged_df.head(5)

Unnamed: 0,NCBI Gene ID,PValue,logFC
0,100000252,,
1,100000750,,
2,100001198,,
3,100001260,0.017629,-0.411971
4,100002225,0.002524,0.714442


In [None]:
# Count the number of genes in the gene universe
total_genes = len(gene_universe_all)

In [13]:
common_genes_in_universe = set(gene_universe_all[NCBI_ID]).intersection(set(background_gene_set[NCBI_ID]))
total_genes = len(common_genes_in_universe)

In [14]:
total_genes

5464

In [None]:
total_number_of_genes_in_universe = len(gene_universe)

In [34]:
total_number_of_genes_in_universe = len(gene_universe)
sig_genes_set = set(gene_universe[NCBI_ID][gene_universe['PValue'].lt(0.05)])
gene_set = gene_set[gene_set[NCBI_ID].isin(gene_universe[NCBI_ID])]
genes_in_set = set(gene_set[NCBI_ID])

In [35]:
# Number of genes that are both in the gene set and significantly expressed
a = len(sig_genes_set.intersection(genes_in_set))

# Number of genes in the gene set but not significantly expressed
b = len(genes_in_set.difference(sig_genes_set))

# Number of genes that are significantly expressed but not in the gene set
c = len(sig_genes_set.difference(genes_in_set))

# Number of genes neither in the gene set nor significantly expressed
d = total_number_of_genes_in_universe - (a + b + c)

In [38]:
d

4414

In [27]:
from scipy.stats import fisher_exact


In [28]:
odds_ratio, p_value = fisher_exact([[a, b], [c, d]], alternative='greater')


In [30]:
p_value

0.9999999996258014

In [15]:
path = '../src/danrerlib/database/KEGG/empty_disease_ids_V1.txt'
empty_disease = pd.read_csv(path, sep = '\t')

In [19]:
empty_disease = empty_disease.drop(columns={'Unnamed: 0'})


KeyError: "['Unnamed: 0'] not found in axis"

In [20]:
empty_disease.to_csv(path, sep = '\t', index = False)

In [21]:
path = '../src/danrerlib/database/KEGG/disease_ids_V1.txt'
full_disease = pd.read_csv(path, sep = '\t')
full_disease

Unnamed: 0,Disease ID,Disease Description
0,H00001,B-cell acute lymphoblastic leukemia; B-cell ac...
1,H00002,T-cell acute lymphoblastic leukemia; T-cell ac...
2,H00003,Acute myeloid leukemia
3,H00004,Chronic myeloid leukemia
4,H00005,Chronic lymphocytic leukemia
...,...,...
2641,H02649,Autosomal dominant slowed nerve conduction vel...
2642,H02650,Menke-Hennekam syndrome
2643,H02651,Lessel-Kreienkamp syndrome
2644,H02652,"Macrocephaly, acquired, with impaired intellec..."


In [23]:
full_disease = full_disease[~full_disease['Disease ID'].isin(empty_disease['Disease ID'])]

In [25]:
full_disease.to_csv('../src/danrerlib/database/KEGG/disease_ids_valid_V1.txt', sep = '\t', index = False)

In [22]:
# Assuming 'Disease ID' is the common column between the two DataFrames

# Sample data for empty_disease
empty_data = {'Disease ID': [1, 3, 5]}
empty_disease2 = pd.DataFrame(empty_data)

# Sample data for full_disease
full_data = {'Disease ID': [1, 2, 3, 4, 5],
             'Disease Name': ['A', 'B', 'C', 'D', 'E']}
full_disease2 = pd.DataFrame(full_data)

# Display the original full_disease DataFrame
print("Original full_disease DataFrame:")
print(full_disease2)

# Remove rows from full_disease based on 'Disease ID' in empty_disease
full_disease2 = full_disease2[~full_disease2['Disease ID'].isin(empty_disease2['Disease ID'])]

# Display the modified full_disease DataFrame
print("\nfull_disease DataFrame after removal:")
print(full_disease2)

Original full_disease DataFrame:
   Disease ID Disease Name
0           1            A
1           2            B
2           3            C
3           4            D
4           5            E

full_disease DataFrame after removal:
   Disease ID Disease Name
1           2            B
3           4            D
