In [2]:
import sys
import os

current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.insert(0, parent_dir)

import pandas as pd 
import numpy as np 
import sklearn
from dir import *
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset, Subset
from sklearn.model_selection import train_test_split
from VAE_model import *
from VAE_model_2 import *
from VAE_model_single import *
from training import *
from extras import *
import mantel
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceMatrix
from scipy.spatial.distance import squareform
from sklearn.metrics import pairwise_distances
from collections import defaultdict
from scipy.stats import pearsonr, spearmanr
import random
# from skbio.stats.distance import mantel
from sklearn.decomposition import PCA
import pingouin as pg
from sklearn.cluster import KMeans
plt.style.use('ggplot')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
large_data = pd.read_csv(TEN_K_DATASET, index_col=[0], header=[0])

In [4]:
phylogroup_data = pd.read_csv(TEN_K_DATASET_PHYLOGROUPS, index_col=[0], header=[0])

In [5]:
data_without_lineage = large_data.drop(index=['Lineage'])
merged_df = pd.merge(data_without_lineage.transpose(), phylogroup_data, how='inner', left_index=True, right_on='ID')
print("Dataframe shape: ", merged_df.shape)

Dataframe shape:  (1663, 55040)


In [6]:
merged_df

Unnamed: 0_level_0,group_5501,group_4783,group_4456,group_6055,group_2859,group_3832,group_4223,group_2401,group_1236,group_3245,...,group_2066_1,mukF_1,group_955_2,group_1382_5,group_2149_7,group_2067_4,group_2068_8,group_2069_3,group_2071_4,Phylogroup
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AIAW00000000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,E
AIBY00000000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,E
AIFN00000000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,E
NC_002655,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,E
NC_002695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
JHRE00000000,0,0,0,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,B1
JHRP00000000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B1
JHRY00000000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B1
JHSN00000000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B1


In [7]:
merged_df.filter(regex='group').columns

Index(['group_5501', 'group_4783', 'group_4456', 'group_6055', 'group_2859',
       'group_3832', 'group_4223', 'group_2401', 'group_1236', 'group_3245',
       ...
       'group_1609_5', 'group_2066_1', 'group_955_2', 'group_1382_5',
       'group_2149_7', 'group_2067_4', 'group_2068_8', 'group_2069_3',
       'group_2071_4', 'Phylogroup'],
      dtype='object', length=45534)

In [8]:
merged_df.columns

Index(['group_5501', 'group_4783', 'group_4456', 'group_6055', 'group_2859',
       'group_3832', 'group_4223', 'group_2401', 'group_1236', 'group_3245',
       ...
       'group_2066_1', 'mukF_1', 'group_955_2', 'group_1382_5', 'group_2149_7',
       'group_2067_4', 'group_2068_8', 'group_2069_3', 'group_2071_4',
       'Phylogroup'],
      dtype='object', length=55040)

In [9]:
genes_with_names = np.array(merged_df.columns.symmetric_difference(merged_df.filter(regex='group').columns))

In [64]:
import requests
import concurrent
from concurrent.futures import ThreadPoolExecutor, as_completed

In [65]:
import requests

def get_ncbi_geneid(gene_name):
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term={gene_name}[Gene%20Name]"
    response = requests.get(url)
    gene_id = None
    if response.status_code == 200:
        from xml.etree import ElementTree as ET
        root = ET.fromstring(response.content)
        id_list = root.find('IdList')
        if id_list is not None and len(id_list):
            gene_id = id_list[0].text
    return gene_id


In [66]:
def get_kegg_id(ncbi_geneid):
    url = f"https://rest.kegg.jp/conv/eco/ncbi-geneid:{ncbi_geneid}"
    response = requests.get(url)
    kegg_id = None
    if response.status_code == 200:
        lines = response.text.strip().split("\n")
        if lines and "\t" in lines[0]:
            kegg_id = lines[0].split("\t")[1]
    return kegg_id

In [67]:
def get_ko_terms(kegg_gene_id):
    url = f"https://rest.kegg.jp/link/ko/{kegg_gene_id}"
    response = requests.get(url)
    ko_terms = []
    if response.status_code == 200:
        lines = response.text.strip().split("\n")
        for line in lines:
            parts = line.split("\t")
            if len(parts) == 2:
                ko_terms.append(parts[1])
    return ko_terms


In [68]:
# Example usage
gene_names = ['aaaT', 'aaaT_1', 'aaaT_1_1', 'zur', 'zur_2', 'zwf']  
gene_ids = {gene: get_ncbi_geneid(gene) for gene in gene_names}
kegg_ids = {gene: get_kegg_id(gene_id) for gene, gene_id in gene_ids.items() if gene_id}
ko_terms = {gene: get_ko_terms(kegg_gene_id) for gene, kegg_gene_id in kegg_ids.items() if kegg_gene_id}


print("Gene Names to NCBI GeneIDs:", gene_ids)
print("NCBI GeneIDs to KEGG IDs:", kegg_ids)
print("KEGG Gene IDs to KO terms:", ko_terms)

Gene Names to NCBI GeneIDs: {'aaaT': '6510', 'aaaT_1': None, 'aaaT_1_1': None, 'zur': '948552', 'zur_2': '15093540', 'zwf': '91005011'}
NCBI GeneIDs to KEGG IDs: {'aaaT': None, 'zur': 'eco:b4046', 'zur_2': None, 'zwf': None}
KEGG Gene IDs to KO terms: {'zur': ['ko:K09823']}


In [69]:
for i in genes_with_names:
    print(i)

aaaT
aaaT_1
aaaT_1_1
aaaT_2
aacA4
aacA4_1
aadB
aaeA_1
aaeA_2
aaeB
aaeB_1
aaeB_2
aaeB_2_1
aaeX
aam
aarA
aarA_1
aarA_2
aas
aat
aat_1
aaxC_1
aaxC_2
abgA
abgA_1
abgB
abgB_1
abgB_2
abgB_2_1
abgT
abgT_1
abgT_1_1
abgT_2
abgT_2_1
accA
accA1
accA_2
accB
accC
accD
accD_1
accD_2
acdA
aceA
aceB
aceB_1
aceB_1_1
aceE
aceF
aceK
aceK_1
aceK_2
aceK_2_1
aceK_2_2
aceK_2_3
ackA
acm
acm_1
acm_2
acnA
acnA_1
acnB
acnB_2
acpH
acpP
acpP_2
acpP_2_1
acpS
acpS_2
acr3
acrA
acrA_2
acrB_1
acrB_2
acrB_2_1
acrB_4
acrE
acrE_2
acrF
acrF_1
acrF_1_1
acrF_2
acrF_2_1
acrF_2_2
acrR_1
acrR_2
acrZ
acs_1
acs_2
actP
actP_2
acuC
acuI
acuI_1
acuI_2
acuI_3
acuR
ada
ada_1
ade
adeP
adeP_1
adeP_2
adeP_2_1
adeQ
adeQ_1
adeQ_1_1
adeQ_2
ade_1
ade_2
adh1
adhB
adhB_1
adhB_1_1
adhB_2
adhB_2_1
adhE
adhE_1
adhE_1_1
adhE_2
adhE_3
adhP
adiA
adiA_2
adiC
adiC_1
adk
aer
aer_1
aer_2
aer_2_1
aes
afaD
afaD_1
afaD_2
afaE3
afaE3_1
agaA
agaA_1
agaC_1
agaC_2
agaC_2_1
agaS
agaS_1
aggB
aggB_1
aglB
aglB_1
aglB_1_1
aglB_1_2
aglB_2
aglB_2_1
aglB_2_2
agp
agp_1


In [92]:
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Step 1: Get NCBI GeneIDs
    gene_ids_future = {gene: executor.submit(get_ncbi_geneid, gene) for gene in genes_with_names}
    gene_ids = {gene: future.result() for gene, future in gene_ids_future.items()}

In [93]:
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Step 2: Get KEGG IDs
    kegg_ids_future = {gene: executor.submit(get_kegg_id, gene_id) for gene, gene_id in gene_ids.items() if gene_id}
    kegg_ids = {gene: future.result() for gene, future in kegg_ids_future.items()}

Failed to retrieve KEGG ID for NCBI GeneID 945305: HTTP 403Failed to retrieve KEGG ID for NCBI GeneID 16989142: HTTP 403
Failed to retrieve KEGG ID for NCBI GeneID 947963: HTTP 403

Failed to retrieve KEGG ID for NCBI GeneID 947752: HTTP 403
Failed to retrieve KEGG ID for NCBI GeneID 2737: HTTP 403Failed to retrieve KEGG ID for NCBI GeneID 55149: HTTP 403
Failed to retrieve KEGG ID for NCBI GeneID 91007599: HTTP 403
Failed to retrieve KEGG ID for NCBI GeneID 948252: HTTP 403

Failed to retrieve KEGG ID for NCBI GeneID 112602295: HTTP 403
Failed to retrieve KEGG ID for NCBI GeneID 112602297: HTTP 403
Failed to retrieve KEGG ID for NCBI GeneID 15311275: HTTP 403
Failed to retrieve KEGG ID for NCBI GeneID 91081417: HTTP 403
Failed to retrieve KEGG ID for NCBI GeneID 53945: HTTP 403
Failed to retrieve KEGG ID for NCBI GeneID 91011558: HTTP 403
Failed to retrieve KEGG ID for NCBI GeneID 948042: HTTP 403
Failed to retrieve KEGG ID for NCBI GeneID 106476694: HTTP 403
Failed to retrieve KEGG I

In [94]:
with concurrent.futures.ThreadPoolExecutor() as executor:   
    # Step 3: Get KO terms
    ko_terms_future = {gene: executor.submit(get_ko_terms, kegg_gene_id) for gene, kegg_gene_id in kegg_ids.items() if kegg_gene_id}
    ko_terms = {gene: future.result() for gene, future in ko_terms_future.items()}

Failed to retrieve KO terms for KEGG Gene ID eco:b0860: HTTP 403Failed to retrieve KO terms for KEGG Gene ID eco:b0864: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b3240: HTTP 403

Failed to retrieve KO terms for KEGG Gene ID eco:b3664: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b2714: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b3532: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b3551: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b1744: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b0388: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b0863: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b1000: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b1737: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b2758: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b1738: HTTP 403
Failed to retrieve KO terms for KEGG Gene ID eco:b2811: HTTP 403
Failed to retrieve KO ter

In [96]:
print("Gene Names to NCBI GeneIDs:", gene_ids)
print("NCBI GeneIDs to KEGG IDs:", kegg_ids)
print("KEGG Gene IDs to KO terms:", ko_terms)

Gene Names to NCBI GeneIDs: {'aaaT': None, 'aaaT_1': None, 'aaaT_1_1': None, 'aaaT_2': None, 'aacA4': '1208710', 'aacA4_1': None, 'aadB': '89492229', 'aaeA_1': None, 'aaeA_2': None, 'aaeB': '947747', 'aaeB_1': None, 'aaeB_2': None, 'aaeB_2_1': None, 'aaeX': '57669230', 'aam': None, 'aarA': None, 'aarA_1': None, 'aarA_2': None, 'aas': None, 'aat': None, 'aat_1': None, 'aaxC_1': None, 'aaxC_2': None, 'abgA': None, 'abgA_1': None, 'abgB': None, 'abgB_1': '17186657', 'abgB_2': '17187045', 'abgB_2_1': None, 'abgT': None, 'abgT_1': None, 'abgT_1_1': None, 'abgT_2': None, 'abgT_2_1': None, 'accA': None, 'accA1': None, 'accA_2': None, 'accB': None, 'accC': None, 'accD': '91009532', 'accD_1': None, 'accD_2': None, 'acdA': None, 'aceA': None, 'aceB': None, 'aceB_1': '14961959', 'aceB_1_1': None, 'aceE': '91010966', 'aceF': '91010965', 'aceK': None, 'aceK_1': None, 'aceK_2': None, 'aceK_2_1': None, 'aceK_2_2': None, 'aceK_2_3': None, 'ackA': None, 'acm': None, 'acm_1': None, 'acm_2': None, 'acnA'

In [97]:
len([k for k,v in gene_ids.items() if v != None])

688

In [98]:
len([k for k,v in kegg_ids.items() if v != None])

75

In [99]:
len([k for k,v in ko_terms.items() if v != ''])

75