In [2]:
import requests
import os
import sys
import pandas as pd

## parsing PATRIC through api to get ARGs

In [3]:
def get_PATRIC_from_genome_id(genome_id:str):
    ''' 
    -- requires the requests and pandas library --
    takes a genome id and queries for speciality genes that are present in it, particularly Antibiotic resistant genes

    param:
    ------
        - genome_id: str, genome id of the organism, of the form d+.d+

    return:
    --------
        - df: pd.DataFrame, dataframe containing the speciality genes found in the genome

    query for ARGs from genome_id and property - so doing it for all the genome ids found
              -- in this case i would need the following cols
                classification, product, patric_id, antibiotics, feature_id, gene, refseq_locus_tag, source_value, evidence, query_coverage
    '''
    link = f"https://www.bv-brc.org/api/sp_gene/?and(eq(genome_id,%22{genome_id}%22),eq(property,%22Antibiotic%20Resistance%22))"
    response = requests.get(link)
    data = response.json()

    df = pd.DataFrame(columns=['patric_id','classification', 'product', 'antibiotics', 'feature_id', 'gene', 'genome_id','refseq_locus_tag', 'source', 'evidence', 'query_coverage'])
    for i, entry in enumerate(data):
        df.loc[i] = entry

    # -- these entries are given as an array of strings, transform to a single string within the dataframe
    df['antibiotics'] = df['antibiotics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
    df['classification'] = df['classification'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

    # -- indexing by the paric_id (pk of the gene)
    df.set_index('patric_id', inplace=True)
    
    return df

def get_PATRIC_from_genome_id_list(genome_id_list:str):
    ''' 
    -- automates the get_PATRIC_from_genome_id function on all

    param:
        - genome_id_list: list
    return:
        - df: pd.DataFrame

    '''
    df= pd.DataFrame()
    for i in genome_id_list:
        temp_df = get_PATRIC_from_genome_id(i)
        df = pd.concat([df, temp_df])

    return df



In [4]:
get_genome_ids = lambda path: pd.read_csv(path).astype(str)['genome.genome_ids'].to_list()

genome_ids = get_genome_ids('../pangenome-repo/Pangenome-Analysis-Workflow/genome_ids/Escherichia_coli_genome_ids.csv')

df = get_PATRIC_from_genome_id_list(genome_ids)

KeyboardInterrupt: 

In [5]:
df.to_csv('../data/ARG/Escherichia_coli_ARG.csv')

In [5]:
df = pd.read_csv('../data/ARG/Escherichia_coli_ARG.csv')

## Querying the df

In [6]:
# get all those that have either of these in antibiotics:'streptomycin','sulfamethoxazole','tetracycline','cefalothin','trimethoprim_sulphamethoxazole','trimethoprim','amoxicillin','ampicillin','doripenem'

antibiotics = ['streptomycin',
 'sulfamethoxazole',
 'tetracycline',
 'cefalothin',
 'trimethoprim_sulphamethoxazole',
 'amoxicillin_clavulanic_acid',
 'trimethoprim',
 'amoxicillin',
 'ampicillin',
 'doripenem',
 'levofloxacin',
 'ciprofloxacin']

antibiotics_str = '|'.join(antibiotics)

new_df = df[df['antibiotics'].str.contains(antibiotics_str, case=False, na=False)]

new_df.to_csv('../data/ARG/Escherichia_coli_ARG_sp_drugs.csv')

In [7]:
new_df
# get those with unique product
new_df['gene'].unique()

array([nan, 'mfd', 'cmr', 'folA', 'gyrA', 'ybhT', 'bla_1', 'tetA',
       'acrR_1', 'acrF', 'rpsL', 'neo', 'mdfA', 'emrB', 'emrY', 'tolC',
       'rpsJ', 'tetD', 'gidB', 'sul1', 'gyrB', 'emrK', 'msbA', 'blaT-6',
       'acrR', 'marR', 'fyuA', 'acrA', 'blaTEM', 'tetC', 'sul2',
       'aph(6)-I', 'hns', 'marA', 'acrB', 'MarB', 'S10p', 'AcrAB-TolC',
       'AcrEF-TolC ', 'folA, Dfr', 'H-NS', 'EmrKY-TolC', 'MarR',
       'APH(6)-Ic/APH(6)-Id', 'folP', 'MdfA/Cmr', 'OXA-1 family', 'AcrZ',
       'S12p', 'QnrB family', 'MarA', 'TEM family', "APH(3'')-I",
       'Tet(A)', 'MexAB-OprM', 'NDM family', 'Tet(B)', 'sul3',
       'SHV family', 'folP2', 'acrE', 'folA_1', 'emrB_2', 'sulI',
       'acrB_1', 'aadA', 'emrB_1', 'dhfrV', 'marB', 'mexB_2', 'bla',
       'hns_1', 'dhfrI', 'dfrD', 'hns_2', 'marA_1', 'strA_1', 'strA_2',
       'bla_2', 'marR_2', 'aadA1', 'dfrA', 'mexB_1', 'mexB', 'folA_2',
       'strA', 'marR_1', 'acrB_2'], dtype=object)

In [9]:
for a in antibiotics:
    print(a)
    print(new_df[new_df['antibiotics'].str.contains(a, case=False, na=False)]['product'].unique())
    print('---'*10)

streptomycin
['SSU ribosomal protein S12p (S23e)'
 "Aminoglycoside 3''-nucleotidyltransferase (EC 2.7.7.-) => ANT(3'')-Ia (AadA family)"
 '16S rRNA (guanine(527)-N(7))-methyltransferase (EC 2.1.1.170)'
 'Aminoglycoside 6-phosphotransferase (EC 2.7.1.72) => APH(6)-Ic/APH(6)-Id'
 "Aminoglycoside 3''-phosphotransferase (EC 2.7.1.87) => APH(3'')-I"
 'iron aquisition outermembrane yersiniabactin receptor (FyuA,Psn,pesticin receptor) @ Outer membrane receptor for ferric siderophore']
------------------------------
sulfamethoxazole
['Multidrug efflux system, inner membrane proton/drug antiporter (RND type) => MexB of MexAB-OprM'
 'Dihydropteroate synthase (EC 2.5.1.15)'
 'Dihydropteroate synthase type-2 (EC 2.5.1.15) @ Sulfonamide resistance protein']
------------------------------
tetracycline
['Multidrug efflux system, inner membrane proton/drug antiporter (RND type) => MexB of MexAB-OprM'
 'Multidrug efflux system EmrKY-TolC, membrane fusion component EmrK'
 'Multiple antibiotic resistance

In [16]:
print(new_df[new_df['antibiotics'].str.contains('trimethoprim', case=False, na=False)]['product'].unique())

['Multidrug efflux system, inner membrane proton/drug antiporter (RND type) => MexB of MexAB-OprM'
 'Dihydrofolate reductase (EC 1.5.1.3)']


In [12]:
print(len(new_df[new_df['antibiotics'].str.contains('trimethoprim', case=False, na=False)]['gene'].unique()))

14


In [8]:
#  checking if there is nan in product
new_df[new_df['product'].isna()]
#  no nans :)

Unnamed: 0,patric_id,classification,product,antibiotics,feature_id,gene,genome_id,refseq_locus_tag,source,evidence,query_coverage


In [84]:
df['gene'].unique()

array(['acrA', 'acrE', 'acrB', 'Alr', 'EmrD', 'emrK', 'EmrAB-TolC',
       'parC', 'gyrA', 'soxR', 'mdtM', 'mfd', 'MacA', 'CTX-M-15', 'H-NS',
       'fyuA', 'EF-G', 'baeR', 'AcrAD-TolC', 'PmrE', 'GdpD', 'acrF',
       'MdtM', 'AcrAB-TolC'], dtype=object)

In [87]:
new_df.shape

(5910, 10)

In [83]:
new_df

Unnamed: 0_level_0,classification,product,antibiotics,feature_id,gene,genome_id,refseq_locus_tag,source,evidence,query_coverage
patric_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
fig|562.45406.peg.2489,efflux pump conferring antibiotic resistance,"Multidrug efflux system EmrKY-TolC, membrane f...",tetracycline,PATRIC.562.45406.PXXQ01000027.CDS.222865.22402...,emrK,562.45406,C7K59_23230,CARD,BLAT,90.0
fig|562.45406.peg.3700,"aminoglycoside resistance gene,tetracycline re...",iron aquisition outermembrane yersiniabactin r...,"streptomycin, tetracycline, ampicillin",PATRIC.562.45406.PXXQ01000002.CDS.261442.26346...,fyuA,562.45406,C7K59_11760,CARD,BLAT,100.0
fig|562.45406.peg.4593,efflux pump conferring antibiotic resistance,"Multidrug efflux system AcrAB-TolC, inner-memb...","ciprofloxacin, tigecycline, chloramphenicol, r...",PATRIC.562.45406.PXXQ01000019.CDS.206354.20950...,AcrAB-TolC,562.45406,C7K59_18550,PATRIC,K-mer Search,
fig|562.45406.peg.2489,efflux pump conferring antibiotic resistance,"Multidrug efflux system EmrKY-TolC, membrane f...",tetracycline,PATRIC.562.45406.PXXQ01000027.CDS.222865.22402...,emrK,562.45406,C7K59_23230,CARD,BLAT,90.0
fig|562.45406.peg.3700,"aminoglycoside resistance gene,tetracycline re...",iron aquisition outermembrane yersiniabactin r...,"streptomycin, tetracycline, ampicillin",PATRIC.562.45406.PXXQ01000002.CDS.261442.26346...,fyuA,562.45406,C7K59_11760,CARD,BLAT,100.0
...,...,...,...,...,...,...,...,...,...,...
fig|562.45406.peg.3700,"aminoglycoside resistance gene,tetracycline re...",iron aquisition outermembrane yersiniabactin r...,"streptomycin, tetracycline, ampicillin",PATRIC.562.45406.PXXQ01000002.CDS.261442.26346...,fyuA,562.45406,C7K59_11760,CARD,BLAT,100.0
fig|562.45406.peg.4593,efflux pump conferring antibiotic resistance,"Multidrug efflux system AcrAB-TolC, inner-memb...","ciprofloxacin, tigecycline, chloramphenicol, r...",PATRIC.562.45406.PXXQ01000019.CDS.206354.20950...,AcrAB-TolC,562.45406,C7K59_18550,PATRIC,K-mer Search,
fig|562.45406.peg.2489,efflux pump conferring antibiotic resistance,"Multidrug efflux system EmrKY-TolC, membrane f...",tetracycline,PATRIC.562.45406.PXXQ01000027.CDS.222865.22402...,emrK,562.45406,C7K59_23230,CARD,BLAT,90.0
fig|562.45406.peg.3700,"aminoglycoside resistance gene,tetracycline re...",iron aquisition outermembrane yersiniabactin r...,"streptomycin, tetracycline, ampicillin",PATRIC.562.45406.PXXQ01000002.CDS.261442.26346...,fyuA,562.45406,C7K59_11760,CARD,BLAT,100.0


# test

In [64]:
test_link = "https://www.bv-brc.org/api/sp_gene/?and(eq(genome_id,%22562.45406%22),eq(property,%22Antibiotic%20Resistance%22))"

response = requests.get(test_link)
data = response.json()
data[1]
pd.DataFrame(data).columns
# data[0]

Index(['classification', 'date_inserted', 'date_modified', 'e_value',
       'evidence', 'feature_id', 'function', 'gene', 'genome_id',
       'genome_name', 'id', 'identity', 'organism', 'owner', 'patric_id',
       'product', 'property', 'property_source', 'public', 'query_coverage',
       'refseq_locus_tag', 'source', 'source_id', 'subject_coverage',
       'taxon_id', '_version_', 'assertion', 'same_genus', 'same_species',
       'antibiotics', 'antibiotics_class', 'pmid'],
      dtype='object')

In [65]:
# test 1 : i query for ARGs from genome_id and property - so doing it for all the genome ids found
#               -- in this case i would need the following cols
#                 classification, product, patric_id, antibiotics, feature_id, gene, refseq_locus_tag, source_value, evidence, query_coverage

test_link = "https://www.bv-brc.org/api/sp_gene/?and(eq(genome_id,%22562.45406%22),eq(property,%22Antibiotic%20Resistance%22))"

response = requests.get(test_link)
data = response.json()

df = pd.DataFrame(columns=['patric_id','classification', 'product', 'antibiotics', 'feature_id', 'gene', 'genome_id','refseq_locus_tag', 'source', 'evidence', 'query_coverage'])
for i, entry in enumerate(data):
    df.loc[i] = entry

df['antibiotics'] = df['antibiotics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
df['classification'] = df['classification'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
df.head(1)

Unnamed: 0,patric_id,classification,product,antibiotics,feature_id,gene,genome_id,refseq_locus_tag,source,evidence,query_coverage
0,fig|562.45406.peg.4594,efflux pump conferring antibiotic resistance,"Multidrug efflux system AcrAB-TolC, membrane f...",,PATRIC.562.45406.PXXQ01000019.CDS.209526.21071...,acrA,562.45406,C7K59_18555,CARD,BLAT,100.0


In [41]:
patric_ids_to_test = df['patric_id'].to_list()

In [43]:
response = requests.get("https://www.bv-brc.org/api/sp_gene/?and(eq(patric_id,'fig|562.45406.peg.2175'),eq(property,%22Antibiotic%20Resistance%22))")
response.json()

{'status': 400, 'message': "Illegal character in query string encountered ''"}

In [53]:
def get_PATRIC_from_genome_id(genome_id:str):
    ''' -- requires the requests and pandas library --
    takes a genome id and queries for speciality genes that are present in it, particularly Antibiotic resistant genes

    param:
    ------
        - genome_id: str, genome id of the organism, of the form \d+\.\d+

    return:
    --------
        - df: pd.DataFrame, dataframe containing the speciality genes found in the genome

    query for ARGs from genome_id and property - so doing it for all the genome ids found
              -- in this case i would need the following cols
                classification, product, patric_id, antibiotics, feature_id, gene, refseq_locus_tag, source_value, evidence, query_coverage
    '''
    link = f"https://www.bv-brc.org/api/sp_gene/?and(eq(genome_id,%22{genome_id}%22),eq(property,%22Antibiotic%20Resistance%22))"
    response = requests.get(test_link)
    data = response.json()

    df = pd.DataFrame(columns=['patric_id','classification', 'product', 'antibiotics', 'feature_id', 'gene', 'genome_id','refseq_locus_tag', 'source', 'evidence', 'query_coverage'])
    for i, entry in enumerate(data):
        df.loc[i] = entry

    # -- these entries are given as an array of strings, transform to a single string within the dataframe
    df['antibiotics'] = df['antibiotics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
    df['classification'] = df['classification'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

    # -- indexing by the paric_id (pk of the gene)
    df.set_index('patric_id', inplace=True)
    
    return df


  ''' -- requires the requests and pandas library --


In [54]:
df1= get_PATRIC_from_genome_id('22562.45405')
df2 = get_PATRIC_from_genome_id('22562.45406')

In [56]:
df1.shape, df2.shape, df.shape

((25, 10), (25, 10), (50, 10))

In [57]:
def get_PATRIC_from_genome_id_list(genome_id_list:str):
    ''' -- automates the get_PATRIC_from_genome_id function on all

    param:
        - genome_id_list: list
    return:
        - df: pd.DataFrame

    '''
    df= pd.DataFrame()
    for i in genome_id_list:
        temp_df = get_PATRIC_from_genome_id(i)
        df = pd.concat([df, temp_df])

    return df

In [58]:
test_list = ['22562.45405', '22562.45406']
test_df = get_PATRIC_from_genome_id_list(test_list)

In [46]:
for id in patric_ids_to_test:
    link = f"https://www.bv-brc.org/api/sp_gene/?and(eq(patric_id,%22562.45406%22),eq(property,%22Antibiotic%20Resistance%22))"



test_link = "https://www.bv-brc.org/api/sp_gene/?and(eq(genome_id,%22562.45406%22),eq(property,%22Antibiotic%20Resistance%22))"

response = requests.get(test_link)
data = response.json()

df = pd.DataFrame(columns=['patric_id','classification', 'product', 'antibiotics', 'feature_id', 'gene', 'genome_id','refseq_locus_tag', 'source', 'evidence', 'query_coverage'])
for i, entry in enumerate(data):
    df.loc[i] = entry

df['antibiotics'] = df['antibiotics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
df['classification'] = df['classification'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
df.head()

Unnamed: 0,patric_id,classification,product,antibiotics,feature_id,gene,genome_id,refseq_locus_tag,source,evidence,query_coverage
0,fig|562.45406.peg.4594,efflux pump conferring antibiotic resistance,"Multidrug efflux system AcrAB-TolC, membrane f...",,PATRIC.562.45406.PXXQ01000019.CDS.209526.21071...,acrA,562.45406,C7K59_18555,CARD,BLAT,100.0
1,fig|562.45406.peg.73,efflux pump conferring antibiotic resistance,"Multidrug efflux system AcrEF-TolC, membrane f...",,PATRIC.562.45406.PXXQ01000001.CDS.53757.54914.rev,acrE,562.45406,C7K59_00370,CARD,BLAT,100.0
2,fig|562.45406.peg.4593,efflux pump conferring antibiotic resistance,"Multidrug efflux system AcrAB-TolC, inner-memb...",,PATRIC.562.45406.PXXQ01000019.CDS.206354.20950...,acrB,562.45406,C7K59_18550,CARD,BLAT,99.0
3,fig|562.45406.peg.1786,antibiotic target in susceptible species,Alanine racemase (EC 5.1.1.1),D-cycloserine,PATRIC.562.45406.PXXQ01000001.CDS.1583447.1584...,Alr,562.45406,C7K59_08995,PATRIC,K-mer Search,
4,fig|562.45406.peg.2175,efflux pump conferring antibiotic resistance,Multidrug efflux pump EmrD (of MFS type),"azithromycin, gentamicin, nitrofurantoin, oxac...",PATRIC.562.45406.PXXQ01000008.CDS.138607.13979...,EmrD,562.45406,C7K59_13765,PATRIC,K-mer Search,


In [34]:
type(df['antibiotics'].to_list()[3])

list

In [37]:
df['antibiotics'].to_list()
# transform each entry in df['antibiotics'] from list to string
df['antibiotics'] = df['antibiotics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

In [22]:
562
df[df['gene'] == 'arcA']

Unnamed: 0,classification,product,antibiotics,feature_id,gene,refseq_locus_tag,source_value,evidence,query_coverage
