In [None]:
import requests
import os
import sys
import pandas as pd

## parsing PATRIC through api to get ARGs

In [63]:
def get_PATRIC_from_genome_id(genome_id:str):
    ''' 
    -- requires the requests and pandas library --
    takes a genome id and queries for speciality genes that are present in it, particularly Antibiotic resistant genes

    param:
    ------
        - genome_id: str, genome id of the organism, of the form d+.d+

    return:
    --------
        - df: pd.DataFrame, dataframe containing the speciality genes found in the genome

    query for ARGs from genome_id and property - so doing it for all the genome ids found
              -- in this case i would need the following cols
                classification, product, patric_id, antibiotics, feature_id, gene, refseq_locus_tag, source_value, evidence, query_coverage
    '''
    link = f"https://www.bv-brc.org/api/sp_gene/?and(eq(genome_id,%22{genome_id}%22),eq(property,%22Antibiotic%20Resistance%22))"
    response = requests.get(test_link)
    data = response.json()

    df = pd.DataFrame(columns=['patric_id','classification', 'product', 'antibiotics', 'feature_id', 'gene', 'genome_id','refseq_locus_tag', 'source', 'evidence', 'query_coverage'])
    for i, entry in enumerate(data):
        df.loc[i] = entry

    # -- these entries are given as an array of strings, transform to a single string within the dataframe
    df['antibiotics'] = df['antibiotics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
    df['classification'] = df['classification'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

    # -- indexing by the paric_id (pk of the gene)
    df.set_index('patric_id', inplace=True)
    
    return df

def get_PATRIC_from_genome_id_list(genome_id_list:str):
    ''' 
    -- automates the get_PATRIC_from_genome_id function on all

    param:
        - genome_id_list: list
    return:
        - df: pd.DataFrame

    '''
    df= pd.DataFrame()
    for i in genome_id_list:
        temp_df = get_PATRIC_from_genome_id(i)
        df = pd.concat([df, temp_df])

    return df



In [74]:
get_genome_ids = lambda path: pd.read_csv(path).astype(str)['genome.genome_ids'].to_list()

genome_ids = get_genome_ids('../pangenome-repo/Pangenome-Analysis-Workflow/genome_ids/Escherichia_coli_genome_ids.csv')

df = get_PATRIC_from_genome_id_list(genome_ids)

In [None]:
df.to_csv('../data/ARG/Escherichia_coli_ARG.csv')

## Querying the df

# test

In [64]:
test_link = "https://www.bv-brc.org/api/sp_gene/?and(eq(genome_id,%22562.45406%22),eq(property,%22Antibiotic%20Resistance%22))"

response = requests.get(test_link)
data = response.json()
data[1]
pd.DataFrame(data).columns
# data[0]

Index(['classification', 'date_inserted', 'date_modified', 'e_value',
       'evidence', 'feature_id', 'function', 'gene', 'genome_id',
       'genome_name', 'id', 'identity', 'organism', 'owner', 'patric_id',
       'product', 'property', 'property_source', 'public', 'query_coverage',
       'refseq_locus_tag', 'source', 'source_id', 'subject_coverage',
       'taxon_id', '_version_', 'assertion', 'same_genus', 'same_species',
       'antibiotics', 'antibiotics_class', 'pmid'],
      dtype='object')

In [65]:
# test 1 : i query for ARGs from genome_id and property - so doing it for all the genome ids found
#               -- in this case i would need the following cols
#                 classification, product, patric_id, antibiotics, feature_id, gene, refseq_locus_tag, source_value, evidence, query_coverage

test_link = "https://www.bv-brc.org/api/sp_gene/?and(eq(genome_id,%22562.45406%22),eq(property,%22Antibiotic%20Resistance%22))"

response = requests.get(test_link)
data = response.json()

df = pd.DataFrame(columns=['patric_id','classification', 'product', 'antibiotics', 'feature_id', 'gene', 'genome_id','refseq_locus_tag', 'source', 'evidence', 'query_coverage'])
for i, entry in enumerate(data):
    df.loc[i] = entry

df['antibiotics'] = df['antibiotics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
df['classification'] = df['classification'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
df.head(1)

Unnamed: 0,patric_id,classification,product,antibiotics,feature_id,gene,genome_id,refseq_locus_tag,source,evidence,query_coverage
0,fig|562.45406.peg.4594,efflux pump conferring antibiotic resistance,"Multidrug efflux system AcrAB-TolC, membrane f...",,PATRIC.562.45406.PXXQ01000019.CDS.209526.21071...,acrA,562.45406,C7K59_18555,CARD,BLAT,100.0


In [41]:
patric_ids_to_test = df['patric_id'].to_list()

In [43]:
response = requests.get("https://www.bv-brc.org/api/sp_gene/?and(eq(patric_id,'fig|562.45406.peg.2175'),eq(property,%22Antibiotic%20Resistance%22))")
response.json()

{'status': 400, 'message': "Illegal character in query string encountered ''"}

In [53]:
def get_PATRIC_from_genome_id(genome_id:str):
    ''' -- requires the requests and pandas library --
    takes a genome id and queries for speciality genes that are present in it, particularly Antibiotic resistant genes

    param:
    ------
        - genome_id: str, genome id of the organism, of the form \d+\.\d+

    return:
    --------
        - df: pd.DataFrame, dataframe containing the speciality genes found in the genome

    query for ARGs from genome_id and property - so doing it for all the genome ids found
              -- in this case i would need the following cols
                classification, product, patric_id, antibiotics, feature_id, gene, refseq_locus_tag, source_value, evidence, query_coverage
    '''
    link = f"https://www.bv-brc.org/api/sp_gene/?and(eq(genome_id,%22{genome_id}%22),eq(property,%22Antibiotic%20Resistance%22))"
    response = requests.get(test_link)
    data = response.json()

    df = pd.DataFrame(columns=['patric_id','classification', 'product', 'antibiotics', 'feature_id', 'gene', 'genome_id','refseq_locus_tag', 'source', 'evidence', 'query_coverage'])
    for i, entry in enumerate(data):
        df.loc[i] = entry

    # -- these entries are given as an array of strings, transform to a single string within the dataframe
    df['antibiotics'] = df['antibiotics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
    df['classification'] = df['classification'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

    # -- indexing by the paric_id (pk of the gene)
    df.set_index('patric_id', inplace=True)
    
    return df


  ''' -- requires the requests and pandas library --


In [54]:
df1= get_PATRIC_from_genome_id('22562.45405')
df2 = get_PATRIC_from_genome_id('22562.45406')

In [56]:
df1.shape, df2.shape, df.shape

((25, 10), (25, 10), (50, 10))

In [57]:
def get_PATRIC_from_genome_id_list(genome_id_list:str):
    ''' -- automates the get_PATRIC_from_genome_id function on all

    param:
        - genome_id_list: list
    return:
        - df: pd.DataFrame

    '''
    df= pd.DataFrame()
    for i in genome_id_list:
        temp_df = get_PATRIC_from_genome_id(i)
        df = pd.concat([df, temp_df])

    return df

In [58]:
test_list = ['22562.45405', '22562.45406']
test_df = get_PATRIC_from_genome_id_list(test_list)

In [46]:
for id in patric_ids_to_test:
    link = f"https://www.bv-brc.org/api/sp_gene/?and(eq(patric_id,%22562.45406%22),eq(property,%22Antibiotic%20Resistance%22))"



test_link = "https://www.bv-brc.org/api/sp_gene/?and(eq(genome_id,%22562.45406%22),eq(property,%22Antibiotic%20Resistance%22))"

response = requests.get(test_link)
data = response.json()

df = pd.DataFrame(columns=['patric_id','classification', 'product', 'antibiotics', 'feature_id', 'gene', 'genome_id','refseq_locus_tag', 'source', 'evidence', 'query_coverage'])
for i, entry in enumerate(data):
    df.loc[i] = entry

df['antibiotics'] = df['antibiotics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
df['classification'] = df['classification'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
df.head()

Unnamed: 0,patric_id,classification,product,antibiotics,feature_id,gene,genome_id,refseq_locus_tag,source,evidence,query_coverage
0,fig|562.45406.peg.4594,efflux pump conferring antibiotic resistance,"Multidrug efflux system AcrAB-TolC, membrane f...",,PATRIC.562.45406.PXXQ01000019.CDS.209526.21071...,acrA,562.45406,C7K59_18555,CARD,BLAT,100.0
1,fig|562.45406.peg.73,efflux pump conferring antibiotic resistance,"Multidrug efflux system AcrEF-TolC, membrane f...",,PATRIC.562.45406.PXXQ01000001.CDS.53757.54914.rev,acrE,562.45406,C7K59_00370,CARD,BLAT,100.0
2,fig|562.45406.peg.4593,efflux pump conferring antibiotic resistance,"Multidrug efflux system AcrAB-TolC, inner-memb...",,PATRIC.562.45406.PXXQ01000019.CDS.206354.20950...,acrB,562.45406,C7K59_18550,CARD,BLAT,99.0
3,fig|562.45406.peg.1786,antibiotic target in susceptible species,Alanine racemase (EC 5.1.1.1),D-cycloserine,PATRIC.562.45406.PXXQ01000001.CDS.1583447.1584...,Alr,562.45406,C7K59_08995,PATRIC,K-mer Search,
4,fig|562.45406.peg.2175,efflux pump conferring antibiotic resistance,Multidrug efflux pump EmrD (of MFS type),"azithromycin, gentamicin, nitrofurantoin, oxac...",PATRIC.562.45406.PXXQ01000008.CDS.138607.13979...,EmrD,562.45406,C7K59_13765,PATRIC,K-mer Search,


In [34]:
type(df['antibiotics'].to_list()[3])

list

In [37]:
df['antibiotics'].to_list()
# transform each entry in df['antibiotics'] from list to string
df['antibiotics'] = df['antibiotics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

In [22]:
562
df[df['gene'] == 'arcA']

Unnamed: 0,classification,product,antibiotics,feature_id,gene,refseq_locus_tag,source_value,evidence,query_coverage
