# ARG extraction

This notebook aims to access ARG info from PATRIC using its api ad=n cleaning it into useful csv files

_make it a module or add functions it to another module_

## Imports

In [18]:
import requests
import os
import sys
import pandas as pd
import warnings

warnings.filterwarnings('ignore') # for pandas append

os.chdir(os.path.expanduser('~/capstone-project'))

species = 'Escherichia_coli'

## parsing PATRIC through api to get ARGs

### functions

In [5]:
def get_PATRIC_from_genome_id(genome_id:str):
    ''' 
    -- requires the requests and pandas library --
    takes a genome id and queries for speciality genes that are present in it, particularly Antibiotic resistant genes

    param:
    ------
        - genome_id: str, genome id of the organism, of the form d+.d+

    return:
    --------
        - df: pd.DataFrame, dataframe containing the speciality genes found in the genome

    query for ARGs from genome_id and property - so doing it for all the genome ids found
              -- in this case i would need the following cols
                classification, product, patric_id, antibiotics, feature_id, gene, refseq_locus_tag, source_value, evidence, query_coverage
    '''
    link = f"https://www.bv-brc.org/api/sp_gene/?and(eq(genome_id,%22{genome_id}%22),eq(property,%22Antibiotic%20Resistance%22))"
    response = requests.get(link)
    data = response.json()

    df = pd.DataFrame(columns=['patric_id','classification', 'product', 'antibiotics', 'feature_id', 'gene', 'genome_id','refseq_locus_tag', 'source', 'evidence', 'query_coverage'])
    for i, entry in enumerate(data):
        df.loc[i] = entry

    # -- these entries are given as an array of strings, transform to a single string within the dataframe
    df['antibiotics'] = df['antibiotics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
    df['classification'] = df['classification'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

    # -- indexing by the paric_id (pk of the gene)
    df.set_index('patric_id', inplace=True)
    
    return df

def get_PATRIC_from_genome_id_list(genome_id_list:str):
    ''' 
    -- automates the get_PATRIC_from_genome_id function on all

    param:
        - genome_id_list: list
    return:
        - df: pd.DataFrame

    '''
    df= pd.DataFrame()
    for i in genome_id_list:
        temp_df = get_PATRIC_from_genome_id(i)
        df = pd.concat([df, temp_df])

    return df

get_genome_ids = lambda path: pd.read_csv(path).astype(str)['genome.genome_ids'].to_list()

### dataframes

saving data and memoizing it in `data/ARG/`

In [9]:
genome_ids = get_genome_ids(f'data/pangenome_pipeline_output/genome_ids/{species}_genome_ids.csv')
df = get_PATRIC_from_genome_id_list(genome_ids)
df.to_csv('../data/ARG/Escherichia_coli_ARG.csv')

In [5]:
df = pd.read_csv('../data/ARG/Escherichia_coli_ARG.csv') # -- proceed from here without loading again

## Querying the df

In [52]:
# -- geting all those that have either of the 11 preselected antibiotics:
#       'streptomycin','sulfamethoxazole','tetracycline','cefalothin','trimethoprim_sulphamethoxazole','trimethoprim','amoxicillin','ampicillin'

antibiotics = ['streptomycin',
 'sulfamethoxazole',
 'tetracycline',
 'cefalothin',
 'trimethoprim_sulphamethoxazole',
 'amoxicillin_clavulanic_acid',
 'trimethoprim',
 'amoxicillin',
 'ampicillin',
 'levofloxacin',
 'ciprofloxacin']

antibiotics_str = '|'.join(antibiotics)

new_df = df[df['antibiotics'].str.contains(antibiotics_str, case=False, na=False)]

#  checking if there is nan in product
# new_df[new_df['product'].isna()]
#  no nans :)

new_df.to_csv('data/ARG/Escherichia_coli_ARG_sp_drugs.csv')

In [53]:
# -- for each drug we save the list of unique ARG products

antibitotic_ARG_products = pd.DataFrame(columns=['antibiotics', 'ARG_products'])
filtered_antibiotics = antibiotics
count_ARG ={}

for drug in antibiotics:
    temp_df = new_df[new_df['antibiotics'].str.contains(drug, case=False, na=False)]
    c=len(temp_df['product'].unique().tolist())
    print(f'{drug} has {c} unique ARG products')

    if c ==0:
        print(f'{drug} has no unique ARG products, dropped out')
        print('---'*10)
        filtered_antibiotics.remove(drug)
        continue

    print(temp_df['product'].unique())
    print('---'*10)
    antibitotic_ARG_products.loc[drug] = [drug, temp_df['product'].unique().tolist()]
    count_ARG[drug] = c


antibitotic_ARG_products.set_index('antibiotics', inplace=True)
antibitotic_ARG_products.to_csv('data/ARG/Escherichia_coli_ARG_sp_drugs_products.csv')
antibitotic_ARG_products

streptomycin has 6 unique ARG products
['SSU ribosomal protein S12p (S23e)'
 "Aminoglycoside 3''-nucleotidyltransferase (EC 2.7.7.-) => ANT(3'')-Ia (AadA family)"
 '16S rRNA (guanine(527)-N(7))-methyltransferase (EC 2.1.1.170)'
 'Aminoglycoside 6-phosphotransferase (EC 2.7.1.72) => APH(6)-Ic/APH(6)-Id'
 "Aminoglycoside 3''-phosphotransferase (EC 2.7.1.87) => APH(3'')-I"
 'iron aquisition outermembrane yersiniabactin receptor (FyuA,Psn,pesticin receptor) @ Outer membrane receptor for ferric siderophore']
------------------------------
sulfamethoxazole has 3 unique ARG products
['Multidrug efflux system, inner membrane proton/drug antiporter (RND type) => MexB of MexAB-OprM'
 'Dihydropteroate synthase (EC 2.5.1.15)'
 'Dihydropteroate synthase type-2 (EC 2.5.1.15) @ Sulfonamide resistance protein']
------------------------------
tetracycline has 20 unique ARG products
['Multidrug efflux system, inner membrane proton/drug antiporter (RND type) => MexB of MexAB-OprM'
 'Multidrug efflux syst

Unnamed: 0_level_0,ARG_products
antibiotics,Unnamed: 1_level_1
streptomycin,"[SSU ribosomal protein S12p (S23e), Aminoglyco..."
sulfamethoxazole,"[Multidrug efflux system, inner membrane proto..."
tetracycline,"[Multidrug efflux system, inner membrane proto..."
cefalothin,"[Multiple antibiotic resistance protein MarR, ..."
trimethoprim,"[Multidrug efflux system, inner membrane proto..."
amoxicillin,"[Multidrug efflux system, inner membrane proto..."
ampicillin,"[Multidrug efflux system, inner membrane proto..."
levofloxacin,"[DNA gyrase subunit A (EC 5.99.1.3), Pentapept..."
ciprofloxacin,[Aminoglycoside N(6')-acetyltransferase (EC 2....


In [56]:
print(filtered_antibiotics)
count_ARG

['streptomycin', 'sulfamethoxazole', 'tetracycline', 'cefalothin', 'amoxicillin_clavulanic_acid', 'trimethoprim', 'amoxicillin', 'ampicillin', 'levofloxacin', 'ciprofloxacin']


{'streptomycin': 6,
 'sulfamethoxazole': 3,
 'tetracycline': 20,
 'cefalothin': 7,
 'trimethoprim': 2,
 'amoxicillin': 4,
 'ampicillin': 11,
 'levofloxacin': 4,
 'ciprofloxacin': 14}