# Supplementary Notebook 4: Select Major ABR genes
## Paper: Novel Approach for Microbiome Analysis Using Bacterial Replication Rates and Causal Inference to Determine Resistome Potential
### Vitalii Stebliankin, Musfiqur Sazal, Camilo Valdes, Kalai Mathee, and Giri Narasimhan

#### Dataset: Gibson et al. (BioProject ID: PRJNA301903)

In this notebook, we select the ABR genes that are present in at least 5% of the study samples.

In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import os

# ---------------------------------------------------------------------------
# Configuration: input/output paths and gene-selection parameters.
# ---------------------------------------------------------------------------

# Input table with the ABR gene columns (read by the selection step below).
amr = "B-out/ptr_amr.csv"
# Input table that the selected ABR gene columns are merged into.
major_file = "analysis-out/1-FilteringPTR/PTR_species_filtered_metadata_major.csv"

# Output directory. makedirs(exist_ok=True) also creates missing parent
# directories and is a no-op when the directory already exists, so the cell
# runs on a fresh checkout as well as on re-execution.
out_dir = "analysis-out/4-Select_major_ABR"
os.makedirs(out_dir, exist_ok=True)

out_file = out_dir + "/PTR_species_filtered_metadata_major_AMR.csv"

# Names of the clinical metadata columns.
# NOTE(review): this list is not referenced by the rest of the visible code;
# it is kept in case other cells or notebooks rely on it.
clinical_vars = [
    "Day_of_Life", "PostMenst_Age", "Gestational_Age",
    "Birthweight", "Gentamicin", "Cefazolin", "Ampicillin",
    "Trimethoprim.Sulfamathoxazole", "Meropenem",
    "Vancomycin", "Ticarcillin.Clavulanate", "Clindamycin", "Cefotaxime",
    "Total_abx", "r_Gentamicin",
    "r_Meropenem", "r_Ticarcillin.Clavulanate", "r_Vancomycin", "r_Ampicillin",
    "r_Cefotaxime", "r_TOTAL", "Human_Milk", "Maternal_Milk", "Donor_Milk",
    "Formula", "Fortification", "Vitamin_A",
    "Caffeine", "Iron", "Furosemide_Lasix", "m_ampicillin", "m_ceftriaxone",
    "m_azithromycin",
    "m_amoxicillin", "m_cefazolin", "m_erythromycin", "m_gentamicin",
    "m_penicillin", "m_vancomycin",
    "m_clindamycin", "m_cefotaxime", "dur_membrane_rupture",
    "Total.Antibiotic.Days", "Cohort", "CRIB.II.Score",
]

# Cutoff used by the gene-selection step; its meaning depends on `method`.
threshold = 20
# Selection strategy. Supported values: "top_present" (filter on the number of
# samples in which a gene has a value > 0) or "top_largest" (filter on the
# mean value of a gene).
method = "top_present"

# ---------------------------------------------------------------------------
# Select the ABR genes to keep.
# Produces: df (raw ABR table), amr_df (per-gene summary of the selected
# genes) and amr_list (names of the selected gene columns).
# ---------------------------------------------------------------------------
df = pd.read_csv(amr)

# Columns whose header contains "gb|" are treated as ABR gene columns.
gene_cols = [col for col in df.columns if "gb|" in col]

# Per-gene summary: mean value over all rows ("value") and the number of rows
# in which the gene has a value greater than zero ("nvalues").
amr_df = pd.DataFrame(
    {
        "amr": gene_cols,
        "value": [df[col].mean() for col in gene_cols],
        "nvalues": [int((df[col] > 0).sum()) for col in gene_cols],
    }
)
print(len(amr_df))  # number of candidate genes before filtering

if method == "top_largest":
    # Keep exactly the `threshold` genes with the largest mean value (fewer if
    # there are not that many genes). The earlier implementation compared with
    # a strict ">" against the threshold-th largest mean, which returned one
    # gene too few and raised IndexError for short gene lists.
    # sort_index() restores the original column order of the input table.
    amr_df = amr_df.nlargest(threshold, "value").sort_index()
elif method == "top_present":
    # Keep genes that are non-zero in strictly more than `threshold` rows.
    # NOTE(review): the notebook introduction describes the cutoff as 5% of
    # the study samples, while this is an absolute count compared with a
    # strict ">" - confirm the two agree for this dataset.
    amr_df = amr_df[amr_df["nvalues"] > threshold]
else:
    # Fail here with a clear message instead of a NameError on `amr_list`
    # further down.
    raise ValueError(
        "Unknown selection method %r; expected 'top_largest' or 'top_present'"
        % (method,)
    )

amr_list = list(amr_df["amr"])
print(len(amr_df))  # number of genes kept

# ---------------------------------------------------------------------------
# Attach the selected ABR gene columns to the major table and export it.
# ---------------------------------------------------------------------------

def _short_gene_name(column_name):
    """Return the last two '|'-separated fields of a header, concatenated.

    Newline characters are stripped from the header before it is split.
    """
    fields = column_name.replace("\n", "").split("|")
    return fields[-2] + fields[-1]


# Selected gene columns, indexed by sample, with shortened column names.
new_amr_df = (
    df[["sample"] + amr_list]
    .set_index("sample")
    .rename(columns=_short_gene_name)
)

# Columns removed from the merged table before it is written out.
_columns_to_remove = [
    "Clindamycin",
    "Cefotaxime",
    "sample",
    "Individual",
    "AveragePTR",
    "Cohort",
    "Antibiotic_Treatment",
    "Trimethoprim-Sulfamathoxazole",
    "Antibiotic_Treatment_unfiltered",
]

major_df = (
    pd.read_csv(major_file)
    # Left join: every row of the major table is kept, matched on "sample".
    .merge(new_amr_df, on="sample", how="left")
    # Every remaining missing value in the merged table is replaced by 1;
    # this applies to all columns, not only the gene columns.
    .fillna(1)
    .drop(columns=_columns_to_remove)
)

major_df.to_csv(out_file, index=False)

975
479
