In [52]:
#import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [53]:
# antibiotic data
file_path = '/Users/renata.m/Desktop/Thesis/Rashidi_Data_2022/AntibioticsTable.csv'
antibiotics_data = pd.read_csv(file_path)

In [54]:
#data processing
# Remove the first two rows and reset the column headers
antibiotics_data_cleaned = antibiotics_data.drop([0, 1])
antibiotics_data_cleaned.columns = antibiotics_data_cleaned.iloc[0]
antibiotics_data_cleaned = antibiotics_data_cleaned.drop(antibiotics_data_cleaned.index[0])

# Drop rows with missing values in 'StartDayRelativeToD1Chemo' and 'StopDayRelativeToD1Chemo'
antibiotics_data_cleaned = antibiotics_data_cleaned.dropna(subset=['StartDayRelativeToD1Chemo', 'StopDayRelativeToD1Chemo'])

# Convert columns to appropriate data types for processing
antibiotics_data_cleaned['StartDayRelativeToD1Chemo'] = antibiotics_data_cleaned['StartDayRelativeToD1Chemo'].astype(int)
antibiotics_data_cleaned['StopDayRelativeToD1Chemo'] = antibiotics_data_cleaned['StopDayRelativeToD1Chemo'].astype(int)


In [55]:
# Filter data
antibiotics_data_cleaned = antibiotics_data_cleaned[
    (antibiotics_data_cleaned['StartDayRelativeToD1Chemo'] >= -5) &
    (antibiotics_data_cleaned['StopDayRelativeToD1Chemo'] <= 30)]

# Group by patient and sort by start day
grouped_patients = antibiotics_data_cleaned.groupby('Patient_ID').apply(lambda x: x.sort_values('StartDayRelativeToD1Chemo')).reset_index(drop=True)

# Function to isolate single abx administration
def filter_non_overlapping_abx(df):
    non_overlapping = []
    for i in range(len(df)):
        current_abx = df.iloc[i]
        overlap = False
        for j in range(len(df)):
            if i != j:
                other_abx = df.iloc[j]
                if current_abx['StartDayRelativeToD1Chemo'] <= other_abx['StopDayRelativeToD1Chemo'] and current_abx['StopDayRelativeToD1Chemo'] >= other_abx['StartDayRelativeToD1Chemo']:
                    overlap = True
                    if current_abx['StartDayRelativeToD1Chemo'] < other_abx['StartDayRelativeToD1Chemo']:
                         non_overlapping.append({
                            'Patient_ID': current_abx['Patient_ID'],
                            'AntibacterialABx': current_abx['AntibacterialABx'],
                            'StartDayRelativeToD1Chemo': current_abx['StartDayRelativeToD1Chemo'],
                            'StopDayRelativeToD1Chemo': other_abx['StartDayRelativeToD1Chemo'] - 1,
                            'Route': current_abx['Route']
                        })
                    break
        if not overlap:
            non_overlapping.append(current_abx.to_dict())
    return pd.DataFrame(non_overlapping)

# Apply function
filtered_abx = grouped_patients.groupby('Patient_ID').apply(filter_non_overlapping_abx).reset_index(drop=True)

filtered_abx.head()


  grouped_patients = antibiotics_data_cleaned.groupby('Patient_ID').apply(lambda x: x.sort_values('StartDayRelativeToD1Chemo')).reset_index(drop=True)
  filtered_abx = grouped_patients.groupby('Patient_ID').apply(filter_non_overlapping_abx).reset_index(drop=True)


Unnamed: 0,Patient_ID,AntibacterialABx,StartDayRelativeToD1Chemo,StopDayRelativeToD1Chemo,Route
0,7D001,Cefepime,-1.0,8.0,IV
1,7D002,Piperacillin-Tazobactam,-5.0,-5.0,IV
2,7D002,Cefepime,9.0,12.0,IV
3,7D003,Levofloxacin,1.0,8.0,PO
4,7D006,Levofloxacin,2.0,12.0,PO


In [56]:
#sra metadata
sra_metadata =  pd.read_csv('/Users/renata.m/QIIME2/qiime2-final_analysis-silva_abx/sra_metadata_time.csv')
sra_metadata = sra_metadata.drop(['Unnamed: 0'], axis=1)
sra_metadata['SampleDayRelativeToD1Chemo'] = sra_metadata['SampleDayRelativeToD1Chemo'].astype(int)

# Merge data
new_merged_data = pd.merge(sra_metadata, filtered_abx, on='Patient_ID', how='left')

# Create columns for each antibiotic indicating before, during, or after treatment
antibiotics_list = filtered_abx['AntibacterialABx'].unique()


for abx in antibiotics_list:
    col_treatment = f'Treatment_{abx}'
    new_merged_data[col_treatment] = 'No Treatment'
    
    for idx, row in new_merged_data.iterrows():
        abx_rows = filtered_abx[(filtered_abx['Patient_ID'] == row['Patient_ID']) & 
                                    (filtered_abx['AntibacterialABx'] == abx)]
        for _, abx_row in abx_rows.iterrows():
            if row['SampleDayRelativeToD1Chemo'] == (abx_row['StartDayRelativeToD1Chemo']-1):
                new_merged_data.at[idx, col_treatment] = 'Before Treatment'
            elif row['SampleDayRelativeToD1Chemo'] == (abx_row['StartDayRelativeToD1Chemo']-2):
                new_merged_data.at[idx, col_treatment] = 'Before Treatment'
            elif row['SampleDayRelativeToD1Chemo'] == (abx_row['StartDayRelativeToD1Chemo']-3):
                new_merged_data.at[idx, col_treatment] = 'Before Treatment'
            elif row['SampleDayRelativeToD1Chemo'] == (abx_row['StartDayRelativeToD1Chemo']-4):
                new_merged_data.at[idx, col_treatment] = 'Before Treatment'
            elif row['SampleDayRelativeToD1Chemo'] == (abx_row['StartDayRelativeToD1Chemo']-5):
                new_merged_data.at[idx, col_treatment] = 'Before Treatment'
            elif abx_row['StartDayRelativeToD1Chemo'] <= row['SampleDayRelativeToD1Chemo'] <= abx_row['StopDayRelativeToD1Chemo']:
                new_merged_data.at[idx, col_treatment] = 'After Treatment'
            elif row['SampleDayRelativeToD1Chemo'] == abx_row['StopDayRelativeToD1Chemo']+1:
                new_merged_data.at[idx, col_treatment] = 'After Treatment'
            elif row['SampleDayRelativeToD1Chemo'] == abx_row['StopDayRelativeToD1Chemo']+2:
                new_merged_data.at[idx, col_treatment] = 'After Treatment'
            elif row['SampleDayRelativeToD1Chemo'] == abx_row['StopDayRelativeToD1Chemo']+3:
                new_merged_data.at[idx, col_treatment] = 'After Treatment'
            elif row['SampleDayRelativeToD1Chemo'] == abx_row['StopDayRelativeToD1Chemo']+4:
                new_merged_data.at[idx, col_treatment] = 'After Treatment'


# new data
new_merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 733 entries, 0 to 732
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ID                                 733 non-null    object 
 1   Patient_ID                         733 non-null    object 
 2   SampleDayRelativeToD1Chemo         733 non-null    int64  
 3   Age                                733 non-null    int64  
 4   Sex                                733 non-null    object 
 5   Disease_Phase                      733 non-null    object 
 6   Chemotherapy                       733 non-null    object 
 7   FirstNFDayRelativeToD1Chemo        712 non-null    float64
 8   DiarrheaStartDayRelativeToD1Chemo  420 non-null    float64
 9   DiarrheaEndDayRelativeToD1Chemo    420 non-null    float64
 10  FirstCDiffDayRelativeToD1Chemo     294 non-null    float64
 11  CDiff_infection                    733 non-null    int64  

In [57]:
new_merged_data = new_merged_data.drop(['AntibacterialABx', 'StartDayRelativeToD1Chemo', 'StopDayRelativeToD1Chemo','Route','Treatment_TMP-SMX', 'Treatment_Cefazolin', 'Treatment_Amox-Clav'], axis=1)
new_merged_data = new_merged_data.drop_duplicates()
new_merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 494 entries, 0 to 732
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ID                                 494 non-null    object 
 1   Patient_ID                         494 non-null    object 
 2   SampleDayRelativeToD1Chemo         494 non-null    int64  
 3   Age                                494 non-null    int64  
 4   Sex                                494 non-null    object 
 5   Disease_Phase                      494 non-null    object 
 6   Chemotherapy                       494 non-null    object 
 7   FirstNFDayRelativeToD1Chemo        473 non-null    float64
 8   DiarrheaStartDayRelativeToD1Chemo  289 non-null    float64
 9   DiarrheaEndDayRelativeToD1Chemo    289 non-null    float64
 10  FirstCDiffDayRelativeToD1Chemo     176 non-null    float64
 11  CDiff_infection                    494 non-null    int64  
 12 

In [58]:
new_merged_data = new_merged_data.rename({'Treatment_Piperacillin-Tazobactam':'Treatment_Piperacillin_Tazobactam'}, axis=1)
new_merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 494 entries, 0 to 732
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ID                                 494 non-null    object 
 1   Patient_ID                         494 non-null    object 
 2   SampleDayRelativeToD1Chemo         494 non-null    int64  
 3   Age                                494 non-null    int64  
 4   Sex                                494 non-null    object 
 5   Disease_Phase                      494 non-null    object 
 6   Chemotherapy                       494 non-null    object 
 7   FirstNFDayRelativeToD1Chemo        473 non-null    float64
 8   DiarrheaStartDayRelativeToD1Chemo  289 non-null    float64
 9   DiarrheaEndDayRelativeToD1Chemo    289 non-null    float64
 10  FirstCDiffDayRelativeToD1Chemo     176 non-null    float64
 11  CDiff_infection                    494 non-null    int64  
 12 

In [61]:
#merge with abx decay
abx_decay_df = pd.read_csv('/Users/renata.m/QIIME2/qiime2-final_analysis-silva_abx/antibiotic_decay.csv')
abx_decay_df = abx_decay_df.drop(['Unnamed: 0'], axis=1)
merged_data_abx = new_merged_data.merge(abx_decay_df, on='ID', how='inner')
merged_data_abx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494 entries, 0 to 493
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ID                                 494 non-null    object 
 1   Patient_ID                         494 non-null    object 
 2   SampleDayRelativeToD1Chemo         494 non-null    int64  
 3   Age                                494 non-null    int64  
 4   Sex                                494 non-null    object 
 5   Disease_Phase                      494 non-null    object 
 6   Chemotherapy                       494 non-null    object 
 7   FirstNFDayRelativeToD1Chemo        473 non-null    float64
 8   DiarrheaStartDayRelativeToD1Chemo  289 non-null    float64
 9   DiarrheaEndDayRelativeToD1Chemo    289 non-null    float64
 10  FirstCDiffDayRelativeToD1Chemo     176 non-null    float64
 11  CDiff_infection                    494 non-null    int64  

In [60]:
#data for qiime2 analysis
#merged_data_abx.to_csv('metadata_abx_before_after_5d.csv')