In [35]:
import pandas as pd
import matplotlib.pyplot as plt
# Never truncate DataFrame output: show every column and every row
pd.set_option('display.max_columns', None)
pd.options.display.max_rows = None

In [36]:
# Load the raw surveillance workbook and preview its first rows
raw_data_path = r'../../data/raw/Omadacycline_2014_to_2023_Surveillance_data.xlsx'
df = pd.read_excel(raw_data_path)
df.head()

Unnamed: 0,Collection Number,Study Year,Organism,Omadacycline,Doxycycline,Minocycline,Tetracycline,Tigecycline,Oxacillin,Ceftaroline,Ceftriaxone,Amoxicillin-\nclavulanic acid,Piperacillin-\ntazobactam,Levofloxacin,Erythromycin,Clindamycin,Linezolid,Daptomycin,Vancomycin,Teicoplanin,Gentamicin,Amikacin,Ampicillin,Azithromycin,Aztreonam,Cefepime,Ceftazidime,Colistin,Imipenem,Moxifloxacin,Penicillin,Trimethoprim-sulfamethoxazole,Continent,Country,US Census Division,Nosocomial,Age,Gender,Medical Service,Infection Source,Infection Type,Source of BSI,Specimen Type,VAP,ICU,CF Patient
0,812893,2014,Escherichia coli,0.5,2.0,,1.0,0.12,,,≤0.06,4,,≤0.12,,,,,,,2,,,,0.25,,0.25,,≤0.12,,,≤0.5,North America,USA,2: Middle Atlantic,community-acquired,23.0,F,Emergency,Urine/Urinary tract,urinary tract infection,,Urine/Urinary tract,,non-ICU,
1,812906,2014,Klebsiella pneumoniae,1.0,1.0,,1.0,0.12,,,≤0.06,2,,≤0.12,,,,,,,≤1,,,,≤0.12,,0.12,,≤0.12,,,>4,North America,USA,2: Middle Atlantic,community-acquired,52.0,F,Emergency,Urine/Urinary tract,urinary tract infection,,Urine/Urinary tract,,non-ICU,
2,812929,2014,Escherichia coli,0.5,1.0,≤0.004,1.0,0.06,,,≤0.06,4,,≤0.12,,,,,,,≤1,,,,≤0.12,,0.12,,≤0.12,,,≤0.5,North America,USA,2: Middle Atlantic,community-acquired,1.0,F,Ambulatory/Outpatient,Urine/Urinary tract,urinary tract infection,,Urine/Urinary tract,,non-ICU,
3,812965,2014,Streptococcus pneumoniae,0.03,0.12,,0.25,0.03,,,≤0.06,≤1,,1,≤0.12,≤0.25,,,,,,,,,,,,,,,≤0.06,≤0.5,North America,USA,9: Pacific,,59.0,F,Cardiothoracic/Pulmonary,Blood culture,community-acquired respiratory tract infection,,Blood culture,,,
4,812969,2014,Staphylococcus aureus,0.12,0.12,,0.12,0.06,>2,,,,,4,>16,>2,1.0,0.5,2.0,,≤1,,,,,,,,,,,≤0.5,North America,USA,9: Pacific,nosocomial,59.0,F,,,bloodstream infection,,Blood culture,,,


In [37]:
# Harmonise the KEYSTONE column names with the Hilmit structure.

# Isolate metadata columns that carry a different name in the raw export
metadata_renames = {
    'Collection Number': 'Isolate Number',
    'Organism': 'Species',
    'Study Year': 'Year',
    'Infection Source': 'Source of Infection',
}

# Antibiotic columns. Most names already match and map to themselves; the two
# headers containing a line break are normalised. With the default
# errors='ignore', `rename` skips keys that are not columns of the frame.
antibiotic_renames = {
    'Amoxicillin-\nclavulanic acid': 'Amoxicillin-clavulanate',
    'Ampicillin': 'Ampicillin',
    'Piperacillin-\ntazobactam': 'Piperacillin tazobactam',
    'Ceftriaxone': 'Ceftriaxone',
    'Ceftazidime': 'Ceftazidime',
    'Cefepime': 'Cefepime',
    'Imipenem': 'Imipenem',
    'Meropenem': 'Meropenem',
    'Ciprofloxacin': 'Ciprofloxacin',
    'Levofloxacin': 'Levofloxacin',
    'Amikacin': 'Amikacin',
    'Gentamicin': 'Gentamicin',
    'Vancomycin': 'Vancomycin',
    'Azithromycin': 'Azithromycin',
    'Clarithromycin': 'Clarithromycin',
    'Tigecycline': 'Tigecycline',
    'Linezolid': 'Linezolid',
    'Trimethoprim-sulfamethoxazole': 'Trimethoprim-sulfamethoxazole',
}

df = df.rename(columns={**metadata_renames, **antibiotic_renames})

In [38]:
# Columns expected by the Hilmit structure: isolate metadata, then the antibiotics list
columns_to_keep = [
    # Isolate metadata
    'Isolate Number', 'Species', 'Family', 'Gender', 'Age', 'Country', 'State', 'Year',
    'Source of Infection',
    # Antibiotics
    'Amoxicillin-clavulanate', 'Ampicillin', 'Piperacillin tazobactam',
    'Ceftriaxone', 'Ceftazidime', 'Cefepime',
    'Imipenem', 'Meropenem',
    'Ciprofloxacin', 'Levofloxacin',
    'Amikacin', 'Gentamicin',
    'Vancomycin', 'Azithromycin', 'Clarithromycin',
    'Tigecycline', 'Linezolid', 'Trimethoprim-sulfamethoxazole',
]

# Of the expected columns, retain those present in the dataframe (order of the list above is preserved)
existing_columns = list(filter(lambda name: name in df.columns, columns_to_keep))

# Restrict the dataframe to those columns
df_filtered = df[existing_columns]


In [39]:
# Column layout for Hilmit: isolate metadata first, antibiotics afterwards
columns_order = (
    # Isolate metadata
    ['Isolate Number', 'Species', 'Gender', 'Age', 'Country', 'Year', 'Source of Infection']
    # Antibiotics
    + ['Amoxicillin-clavulanate', 'Ampicillin', 'Piperacillin tazobactam']
    + ['Ceftriaxone', 'Ceftazidime', 'Cefepime', 'Imipenem']
    + ['Levofloxacin', 'Amikacin', 'Gentamicin', 'Vancomycin']
    + ['Azithromycin', 'Tigecycline', 'Linezolid', 'Trimethoprim-sulfamethoxazole']
)

# Select the columns in that order (a column missing from the dataframe raises a KeyError)
df_filtered = df_filtered[columns_order]

In [40]:
# Flag the `Age` entries that cannot be parsed as a number (missing values included)
age_not_numeric = pd.to_numeric(df['Age'], errors='coerce').isna()

# Collect the distinct offending values and show them
non_numeric_ages = df.loc[age_not_numeric, 'Age'].unique()
print(non_numeric_ages)

[nan]


In [41]:
# Drop the rows that have no `Age` and make sure the column has a numeric dtype.
# `.copy()` makes df_filtered an independent frame, so the column assignments
# here and in later cells do not write into a slice of another frame
# (which would raise SettingWithCopyWarning).
df_filtered = df_filtered.dropna(subset=['Age']).copy()

# Convert the filtered frame's own column instead of re-reading `Age` from the
# unfiltered `df` and relying on index alignment to discard the dropped rows.
df_filtered['Age'] = pd.to_numeric(df_filtered['Age'])

# Sanity check: no missing ages should remain
df_filtered['Age'].isna().value_counts()

Age
False    85827
Name: count, dtype: int64

In [42]:
# Spell out the single-letter gender codes; a missing gender is labelled 'Unknown'
gender_labels = {'M': 'Male', 'F': 'Female'}
gender_spelled_out = df_filtered['Gender'].replace(gender_labels)
df_filtered['Gender'] = gender_spelled_out.fillna('Unknown')

In [43]:
# Define the function to group ages
def group_ages(age):
    """Map an age in years to its age-group label.

    The groups are contiguous half-open intervals, so fractional ages
    (e.g. 2.5 or 64.5) fall into the group of their whole-year part instead
    of slipping between two integer-bounded ranges. Whole-number ages map
    exactly as the labels state.

    Parameters
    ----------
    age : int, float or missing value
        Age in years.

    Returns
    -------
    str
        One of '0 to 2 Years', '3 to 12 Years', '13 to 18 Years',
        '19 to 64 Years', '65 to 84 Years', '85 and Over', or 'Unknown'
        when the age is missing or negative.
    """
    if pd.isna(age) or age < 0:
        return 'Unknown'

    # Each upper bound is exclusive and equals the next group's lower bound,
    # which leaves no gap between consecutive groups.
    if age < 3:
        return '0 to 2 Years'
    if age < 13:
        return '3 to 12 Years'
    if age < 19:
        return '13 to 18 Years'
    if age < 65:
        return '19 to 64 Years'
    if age < 85:
        return '65 to 84 Years'
    return '85 and Over'

# Derive the 'Age Group' label of every row from its 'Age'
df_filtered['Age Group'] = df_filtered['Age'].map(group_ages)

# The new column was appended at the end; take it out and put it back
# directly to the right of 'Age'
age_groups = df_filtered.pop('Age Group')
df_filtered.insert(df_filtered.columns.get_loc('Age') + 1, 'Age Group', age_groups)

In [44]:
# Only the 'Age Group' label is kept from here on; discard the raw 'Age' column
df_filtered = df_filtered.drop(columns='Age')


In [45]:
# Tag every row with its origin: a constant 'Data Source' column holding 'KEYSTONE',
# placed directly after 'Isolate Number'
data_source_position = df_filtered.columns.get_loc('Isolate Number') + 1
df_filtered.insert(data_source_position, 'Data Source', 'KEYSTONE')
df_filtered.head()


Unnamed: 0,Isolate Number,Data Source,Species,Gender,Age Group,Country,Year,Source of Infection,Amoxicillin-clavulanate,Ampicillin,Piperacillin tazobactam,Ceftriaxone,Ceftazidime,Cefepime,Imipenem,Levofloxacin,Amikacin,Gentamicin,Vancomycin,Azithromycin,Tigecycline,Linezolid,Trimethoprim-sulfamethoxazole
0,812893,KEYSTONE,Escherichia coli,Female,19 to 64 Years,USA,2014,Urine/Urinary tract,4,,,≤0.06,0.25,,≤0.12,≤0.12,,2,,,0.12,,≤0.5
1,812906,KEYSTONE,Klebsiella pneumoniae,Female,19 to 64 Years,USA,2014,Urine/Urinary tract,2,,,≤0.06,0.12,,≤0.12,≤0.12,,≤1,,,0.12,,>4
2,812929,KEYSTONE,Escherichia coli,Female,0 to 2 Years,USA,2014,Urine/Urinary tract,4,,,≤0.06,0.12,,≤0.12,≤0.12,,≤1,,,0.06,,≤0.5
3,812965,KEYSTONE,Streptococcus pneumoniae,Female,19 to 64 Years,USA,2014,Blood culture,≤1,,,≤0.06,,,,1,,,,,0.03,,≤0.5
4,812969,KEYSTONE,Staphylococcus aureus,Female,19 to 64 Years,USA,2014,,,,,,,,,4,,≤1,2.0,,0.06,1.0,≤0.5


In [46]:
# Save df_filtered as 'KEYSTONE_dataset.csv' in the processed-data folder, without the row index
processed_csv_path = '../../data/processed/KEYSTONE_dataset.csv'
df_filtered.to_csv(processed_csv_path, index=False)
