In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import re
import numpy as np
pd.options.display.max_columns = None
pd.set_option('display.max_rows', None)

In [23]:
# Load the datasets (make sure the CSV files are in the correct path)
atlas_df = pd.read_csv(r'../../data/processed/ATLAS_dataset.csv')
keystone_df = pd.read_csv(r'../../data/processed/KEYSTONE_dataset.csv')
gears_df = pd.read_csv(r'../../data/processed/GEARS_dataset.csv')

# Combine the datasets
df = pd.concat([atlas_df, keystone_df, gears_df], ignore_index=True)

# Verify that all columns are consistent and combined correctly
print(df.info())  # Optional: Check the structure of the combined dataset


Columns (14,16,19,25) have mixed types. Specify dtype option on import or set low_memory=False.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032241 entries, 0 to 1032240
Data columns (total 26 columns):
 #   Column                         Non-Null Count    Dtype 
---  ------                         --------------    ----- 
 0   Isolate Number                 1032241 non-null  int64 
 1   Data Source                    1032241 non-null  object
 2   Species                        1032241 non-null  object
 3   Gender                         1022380 non-null  object
 4   Age Group                      1032241 non-null  object
 5   Country                        1032241 non-null  object
 6   Year                           1032241 non-null  int64 
 7   Source of Infection            1024491 non-null  object
 8   Amoxicillin-clavulanate        809468 non-null   object
 9   Ampicillin                     860604 non-null   object
 10  Piperacillin tazobactam        891274 non-null   object
 11  Ceftriaxone                    617909 non-null   object
 12  Ceftazidime                 

In [24]:
df.head()

Unnamed: 0,Isolate Number,Data Source,Species,Gender,Age Group,Country,Year,Source of Infection,Amoxicillin-clavulanate,Ampicillin,Piperacillin tazobactam,Ceftriaxone,Ceftazidime,Cefepime,Imipenem,Meropenem,Ciprofloxacin,Levofloxacin,Amikacin,Gentamicin,Vancomycin,Azithromycin,Clarithromycin,Tigecycline,Linezolid,Trimethoprim-sulfamethoxazole
0,1000000,ATLAS,Pseudomonas aeruginosa,Male,85 and Over,France,2013,Urine,>32,>32,64.0,>32,16,32,,2.0,,>8,8,,,,,>8,,
1,1000001,ATLAS,Pseudomonas aeruginosa,Female,13 to 18 Years,France,2013,Ear,>32,>32,32.0,>32,8,16,,1.0,,>8,4,,,,,>8,,
2,1000002,ATLAS,Pseudomonas aeruginosa,Female,65 to 84 Years,France,2013,Urine,>32,>32,64.0,>32,>16,16,,8.0,,>8,4,,,,,8,,
3,1000003,ATLAS,Pseudomonas aeruginosa,Male,19 to 64 Years,France,2013,Skin,>32,>32,8.0,>32,2,8,,1.0,,>8,4,,,,,8,,
4,1000004,ATLAS,Serratia marcescens,Male,19 to 64 Years,France,2013,Blood,>32,>32,0.5,0.12,<=1,<=0.5,,0.12,,0.06,2,,,,,0.5,,


In [25]:
# Export the combined dataset to a CSV file
df.to_csv('../../data/processed/combined_dataset.csv', index=False)


In [26]:
# List of species of interest
species_of_interest = [
    "Escherichia coli", "Enterococcus faecalis", "Enterococcus faecium",
    "Staphylococcus aureus", "Staphylococcus capitis", "Staphylococcus cohnii",
    "Staphylococcus epidermidis", "Staphylococcus hominis", "Staphylococcus hyicus",
    "Staphylococcus lugdunensis", "Staphylococcus pettenkoferi", "Staphylococcus saprophyticus",
    "Staphylococcus schleiferi", "Staphylococcus sciuri", "Staphylococcus simulans",
    "Staphylococcus warneri", "Staphylococcus xylosus", "Klebsiella pneumoniae",
    "Klebsiella oxytoca", "Klebsiella aerogenes", "Enterobacter cloacae",
    "Enterobacter kobei", "Serratia marcescens", "Proteus mirabilis", "Proteus vulgaris",
    "Citrobacter freundii", "Citrobacter koseri", "Morganella morganii",
    "Providencia stuartii", "Providencia rettgeri", "Streptococcus pyogenes",
    "Streptococcus agalactiae", "Streptococcus dysgalactiae", "Streptococcus canis",
    "Haemophilus influenzae", "Streptococcus pneumoniae", "Pseudomonas aeruginosa",
    ]

# Filter the dataframe to include only rows with species of interest
df_filtered_species = df[df['Species'].isin(species_of_interest)]

In [27]:
# Dictionary to map species to their respective families
species_to_family = {
    "Escherichia coli": "Enterobacteriaceae",
    "Klebsiella pneumoniae": "Enterobacteriaceae",
    "Klebsiella oxytoca": "Enterobacteriaceae",
    "Klebsiella aerogenes": "Enterobacteriaceae",
    "Enterobacter cloacae": "Enterobacteriaceae",
    "Enterobacter kobei": "Enterobacteriaceae",
    "Serratia marcescens": "Enterobacteriaceae",
    "Proteus mirabilis": "Enterobacteriaceae",
    "Proteus vulgaris": "Enterobacteriaceae",
    "Citrobacter freundii": "Enterobacteriaceae",
    "Citrobacter koseri": "Enterobacteriaceae",
    "Morganella morganii": "Enterobacteriaceae",
    "Providencia stuartii": "Enterobacteriaceae",
    "Providencia rettgeri": "Enterobacteriaceae",
    "Enterococcus faecalis": "Enterococcaceae",
    "Enterococcus faecium": "Enterococcaceae",
    "Staphylococcus aureus": "Staphylococcaceae",
    "Staphylococcus capitis": "Staphylococcaceae",
    "Staphylococcus cohnii": "Staphylococcaceae",
    "Staphylococcus epidermidis": "Staphylococcaceae",
    "Staphylococcus hominis": "Staphylococcaceae",
    "Staphylococcus hyicus": "Staphylococcaceae",
    "Staphylococcus lugdunensis": "Staphylococcaceae",
    "Staphylococcus pettenkoferi": "Staphylococcaceae",
    "Staphylococcus saprophyticus": "Staphylococcaceae",
    "Staphylococcus schleiferi": "Staphylococcaceae",
    "Staphylococcus sciuri": "Staphylococcaceae",
    "Staphylococcus simulans": "Staphylococcaceae",
    "Staphylococcus warneri": "Staphylococcaceae",
    "Staphylococcus xylosus": "Staphylococcaceae",
    "Streptococcus pyogenes": "Streptococcaceae",
    "Streptococcus agalactiae": "Streptococcaceae",
    "Streptococcus dysgalactiae": "Streptococcaceae",
    "Streptococcus canis": "Streptococcaceae",
    "Streptococcus pneumoniae": "Streptococcaceae",
    "Pseudomonas aeruginosa": "Pseudomonadaceae",
    "Haemophilus influenzae": "Pasteurellaceae",
    }

# Creating a new column "Family" based on the mapping
df['Family'] = df['Species'].map(species_to_family)

# Reordering the columns to place "Family" right after "Species"
cols = df.columns.tolist()
species_index = cols.index('Species')
cols.insert(species_index + 1, cols.pop(cols.index('Family')))
df = df[cols]

In [28]:
# Filtering the dataframe to include only the data from the years 2018 to 2022
df = df[df['Year'].isin([2018, 2019, 2020, 2021, 2022])]

In [29]:
# Count the number of unique isolates in the "Isolate Number" column
number_of_isolates = df['Isolate Number'].nunique()

# Display the total number of unique isolates
number_of_isolates


353553

In [30]:
# Calculate the distribution of isolates based on the "Data Source" column
isolate_distribution = df['Data Source'].value_counts()

# Display the distribution of isolates
isolate_distribution


Data Source
ATLAS       283919
KEYSTONE     40269
GEARS        29365
Name: count, dtype: int64

In [31]:
# Calculate the distribution of isolates based on the "Species" column
species_distribution = df['Species'].value_counts()

# Display the distribution of species
species_distribution


Species
Staphylococcus aureus                                       53770
Pseudomonas aeruginosa                                      53544
Escherichia coli                                            46080
Klebsiella pneumoniae                                       39969
Acinetobacter baumannii                                     15329
Enterococcus faecalis                                       13445
Streptococcus pneumoniae                                    11897
Enterobacter cloacae                                         8112
Serratia marcescens                                          6891
Haemophilus influenzae                                       6836
Proteus mirabilis                                            6789
Enterococcus faecium                                         6649
Klebsiella oxytoca                                           6466
Staphylococcus epidermidis                                   6145
Streptococcus agalactiae                                     5289
St

In [32]:
# Calculate the distribution of species based on their families with percentages rounded to 2 decimal places
family_distribution = df['Family'].value_counts(normalize=True) * 100
family_distribution = family_distribution.round(2)

# Display the distribution of species by family in percentage with 2 decimal places
family_distribution


Family
Enterobacteriaceae    45.25
Staphylococcaceae     20.25
Pseudomonadaceae      17.93
Streptococcaceae       7.54
Enterococcaceae        6.73
Pasteurellaceae        2.29
Name: proportion, dtype: float64

In [33]:
# Modifying the "Gender" column to only have "Male", "Female", and "Unknown"
df['Gender'] = df['Gender'].apply(lambda x: x if x in ['Male', 'Female'] else 'Unknown')

# Display the unique values in the modified "Gender" column to confirm changes
df['Gender'].value_counts()

Gender
Male       200080
Female     150374
Unknown      3099
Name: count, dtype: int64

In [34]:
# After modifying the "Gender" column, now calculating the value counts and percentages
gender_counts_percentage = df['Gender'].value_counts(normalize=True) * 100
gender_counts_percentage_rounded = gender_counts_percentage.round(2)

# Displaying the modified gender counts with percentages
gender_counts_percentage_rounded

Gender
Male       56.59
Female     42.53
Unknown     0.88
Name: proportion, dtype: float64

In [35]:
# Creating a pie chart using Plotly for the modified gender distribution
fig = px.pie(
    names=gender_counts_percentage_rounded.index,
    values=gender_counts_percentage_rounded,
    title="Percentages of Gender Distribution",
    color_discrete_sequence=px.colors.qualitative.Pastel
)

# Adjusting the size of the chart
fig.update_layout(
    width=500,  # Set the width of the chart
    height=400  # Set the height of the chart
)
# Display the pie chart
fig.show()


In [36]:
# ACalculate the counts for each `Age Group`
age_group_counts = df['Age Group'].value_counts()

age_group_counts

Age Group
19 to 64 Years    158107
65 to 84 Years    123909
85 and Over        25885
0 to 2 Years       18982
3 to 12 Years      14568
13 to 18 Years      7205
Unknown             4897
Name: count, dtype: int64

In [37]:
# Custom sorting order for the provided age groups
age_group_order = [
    "0 to 2 Years", "3 to 12 Years", "13 to 18 Years",
    "19 to 64 Years", "65 to 84 Years", "85 and Over", "Unknown"
]

# Converting 'Age Group' to a categorical type with the specified order
df['Age Group'] = pd.Categorical(df['Age Group'], categories=age_group_order, ordered=True)

# Sorting the value counts based on the specified order
age_group_counts_sorted = df['Age Group'].value_counts().sort_index()

# Creating a bar graph using Plotly Express with a single color (blue)
fig = px.bar(
    x=age_group_counts_sorted.index,
    y=age_group_counts_sorted.values,
    title="Age Group Distribution",
    labels={'x':'Age Group', 'y':'Count'},
    color_discrete_sequence=['blue']  # Set the color of all bars to blue
)

# Display the bar chart
fig.show()



In [38]:
# Check the unique countries in the dataset
unique_countries = df['Country'].unique()

# Display the list of unique countries
print(unique_countries)


['Spain' 'Mexico' 'United States' 'Israel' 'Czech Republic' 'Colombia'
 'Taiwan' 'Hungary' 'Belgium' 'France' 'Philippines' 'Korea, South'
 'China' 'Greece' 'India' 'South Africa' 'Portugal' 'Lithuania'
 'United Kingdom' 'Latvia' 'Italy' 'Dominican Republic' 'Germany'
 'Australia' 'Venezuela' 'Panama' 'Finland' 'Canada' 'Japan' 'Switzerland'
 'Netherlands' 'Costa Rica' 'Thailand' 'Brazil' 'Hong Kong' 'Nigeria'
 'Croatia' 'Turkey' 'Argentina' 'Poland' 'Denmark' 'Chile' 'Singapore'
 'Ireland' 'Guatemala' 'Sweden' 'Saudi Arabia' 'Kuwait' 'Ukraine'
 'Romania' 'Russia' 'Morocco' 'Jordan' 'Malaysia' 'Ivory Coast'
 'New Zealand' 'Qatar' 'Slovenia' 'Cameroon' 'Bulgaria' 'Malawi' 'Kenya'
 'Uganda' 'Ghana' 'USA' 'UK' 'Belarus' 'Ecuador' 'Puerto Rico' 'Vietnam']


In [39]:
# Create a dataframe for the countries
data = pd.DataFrame({'Country': unique_countries, 'Value': [1] * len(unique_countries)})

# Plotting the world map with the countries highlighted using a uniform color
fig = px.choropleth(
    data,
    locations="Country",
    locationmode="country names",
    hover_name="Country",
    title="Participating Countries",
    color_discrete_sequence=['#1f77b4'],  # Set a single color for all countries (blue)
)

# Remove the color scale by disabling color axis
fig.update_layout(coloraxis_showscale=False)

# Adjust the size of the figure to reduce white space
fig.update_layout(
    width=800,  # Set the width of the map
    height=500  # Set the height of the map
)

# Display the map
fig.show()


In [40]:
# Filtering the data to include only the years 2018, 2019, 2020, 2021, and 2022
filtered_yearly_data = df[df['Year'].isin([2018, 2019, 2020, 2021, 2022])]

# Grouping data by year and counting the number of entries for each year
yearly_data = filtered_yearly_data['Year'].value_counts().sort_index()

# Creating the time series plot using Plotly Express
fig = px.line(
    x=yearly_data.index,
    y=yearly_data.values,
    labels={'x': 'Year', 'y': 'Number of Entries'},
    title='Time Series Analysis of Isolate Collection (2018-2022)',
    markers=True
)

# Adjusting the x-axis to show only the selected years
fig.update_layout(
    xaxis=dict(tickmode='array', tickvals=[2018, 2019, 2020, 2021, 2022]),
    yaxis=dict(tickformat=',', title_font=dict(size=16)),  # Display full numbers with commas
    yaxis_title="Number of Entries",
    font=dict(size=14),  # Adjusting the font size for the plot
)

# Display the plot
fig.show()


In [41]:
# Getting the unique texts from the 'Source of Infection' column
unique_sources_of_infection = df['Source of Infection'].unique()

# Display the unique values
unique_sources_of_infection


array(['Abscess', 'Sputum', 'Urine', 'Peritoneal Fluid', 'Wound',
       'Endotracheal aspirate', 'Furuncle', 'Bronchus', 'Gastric Abscess',
       'Ulcer', 'Blood', 'None Given', 'Bronchoalveolar lavage',
       'Gall Bladder', 'Respiratory: Other', 'Pancreas', 'Stomach',
       'Burn', 'Cellulitis', 'Lungs', 'Thoracentesis Fluid',
       'Skin: Other', 'Intestinal: Other', 'Decubitus', 'Diverticulum',
       'Appendix', 'Kidney', 'Carbuncle', 'Impetiginous lesions',
       'Urethra', 'Colon', 'Bladder', 'Liver', 'Genitourinary: Other',
       'Prostate', 'Ureter', 'Tissue Fluid', 'Bile', 'Instruments: Other',
       'Ear', 'Abdominal Fluid', 'Skin', 'Feces/Stool', 'Bone', 'Exudate',
       'Respiratory: Sinuses', 'Penis', 'Vagina', 'Throat',
       'Pleural Fluid', 'Mouth', 'Rectum', 'Circulatory: Other', 'Nails',
       'Aspirate', 'HEENT: Other', 'Integumentary (Skin Nail Hair)',
       'Eye', 'Brain', 'Lymph Nodes', 'Bronchiole', 'Bodily Fluids',
       'Peripheral Nerves', 'Spina

In [42]:
# Define a function to classify sources into categories
def classify_source(source):
    respiratory = [
        'Sputum', 'Endotracheal aspirate', 'Bronchus', 'Bronchoalveolar lavage',
        'Lungs', 'Trachea', 'Lower respiratory tract', 'Nasopharyngeal Aspirate',
        'Upper Respiratory Tract', 'Respiratory: Sputum', 'Respiratory: Bronchoalveolar lavage',
        'Respiratory: Endotracheal aspirate', 'Respiratory: Bronchial brushing', 'Respiratory: Lungs',
        'Respiratory: Bronchials', 'Respiratory: Sinuses'
    ]
    
    urinary_gu = [
        'Urine', 'Kidney', 'Ureter', 'Urethra', 'Bladder', 'Prostate', 'Genitourinary: Other',
        'Urethral catheter (Foley)', 'GU: Urine', 'GU: Kidneys', 'GU: Urinary Bladder', 
        'GU: Prostate', 'GU: Ureter', 'GU: Other'
    ]
    
    gastrointestinal = [
        'Peritoneal Fluid', 'Gall Bladder', 'Stomach', 'Pancreas', 'Intestinal: Other', 
        'Colon', 'Appendix', 'Ileum', 'GI tract/Bowel', 'GI: Gall Bladder', 'GI: Stomach', 
        'GI: Appendix', 'GI: Large Colon', 'GI: Small Colon', 'GI: Liver', 'GI: Pancreas', 
        'GI: Other', 'GI: Diverticulum', 'GI: Abscess'
    ]
    
    skin_integumentary = [
        'Abscess', 'Furuncle', 'Ulcer', 'Wound', 'Burn', 'Skin: Other', 'Cellulitis', 'Decubitus',
        'Carbuncle', 'Impetiginous lesions', 'Pyoderma Lesion', 'Integumentary (Skin Nail Hair)',
        'Skin', 'INT: Abscess', 'INT: Burn', 'INT: Decubitus', 'INT: Skin Ulcer', 'INT: Wound',
        'INT: Cellulitis/Erysipelas', 'INT: Furuncle', 'INT: Carbuncle', 'INT: Impetiginous lesions', 
        'Bodily Fluids: Abscess / Pus'
    ]
    
    circulatory_cvs = ['Blood', 'Blood culture', 'CVS: Blood', 'Endocarditis']
    
    cns = ['Brain', 'Spinal Cord', 'Cerebrospinal Fluid', 'CSF']
    
    musculoskeletal = ['Bone', 'Muscle', 'Bone Marrow', 'Bone/Joint culture', 'Bone/Joint fluid']
    
    reproductive = ['Ovary', 'Testis', 'Placenta', 'Cervix', 'Fallopian Tubes', 'Uterus', 'Genital tract']
    
    lymphatic_immune = ['Lymph Nodes', 'Lymphatic Fluid']
    
    heent = [
        'Ear', 'Throat', 'Nose', 'Mouth', 'Eye', 'Ears, nose, throat', 'HEENT: Eyes', 
        'HEENT: Other', 'Eye/conjunctiva', 'Ears, nose, throat, sinus'
    ]
    
    other_bodily_fluids = [
        'Peritoneal fluid', 'Thoracentesis Fluid', 'Tissue Fluid', 'Bile', 'Abdominal Fluid', 
        'Exudate', 'Pleural Fluid', 'Bodily Fluids', 'Bodily Fluids: Peritoneal', 'Bodily Fluids: Thoracentesis',
        'Bodily Fluids: Tissue'
    ]
    
    instrumentation = [
        'Catheters', 'Drains', 'Dialysis Line', 'Catheter tip', 'Intravenous/IV Line', 'Ureteral catheter'
    ]
    
    unknown_unspecified = ['None Given', 'Unknown', 'Other']
    
    # Classify the source
    if source in respiratory:
        return 'Respiratory System'
    elif source in urinary_gu:
        return 'Urinary/Genitourinary System'
    elif source in gastrointestinal:
        return 'Gastrointestinal System'
    elif source in skin_integumentary:
        return 'Skin/Integumentary System'
    elif source in circulatory_cvs:
        return 'Circulatory/Cardiovascular System'
    elif source in cns:
        return 'Central Nervous System'
    elif source in musculoskeletal:
        return 'Musculoskeletal System'
    elif source in reproductive:
        return 'Reproductive System'
    elif source in lymphatic_immune:
        return 'Lymphatic/Immune System'
    elif source in heent:
        return 'HEENT (Head, Eyes, Ears, Nose, Throat)'
    elif source in other_bodily_fluids:
        return 'Other/Bodily Fluids'
    elif source in instrumentation:
        return 'Instrumentation/Devices'
    elif source in unknown_unspecified:
        return 'Unknown/Unspecified'
    else:
        return 'Other'

# Apply the classification function to your data
df['Source Category'] = df['Source of Infection'].apply(classify_source)

# Display the result
df[['Source of Infection', 'Source Category']].head()


Unnamed: 0,Source of Infection,Source Category
315210,Abscess,Skin/Integumentary System
315211,Sputum,Respiratory System
315212,Urine,Urinary/Genitourinary System
315213,Peritoneal Fluid,Gastrointestinal System
315214,Urine,Urinary/Genitourinary System


In [43]:
# Set the source category as source of infection and drop source of infection
df['Source of Infection'] = df['Source Category']
df.drop(columns=['Source Category'], inplace=True, axis=1)

In [44]:
# List of antibiotics
antibiotics = [
    'Amoxicillin-clavulanate', 'Ampicillin', 'Piperacillin tazobactam', 'Ceftriaxone', 'Ceftazidime',
    'Cefepime', 'Imipenem', 'Meropenem', 'Ciprofloxacin', 'Levofloxacin', 'Amikacin', 'Gentamicin',
    'Vancomycin', 'Azithromycin', 'Clarithromycin', 'Tigecycline', 'Linezolid', 'Trimethoprim-sulfamethoxazole'
    ]

# Loop through each antibiotic and insert the corresponding _I column after it
for antibiotic in antibiotics:
    # Find the position of the antibiotic column
    pos = df.columns.get_loc(antibiotic) + 1

    # Insert the new _I column right after the antibiotic column
    df.insert(pos, antibiotic + '_I', None)

# Print the first few rows of the updated data
df.head()

Unnamed: 0,Isolate Number,Data Source,Species,Family,Gender,Age Group,Country,Year,Source of Infection,Amoxicillin-clavulanate,Amoxicillin-clavulanate_I,Ampicillin,Ampicillin_I,Piperacillin tazobactam,Piperacillin tazobactam_I,Ceftriaxone,Ceftriaxone_I,Ceftazidime,Ceftazidime_I,Cefepime,Cefepime_I,Imipenem,Imipenem_I,Meropenem,Meropenem_I,Ciprofloxacin,Ciprofloxacin_I,Levofloxacin,Levofloxacin_I,Amikacin,Amikacin_I,Gentamicin,Gentamicin_I,Vancomycin,Vancomycin_I,Azithromycin,Azithromycin_I,Clarithromycin,Clarithromycin_I,Tigecycline,Tigecycline_I,Linezolid,Linezolid_I,Trimethoprim-sulfamethoxazole,Trimethoprim-sulfamethoxazole_I
315210,1773105,ATLAS,Citrobacter freundii,Enterobacteriaceae,Male,19 to 64 Years,Spain,2018,Skin/Integumentary System,>16,,>16,,1,,,,0.5,,<=0.12,,2.0,,<=0.06,,<=0.12,,<=0.25,,2,,0.5,,,,,,,,0.25,,,,<=1,
315211,1773106,ATLAS,Citrobacter freundii,Enterobacteriaceae,Female,65 to 84 Years,Spain,2018,Respiratory System,>16,,>16,,32,,,,128.0,,1,,0.5,,<=0.06,,<=0.12,,<=0.25,,1,,0.5,,,,,,,,0.25,,,,<=1,
315212,1773107,ATLAS,Citrobacter freundii,Enterobacteriaceae,Female,65 to 84 Years,Spain,2018,Urinary/Genitourinary System,>16,,>16,,2,,,,0.5,,<=0.12,,1.0,,<=0.06,,<=0.12,,<=0.25,,1,,0.5,,,,,,,,0.25,,,,<=1,
315213,1773108,ATLAS,Citrobacter freundii,Enterobacteriaceae,Male,65 to 84 Years,Spain,2018,Gastrointestinal System,>16,,>16,,2,,,,0.5,,<=0.12,,1.0,,0.25,,<=0.12,,<=0.25,,2,,0.25,,,,,,,,0.25,,,,<=1,
315214,1773109,ATLAS,Citrobacter koseri,Enterobacteriaceae,Male,65 to 84 Years,Spain,2018,Urinary/Genitourinary System,4,,>16,,2,,,,0.25,,<=0.12,,0.12,,<=0.06,,<=0.12,,<=0.25,,1,,0.25,,,,,,,,0.25,,,,<=1,


In [45]:
# Function to filter the dataframe by family
def filter_by_family(df, family_name):
    """
    Filters the dataframe by the provided family name.

    Parameters:
    df (DataFrame): The input dataframe.
    family_name (str): The family name to filter by.

    Returns:
    DataFrame: Filtered dataframe.
    """
    filtered_df = df[df['Family'] == family_name].copy()
    return filtered_df

# Function to filter Streptococcus pneumoniae separately from Streptococcaceae
def filter_streptococcus_pneumoniae(df):
    """
    Filters the dataframe for Streptococcus pneumoniae in the Streptococcaceae family.

    Parameters:
    df (DataFrame): The input dataframe.

    Returns:
    DataFrame: Filtered dataframe containing only Streptococcus pneumoniae.
    """
    filtered_df = df[(df['Family'] == 'Streptococcaceae') & (df['Species'] == 'Streptococcus pneumoniae')].copy()
    return filtered_df

# Function to filter the rest of Streptococcaceae without Streptococcus pneumoniae
def filter_streptococcaceae_without_pneumoniae(df):
    """
    Filters the dataframe for Streptococcaceae family excluding Streptococcus pneumoniae.

    Parameters:
    df (DataFrame): The input dataframe.

    Returns:
    DataFrame: Filtered dataframe excluding Streptococcus pneumoniae.
    """
    filtered_df = df[(df['Family'] == 'Streptococcaceae') & (df['Species'] != 'Streptococcus pneumoniae')].copy()
    return filtered_df

# Function to filter Staphylococcaceae family
def filter_staphylococcaceae(df):
    """
    Filters the dataframe for the Staphylococcaceae family.

    Parameters:
    df (DataFrame): The input dataframe.

    Returns:
    DataFrame: Filtered dataframe containing only Staphylococcaceae.
    """
    filtered_df = df[df['Family'] == 'Staphylococcaceae'].copy()
    return filtered_df

# Example usage for Enterobacteriaceae
enterobacteriaceae_df = filter_by_family(df, 'Enterobacteriaceae')

# Example usage for Pseudomonadaceae
pseudomonadaceae_df = filter_by_family(df, 'Pseudomonadaceae')

# Example usage for Streptococcaceae excluding Streptococcus pneumoniae
streptococcaceae_without_pneumoniae_df = filter_streptococcaceae_without_pneumoniae(df)

# Example usage for Streptococcus pneumoniae
streptococcus_pneumoniae_df = filter_streptococcus_pneumoniae(df)

# Example usage for Enterococcaceae
enterococcaceae_df = filter_by_family(df, 'Enterococcaceae')

# Example usage for Pasteurellaceae
pasteurellaceae_df = filter_by_family(df, 'Pasteurellaceae')

# Example usage for Staphylococcaceae
staphylococcaceae_df = filter_staphylococcaceae(df)


In [46]:
# Display filtered dataframes
enterobacteriaceae_df.head()


Unnamed: 0,Isolate Number,Data Source,Species,Family,Gender,Age Group,Country,Year,Source of Infection,Amoxicillin-clavulanate,Amoxicillin-clavulanate_I,Ampicillin,Ampicillin_I,Piperacillin tazobactam,Piperacillin tazobactam_I,Ceftriaxone,Ceftriaxone_I,Ceftazidime,Ceftazidime_I,Cefepime,Cefepime_I,Imipenem,Imipenem_I,Meropenem,Meropenem_I,Ciprofloxacin,Ciprofloxacin_I,Levofloxacin,Levofloxacin_I,Amikacin,Amikacin_I,Gentamicin,Gentamicin_I,Vancomycin,Vancomycin_I,Azithromycin,Azithromycin_I,Clarithromycin,Clarithromycin_I,Tigecycline,Tigecycline_I,Linezolid,Linezolid_I,Trimethoprim-sulfamethoxazole,Trimethoprim-sulfamethoxazole_I
315210,1773105,ATLAS,Citrobacter freundii,Enterobacteriaceae,Male,19 to 64 Years,Spain,2018,Skin/Integumentary System,>16,,>16,,1,,,,0.5,,<=0.12,,2.0,,<=0.06,,<=0.12,,<=0.25,,2,,0.5,,,,,,,,0.25,,,,<=1,
315211,1773106,ATLAS,Citrobacter freundii,Enterobacteriaceae,Female,65 to 84 Years,Spain,2018,Respiratory System,>16,,>16,,32,,,,128.0,,1,,0.5,,<=0.06,,<=0.12,,<=0.25,,1,,0.5,,,,,,,,0.25,,,,<=1,
315212,1773107,ATLAS,Citrobacter freundii,Enterobacteriaceae,Female,65 to 84 Years,Spain,2018,Urinary/Genitourinary System,>16,,>16,,2,,,,0.5,,<=0.12,,1.0,,<=0.06,,<=0.12,,<=0.25,,1,,0.5,,,,,,,,0.25,,,,<=1,
315213,1773108,ATLAS,Citrobacter freundii,Enterobacteriaceae,Male,65 to 84 Years,Spain,2018,Gastrointestinal System,>16,,>16,,2,,,,0.5,,<=0.12,,1.0,,0.25,,<=0.12,,<=0.25,,2,,0.25,,,,,,,,0.25,,,,<=1,
315214,1773109,ATLAS,Citrobacter koseri,Enterobacteriaceae,Male,65 to 84 Years,Spain,2018,Urinary/Genitourinary System,4,,>16,,2,,,,0.25,,<=0.12,,0.12,,<=0.06,,<=0.12,,<=0.25,,1,,0.25,,,,,,,,0.25,,,,<=1,


In [47]:
pseudomonadaceae_df.head()

Unnamed: 0,Isolate Number,Data Source,Species,Family,Gender,Age Group,Country,Year,Source of Infection,Amoxicillin-clavulanate,Amoxicillin-clavulanate_I,Ampicillin,Ampicillin_I,Piperacillin tazobactam,Piperacillin tazobactam_I,Ceftriaxone,Ceftriaxone_I,Ceftazidime,Ceftazidime_I,Cefepime,Cefepime_I,Imipenem,Imipenem_I,Meropenem,Meropenem_I,Ciprofloxacin,Ciprofloxacin_I,Levofloxacin,Levofloxacin_I,Amikacin,Amikacin_I,Gentamicin,Gentamicin_I,Vancomycin,Vancomycin_I,Azithromycin,Azithromycin_I,Clarithromycin,Clarithromycin_I,Tigecycline,Tigecycline_I,Linezolid,Linezolid_I,Trimethoprim-sulfamethoxazole,Trimethoprim-sulfamethoxazole_I
315314,1773202,ATLAS,Pseudomonas aeruginosa,Pseudomonadaceae,Female,65 to 84 Years,Spain,2018,Respiratory System,>16,,>16,,1,,,,2,,4,,1,,<=0.06,,0.25,,2,,2,,1,,,,,,,,>8,,,,16,
315315,1773203,ATLAS,Pseudomonas aeruginosa,Pseudomonadaceae,Male,65 to 84 Years,Spain,2018,Respiratory System,>16,,>16,,64,,,,16,,16,,>8,,>16,,0.5,,4,,4,,2,,,,,,,,>8,,,,>32,
315316,1773204,ATLAS,Pseudomonas aeruginosa,Pseudomonadaceae,Female,85 and Over,Spain,2018,Skin/Integumentary System,>16,,>16,,>64,,,,32,,16,,8,,2,,>4,,>8,,16,,8,,,,,,,,>8,,,,>32,
315317,1773205,ATLAS,Pseudomonas aeruginosa,Pseudomonadaceae,Female,85 and Over,Spain,2018,Skin/Integumentary System,>16,,>16,,>64,,,,32,,16,,2,,2,,>4,,>8,,16,,8,,,,,,,,>8,,,,>32,
315318,1773206,ATLAS,Pseudomonas aeruginosa,Pseudomonadaceae,Male,85 and Over,Spain,2018,Skin/Integumentary System,>16,,>16,,64,,,,16,,2,,2,,0.25,,<=0.12,,0.5,,8,,2,,,,,,,,8,,,,16,


In [48]:
streptococcaceae_without_pneumoniae_df.head()

Unnamed: 0,Isolate Number,Data Source,Species,Family,Gender,Age Group,Country,Year,Source of Infection,Amoxicillin-clavulanate,Amoxicillin-clavulanate_I,Ampicillin,Ampicillin_I,Piperacillin tazobactam,Piperacillin tazobactam_I,Ceftriaxone,Ceftriaxone_I,Ceftazidime,Ceftazidime_I,Cefepime,Cefepime_I,Imipenem,Imipenem_I,Meropenem,Meropenem_I,Ciprofloxacin,Ciprofloxacin_I,Levofloxacin,Levofloxacin_I,Amikacin,Amikacin_I,Gentamicin,Gentamicin_I,Vancomycin,Vancomycin_I,Azithromycin,Azithromycin_I,Clarithromycin,Clarithromycin_I,Tigecycline,Tigecycline_I,Linezolid,Linezolid_I,Trimethoprim-sulfamethoxazole,Trimethoprim-sulfamethoxazole_I
315412,1773292,ATLAS,Streptococcus agalactiae,Streptococcaceae,Female,19 to 64 Years,Spain,2018,Skin/Integumentary System,,,,,<=0.25,,0.06,,,,,,,,0.12,,,,1.0,,,,,,0.5,,,,,,0.03,,1.0,,,
315413,1773293,ATLAS,Streptococcus pyogenes,Streptococcaceae,Female,65 to 84 Years,Spain,2018,Skin/Integumentary System,,,,,<=0.25,,0.03,,,,,,,,<=0.03,,,,1.0,,,,,,0.5,,,,,,0.03,,0.5,,,
315414,1773294,ATLAS,Streptococcus pyogenes,Streptococcaceae,Male,65 to 84 Years,Spain,2018,Skin/Integumentary System,,,,,<=0.25,,0.03,,,,,,,,<=0.03,,,,0.5,,,,,,0.25,,,,,,0.03,,1.0,,,
315415,1773295,ATLAS,Streptococcus pyogenes,Streptococcaceae,Female,65 to 84 Years,Spain,2018,Skin/Integumentary System,,,,,<=0.25,,<=0.015,,,,,,,,<=0.03,,,,0.5,,,,,,0.5,,,,,,0.03,,1.0,,,
315416,1773296,ATLAS,Streptococcus dysgalactiae,Streptococcaceae,Female,19 to 64 Years,Spain,2018,Skin/Integumentary System,,,,,<=0.25,,0.03,,,,,,,,<=0.03,,,,1.0,,,,,,0.25,,,,,,0.03,,1.0,,,


In [49]:
streptococcus_pneumoniae_df.head()

Unnamed: 0,Isolate Number,Data Source,Species,Family,Gender,Age Group,Country,Year,Source of Infection,Amoxicillin-clavulanate,Amoxicillin-clavulanate_I,Ampicillin,Ampicillin_I,Piperacillin tazobactam,Piperacillin tazobactam_I,Ceftriaxone,Ceftriaxone_I,Ceftazidime,Ceftazidime_I,Cefepime,Cefepime_I,Imipenem,Imipenem_I,Meropenem,Meropenem_I,Ciprofloxacin,Ciprofloxacin_I,Levofloxacin,Levofloxacin_I,Amikacin,Amikacin_I,Gentamicin,Gentamicin_I,Vancomycin,Vancomycin_I,Azithromycin,Azithromycin_I,Clarithromycin,Clarithromycin_I,Tigecycline,Tigecycline_I,Linezolid,Linezolid_I,Trimethoprim-sulfamethoxazole,Trimethoprim-sulfamethoxazole_I
315423,1773302,ATLAS,Streptococcus pneumoniae,Streptococcaceae,Female,19 to 64 Years,Spain,2018,Respiratory System,,,,,<=0.25,,<=0.015,,,,,,,,<=0.03,,,,1,,,,,,0.25,,,,,,0.015,,1,,,
315424,1773303,ATLAS,Streptococcus pneumoniae,Streptococcaceae,Male,19 to 64 Years,Spain,2018,Respiratory System,,,,,<=0.25,,<=0.015,,,,,,,,<=0.03,,,,1,,,,,,0.25,,,,,,0.015,,1,,,
315425,1773304,ATLAS,Streptococcus pneumoniae,Streptococcaceae,Female,65 to 84 Years,Spain,2018,Respiratory System,,,,,<=0.25,,<=0.015,,,,,,,,<=0.03,,,,1,,,,,,0.25,,,,,,0.03,,1,,,
315426,1773305,ATLAS,Streptococcus pneumoniae,Streptococcaceae,Female,0 to 2 Years,Spain,2018,Respiratory System,,,,,<=0.25,,0.03,,,,,,,,<=0.03,,,,1,,,,,,0.25,,,,,,0.015,,1,,,
315427,1773306,ATLAS,Streptococcus pneumoniae,Streptococcaceae,Male,19 to 64 Years,Spain,2018,Respiratory System,,,,,0.5,,0.12,,,,,,,,0.12,,,,1,,,,,,0.5,,,,,,0.015,,1,,,


In [50]:
enterococcaceae_df.head()

Unnamed: 0,Isolate Number,Data Source,Species,Family,Gender,Age Group,Country,Year,Source of Infection,Amoxicillin-clavulanate,Amoxicillin-clavulanate_I,Ampicillin,Ampicillin_I,Piperacillin tazobactam,Piperacillin tazobactam_I,Ceftriaxone,Ceftriaxone_I,Ceftazidime,Ceftazidime_I,Cefepime,Cefepime_I,Imipenem,Imipenem_I,Meropenem,Meropenem_I,Ciprofloxacin,Ciprofloxacin_I,Levofloxacin,Levofloxacin_I,Amikacin,Amikacin_I,Gentamicin,Gentamicin_I,Vancomycin,Vancomycin_I,Azithromycin,Azithromycin_I,Clarithromycin,Clarithromycin_I,Tigecycline,Tigecycline_I,Linezolid,Linezolid_I,Trimethoprim-sulfamethoxazole,Trimethoprim-sulfamethoxazole_I
315346,1773232,ATLAS,Enterococcus faecalis,Enterococcaceae,Male,65 to 84 Years,Spain,2018,Circulatory/Cardiovascular System,,,1,,,,,,,,,,,,,,,,1,,,,16,,2,,,,,,0.06,,1,,>2,
315347,1773233,ATLAS,Enterococcus faecalis,Enterococcaceae,Male,65 to 84 Years,Spain,2018,Circulatory/Cardiovascular System,,,1,,,,,,,,,,,,,,,,1,,,,8,,2,,,,,,0.03,,2,,<=0.03,
315348,1773234,ATLAS,Enterococcus faecalis,Enterococcaceae,Male,19 to 64 Years,Spain,2018,Other,,,2,,,,,,,,,,,,,,,,>4,,,,>16,,1,,,,,,0.06,,2,,>2,
315349,1773235,ATLAS,Enterococcus faecalis,Enterococcaceae,Female,19 to 64 Years,Spain,2018,Circulatory/Cardiovascular System,,,1,,,,,,,,,,,,,,,,1,,,,8,,2,,,,,,0.03,,2,,0.06,
315350,1773236,ATLAS,Enterococcus faecalis,Enterococcaceae,Female,19 to 64 Years,Spain,2018,Skin/Integumentary System,,,1,,,,,,,,,,,,,,,,1,,,,16,,1,,,,,,0.06,,2,,0.06,


In [51]:
pasteurellaceae_df.head()

Unnamed: 0,Isolate Number,Data Source,Species,Family,Gender,Age Group,Country,Year,Source of Infection,Amoxicillin-clavulanate,Amoxicillin-clavulanate_I,Ampicillin,Ampicillin_I,Piperacillin tazobactam,Piperacillin tazobactam_I,Ceftriaxone,Ceftriaxone_I,Ceftazidime,Ceftazidime_I,Cefepime,Cefepime_I,Imipenem,Imipenem_I,Meropenem,Meropenem_I,Ciprofloxacin,Ciprofloxacin_I,Levofloxacin,Levofloxacin_I,Amikacin,Amikacin_I,Gentamicin,Gentamicin_I,Vancomycin,Vancomycin_I,Azithromycin,Azithromycin_I,Clarithromycin,Clarithromycin_I,Tigecycline,Tigecycline_I,Linezolid,Linezolid_I,Trimethoprim-sulfamethoxazole,Trimethoprim-sulfamethoxazole_I
315263,1773155,ATLAS,Haemophilus influenzae,Pasteurellaceae,Female,85 and Over,Spain,2018,Respiratory System,2,,>8,,<=0.03,,<=0.03,,0.06,,,,,,0.12,,,,0.03,,,,,,,,1,,,,0.06,,,,,
315264,1773157,ATLAS,Haemophilus influenzae,Pasteurellaceae,Male,3 to 12 Years,Spain,2018,Respiratory System,<=0.5,,<=0.5,,<=0.03,,<=0.03,,<=0.03,,,,,,0.03,,,,0.03,,,,,,,,1,,,,0.06,,,,,
315265,1773158,ATLAS,Haemophilus influenzae,Pasteurellaceae,Female,3 to 12 Years,Spain,2018,Respiratory System,2,,1,,<=0.03,,0.06,,0.12,,,,,,0.25,,,,0.015,,,,,,,,1,,,,0.06,,,,,
315266,1773159,ATLAS,Haemophilus influenzae,Pasteurellaceae,Male,19 to 64 Years,Spain,2018,Respiratory System,2,,1,,<=0.03,,<=0.03,,0.12,,,,,,0.12,,,,0.015,,,,,,,,1,,,,0.12,,,,,
315268,1773160,ATLAS,Haemophilus influenzae,Pasteurellaceae,Male,3 to 12 Years,Spain,2018,Respiratory System,<=0.5,,<=0.5,,<=0.03,,<=0.03,,0.12,,,,,,0.06,,,,0.015,,,,,,,,1,,,,0.06,,,,,


In [52]:
staphylococcaceae_df.head()


Unnamed: 0,Isolate Number,Data Source,Species,Family,Gender,Age Group,Country,Year,Source of Infection,Amoxicillin-clavulanate,Amoxicillin-clavulanate_I,Ampicillin,Ampicillin_I,Piperacillin tazobactam,Piperacillin tazobactam_I,Ceftriaxone,Ceftriaxone_I,Ceftazidime,Ceftazidime_I,Cefepime,Cefepime_I,Imipenem,Imipenem_I,Meropenem,Meropenem_I,Ciprofloxacin,Ciprofloxacin_I,Levofloxacin,Levofloxacin_I,Amikacin,Amikacin_I,Gentamicin,Gentamicin_I,Vancomycin,Vancomycin_I,Azithromycin,Azithromycin_I,Clarithromycin,Clarithromycin_I,Tigecycline,Tigecycline_I,Linezolid,Linezolid_I,Trimethoprim-sulfamethoxazole,Trimethoprim-sulfamethoxazole_I
315368,1773252,ATLAS,Staphylococcus epidermidis,Staphylococcaceae,Male,65 to 84 Years,Spain,2018,Skin/Integumentary System,,,>8,,,,,,,,,,,,,,,,>4,,,,>16,,1,,,,,,0.25,,1,,>2,
315369,1773253,ATLAS,Staphylococcus epidermidis,Staphylococcaceae,Female,65 to 84 Years,Spain,2018,Skin/Integumentary System,,,<=0.25,,,,,,,,,,,,,,,,0.25,,,,<=1,,2,,,,,,0.12,,1,,0.25,
315370,1773254,ATLAS,Staphylococcus epidermidis,Staphylococcaceae,Male,19 to 64 Years,Spain,2018,Gastrointestinal System,,,>8,,,,,,,,,,,,,,,,>4,,,,>16,,2,,,,,,0.12,,1,,>2,
315371,1773255,ATLAS,Staphylococcus epidermidis,Staphylococcaceae,Female,19 to 64 Years,Spain,2018,Skin/Integumentary System,,,>8,,,,,,,,,,,,,,,,>4,,,,>16,,2,,,,,,0.12,,>8,,>2,
315372,1773256,ATLAS,Staphylococcus epidermidis,Staphylococcaceae,Female,65 to 84 Years,Spain,2018,Skin/Integumentary System,,,>8,,,,,,,,,,,,,,,,>4,,,,>16,,2,,,,,,0.12,,>8,,>2,


In [53]:
# Define the file path where the CSV files will be saved
file_path = r'../../data/processed'

# Export each DataFrame to CSV format in the specified file path
enterobacteriaceae_df.to_csv(f'{file_path}/enterobacteriaceae_df.csv', index=False)
pseudomonadaceae_df.to_csv(f'{file_path}/pseudomonadaceae_df.csv', index=False)
streptococcaceae_without_pneumoniae_df.to_csv(f'{file_path}/streptococcaceae_without_pneumoniae_df.csv', index=False)
streptococcus_pneumoniae_df.to_csv(f'{file_path}/streptococcus_pneumoniae_df.csv', index=False)
enterococcaceae_df.to_csv(f'{file_path}/enterococcaceae_df.csv', index=False)
pasteurellaceae_df.to_csv(f'{file_path}/pasteurellaceae_df.csv', index=False)
staphylococcaceae_df.to_csv(f'{file_path}/staphylococcaceae_df.csv', index=False)
