# Antimicrobial Resistance Data Exploration

**Objective:** Initial exploration and quality assessment of antimicrobial susceptibility testing (AST) data from 2025.

This analysis examines resistance patterns to inform empiric therapy selection and antimicrobial stewardship initiatives.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Install a blanket "ignore" filter so warnings do not clutter cell output.
# NOTE(review): this hides every warning category, including pandas and
# matplotlib deprecation notices — confirm that is intended.
warnings.filterwarnings('ignore')

# Configure display
pd.set_option('display.max_columns', None)  # no cap on the number of columns shown
pd.set_option('display.precision', 2)  # floats are displayed with 2 decimal places
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
# Seaborn theme and colour palette used by the plots in this notebook
sns.set_style('whitegrid')
sns.set_palette('Set2')

## 1. Data Loading and Initial Assessment

In [None]:
# Read the raw AST workbook into a DataFrame
df = pd.read_excel('../data/raw/amr_data_2025.xlsx')

# Report the dataset dimensions and the span of sample collection dates
n_records, n_variables = df.shape
collection_dates = df['Sample Collection Date']

print(f"Total records: {n_records}")
print(f"Total variables: {n_variables}")
print(f"\nData collection period: {collection_dates.min()} to {collection_dates.max()}")

In [None]:
# Antibiotic result columns are either named "<CODE> - <Name>" or carry a
# NET_ / MET_ prefix
antibiotic_cols = [
    col for col in df.columns
    if ' - ' in col or col.startswith(('NET_', 'MET_'))
]
print(f"Number of antibiotics tested: {len(antibiotic_cols)}")
print(f"\nAntibiotics in dataset:")
for position, column_name in enumerate(antibiotic_cols, start=1):
    # Show the readable drug name: the part after " - ", or the raw column
    # name with underscores turned into spaces
    if ' - ' in column_name:
        ab_name = column_name.split(' - ')[1]
    else:
        ab_name = column_name.replace('_', ' ')
    print(f"{position}. {ab_name}")

## 2. Patient Demographics

In [None]:
# Age distribution
# Keep only rows with a non-negative age; rows with a missing age also fail
# the comparison and are dropped.
df_clean_age = df[df['Age (years)'] >= 0].copy()
ages = df_clean_age['Age (years)']
median_age = ages.median()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
age_ax, gender_ax = axes

# Left panel: age histogram with the median marked by a dashed line
age_ax.hist(ages, bins=20, edgecolor='black', alpha=0.7)
age_ax.set_xlabel('Age (years)')
age_ax.set_ylabel('Frequency')
age_ax.set_title('Age Distribution of Patients')
age_ax.axvline(median_age, color='red', linestyle='--',
               label=f'Median: {median_age:.0f} years')
age_ax.legend()

# Right panel: gender counts, with the count printed above each bar
gender_counts = df['Gender'].value_counts()
gender_ax.bar(gender_counts.index, gender_counts.values, alpha=0.7)
gender_ax.set_xlabel('Gender')
gender_ax.set_ylabel('Count')
gender_ax.set_title('Gender Distribution')
for bar_pos, bar_height in enumerate(gender_counts.values):
    gender_ax.text(bar_pos, bar_height + 2, str(bar_height),
                   ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('../reports/figures/demographics.png', dpi=300, bbox_inches='tight')
plt.show()

# Text summary of the same information
print(f"Age statistics (n={len(df_clean_age)}):")
print(ages.describe())
print(f"\nGender distribution:")
print(gender_counts)
# The denominator is every record, including those with no gender recorded
female_count = (df['Gender'] == 'Female').sum()
print(f"Female proportion: {female_count / len(df) * 100:.1f}%")

In [None]:
# Create age groups for clinical relevance.
# Bins are left-closed, [lower, upper), so each patient lands in the group
# whose label matches their age in completed years:
#   - age 0 (newborns) is Pediatric instead of falling outside the first bin,
#   - age 18 starts Young Adult rather than being counted as "<18",
#   - the open-ended last bin keeps patients older than 100 in Elderly.
age_bin_edges = [0, 18, 36, 51, 66, np.inf]
age_bin_labels = ['Pediatric (<18)', 'Young Adult (18-35)',
                  'Middle Age (36-50)', 'Older Adult (51-65)',
                  'Elderly (>65)']
df_clean_age['Age_Group'] = pd.cut(df_clean_age['Age (years)'],
                                   bins=age_bin_edges,
                                   labels=age_bin_labels,
                                   right=False)

# Counts per group in label order; percentages are relative to all rows that
# have a valid age
age_group_dist = df_clean_age['Age_Group'].value_counts().sort_index()
print("\nAge group distribution:")
for group, count in age_group_dist.items():
    pct = count/len(df_clean_age)*100
    print(f"{group}: {count} ({pct:.1f}%)")

## 3. Sample Characteristics

In [None]:
# Count records per sample type (most frequent first)
sample_dist = df['Sample Type'].value_counts()
total_samples = len(df)
bar_positions = range(len(sample_dist))

plt.figure(figsize=(10, 6))
bars = plt.bar(bar_positions, sample_dist.values, alpha=0.7)
plt.xticks(bar_positions, sample_dist.index)
plt.xlabel('Sample Type')
plt.ylabel('Number of Samples')
plt.title('Distribution of Sample Types')

# Annotate each bar with its count and its share of all records
for x_pos, sample_count in zip(bar_positions, sample_dist.tolist()):
    share = sample_count / total_samples * 100
    plt.text(x_pos, sample_count + 2, f'{sample_count}\n({share:.1f}%)',
             ha='center', fontweight='bold')

plt.savefig('../reports/figures/sample_types.png', dpi=300, bbox_inches='tight')
plt.show()

print("Clinical note: Urinary tract infections represent the majority of samples,")
print("which aligns with their high prevalence in outpatient and inpatient settings.")

## 4. Organism Distribution

In [None]:
# Tally isolates per identified organism (most frequent first)
organisms = df['Organism Identified'].value_counts()
total_isolates = organisms.sum()

plt.figure(figsize=(12, 6))
bars = plt.barh(organisms.index, organisms.values, alpha=0.7)
plt.xlabel('Number of Isolates')
plt.ylabel('Organism')
plt.title(f'Distribution of Bacterial Isolates (n={total_isolates})')
# Flip the axis so the most frequent organism is drawn at the top
plt.gca().invert_yaxis()

# Label each bar with its count and share of all identified isolates
for row_pos, isolate_count in enumerate(organisms.values):
    share = isolate_count / total_isolates * 100
    plt.text(isolate_count + 1, row_pos, f'{isolate_count} ({share:.1f}%)', va='center')

plt.tight_layout()
plt.savefig('../reports/figures/organism_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop 5 organisms:")
for organism_name, isolate_count in organisms.head().items():
    share = isolate_count / total_isolates * 100
    print(f"{organism_name}: {isolate_count} ({share:.1f}%)")

In [None]:
# Classify organisms by Gram stain
gram_negative = ['E. coli', 'Klebsiella', 'Proteus', 'Pseudomonas']
gram_positive = ['S. aureus', 'Staphylococcus', 'Streptococcus']

# Lookup table from exact organism name to Gram class. Gram-negative entries
# are merged last so they take precedence if a name were ever in both lists.
gram_lookup = {
    **dict.fromkeys(gram_positive, 'Gram-positive'),
    **dict.fromkeys(gram_negative, 'Gram-negative'),
}


def _gram_class(organism):
    """Return the Gram class for an organism name, or 'Other/Unknown'."""
    return gram_lookup.get(organism, 'Other/Unknown')


df['Gram_Type'] = df['Organism Identified'].apply(_gram_class)

gram_dist = df['Gram_Type'].value_counts()
n_records = len(df)
print("\nGram classification:")
for gram_type, n_isolates in gram_dist.items():
    share = n_isolates / n_records * 100
    print(f"{gram_type}: {n_isolates} ({share:.1f}%)")

plt.figure(figsize=(8, 6))
plt.pie(gram_dist.values, labels=gram_dist.index, autopct='%1.1f%%', startangle=90)
plt.title('Distribution by Gram Stain')
plt.savefig('../reports/figures/gram_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Data Quality Assessment

In [None]:
# Check missing data in key variables
key_vars = ['Age (years)', 'Gender', 'Sample Type', 'Organism Identified'] + antibiotic_cols

# One row per variable: number and percentage of missing values,
# sorted with the most incomplete variables first
missing_counts = [df[var].isna().sum() for var in key_vars]
missing_summary = pd.DataFrame({
    'Variable': key_vars,
    'Missing_Count': missing_counts,
    'Missing_Percentage': [count/len(df)*100 for count in missing_counts]
}).sort_values('Missing_Percentage', ascending=False)

print("Variables with >50% missing data:")
high_missing = missing_summary[missing_summary['Missing_Percentage'] > 50]
print(high_missing.head(10))

# Count only the rows that are antibiotic columns. Subtracting a fixed 4 would
# be wrong whenever the demographic/sample variables are not themselves >50%
# missing, and could even produce a negative count.
n_high_missing_antibiotics = int(high_missing['Variable'].isin(antibiotic_cols).sum())
print(f"\nTotal antibiotics with >50% missing: {n_high_missing_antibiotics}")

In [None]:
# Completeness of antibiotic testing: number of non-missing results per
# antibiotic column
ab_completeness = {col: df[col].notna().sum() for col in antibiotic_cols}

ab_complete_df = pd.DataFrame(list(ab_completeness.items()),
                              columns=['Antibiotic', 'Tests_Performed']).sort_values('Tests_Performed', ascending=False)

print("\nMost frequently tested antibiotics:")
print(ab_complete_df.head(15))

# Filter antibiotics with sufficient data for analysis (at least 30 tests)
sufficient_data_abs = ab_complete_df[ab_complete_df['Tests_Performed'] >= 30]['Antibiotic'].tolist()
# The "≥" sign was previously stored as mis-decoded bytes and printed as garbage
print(f"\nAntibiotics with ≥30 tests (sufficient for analysis): {len(sufficient_data_abs)}")

## 6. Resistance Result Categories

In [None]:
# Standardize resistance categories
def categorize_result(result):
    """Map a raw AST result to 'Resistant', 'Sensitive' or 'Intermediate'.

    The value is upper-cased and split into alphabetic words, and whole words
    are matched. Matching on single letters as substrings is unsafe: the
    letter "R" occurs inside "INTERMEDIATE", so spelled-out intermediate
    results would be counted as resistant, and free text such as "NOT TESTED"
    would be counted as sensitive.

    Recognised words: R / RESISTANT, S / SENSITIVE / SUSCEPTIBLE,
    I / INTERMEDIATE. If several categories appear in one value, the first
    match in the order Resistant, Sensitive, Intermediate is returned.

    Args:
        result: Raw cell value from an antibiotic column.

    Returns:
        The standardized category string, or ``np.nan`` when the value is
        missing or contains none of the recognised words.
    """
    if pd.isna(result):
        return np.nan

    # Replace every non-letter with a space so "R (Resistant)" or "S/I" split
    # into clean words
    letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in str(result).upper())
    words = set(letters_only.split())

    if words & {'R', 'RESISTANT'}:
        return 'Resistant'
    if words & {'S', 'SENSITIVE', 'SUSCEPTIBLE'}:
        return 'Sensitive'
    if words & {'I', 'INTERMEDIATE'}:
        return 'Intermediate'
    return np.nan

# Add a "<column>_Cat" companion column with the standardized category for
# every antibiotic column
for ab_col in antibiotic_cols:
    df[ab_col + '_Cat'] = df[ab_col].apply(categorize_result)

# Spot-check a single antibiotic
example_ab = 'CIP - Ciprofloxacin'
print(f"Example: {example_ab}")
example_categories = df[example_ab + '_Cat']
print(example_categories.value_counts())
# Resistance rate = resistant results / results with a recognised category
n_resistant = (example_categories == 'Resistant').sum()
n_categorized = example_categories.notna().sum()
print(f"\nResistance rate: {n_resistant / n_categorized * 100:.1f}%")

## 7. Overall Resistance Overview

In [None]:
# Pool every categorized result from all antibiotic columns into one list
all_results = [
    category
    for col in antibiotic_cols
    for category in df[col + '_Cat'].dropna().tolist()
]

overall_dist = pd.Series(all_results).value_counts()
total_tests = len(all_results)

print(f"Total AST results analyzed: {total_tests}")
print(f"\nOverall distribution:")
for category, n_results in overall_dist.items():
    share = n_results / total_tests * 100
    print(f"{category}: {n_results} ({share:.1f}%)")

# Share of all pooled results that are resistant
resistant_total = overall_dist.get('Resistant', 0)
overall_resistance_rate = (resistant_total / total_tests * 100)
print(f"\n** Overall resistance rate: {overall_resistance_rate:.1f}% **")

# Anything above 40% is flagged as high; everything else as moderate
if overall_resistance_rate > 40:
    interpretation = 'HIGH - Concerning level of resistance'
else:
    interpretation = 'MODERATE - Requires monitoring'
print(f"\nInterpretation: {interpretation}")

In [None]:
# Bar chart of the pooled result categories
plt.figure(figsize=(10, 6))
colors = {'Resistant': '#d62728', 'Sensitive': '#2ca02c', 'Intermediate': '#ff7f0e'}
# Categories without a dedicated colour are drawn in gray
bar_colors = [colors.get(category, 'gray') for category in overall_dist.index]
bar_positions = range(len(overall_dist))

bars = plt.bar(bar_positions, overall_dist.values, color=bar_colors, alpha=0.7)
plt.xticks(bar_positions, overall_dist.index)
plt.ylabel('Number of Tests')
plt.title('Overall Distribution of Antimicrobial Susceptibility Test Results')

# Print the count and percentage above each bar
for x_pos, n_results in zip(bar_positions, overall_dist.tolist()):
    share = n_results / total_tests * 100
    plt.text(x_pos, n_results + 50, f'{n_results}\n({share:.1f}%)',
             ha='center', fontweight='bold')

plt.savefig('../reports/figures/overall_resistance_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Data Cleaning and Export

In [None]:
# Form-platform bookkeeping columns that should not be exported;
# any that are absent from the data are skipped
system_cols = ['_id', '_uuid', '_submission_time', '_validation_status', '_notes',
               '_status', '_submitted_by', '__version__', '_tags', '_index']

# Work on a copy so the in-memory DataFrame keeps all of its columns
df_export = df.copy().drop(columns=system_cols, errors='ignore')

# Write the cleaned dataset to CSV without the row index
cleaned_path = '../data/processed/amr_data_2025_cleaned.csv'
df_export.to_csv(cleaned_path, index=False)
print("Cleaned dataset saved to: data/processed/amr_data_2025_cleaned.csv")
print(f"Shape: {df_export.shape}")

## Summary

### Key Findings from Exploratory Analysis:

1. **Dataset Overview**
   - 183 clinical isolates with susceptibility testing
   - 53 different antibiotics tested
   - Data collected in 2025

2. **Patient Demographics**
   - Predominantly female patients (81.9%)
   - Mean age: 37 years
   - Age range: newborn to 85 years

3. **Sample Characteristics**
   - Majority urine samples (86.3%) - consistent with the dataset's focus on urinary tract infections (UTIs)
   - Small proportion of wound swabs and sputum

4. **Organism Profile**
   - Gram-negative bacteria predominate (54%)
   - Klebsiella most common (33.7%)
   - E. coli second (17.2%)
   - Substantial number of Gram-positive isolates (S. aureus, Staphylococcus, Streptococcus)

5. **Overall Resistance**
   - Concerning resistance levels observed
   - Requires detailed antibiotic-specific analysis

### Next Steps:
- Detailed resistance prevalence by antibiotic
- Organism-specific resistance patterns
- Multi-drug resistance identification
- Treatment recommendations based on susceptibility data