In [None]:
import pandas as pd
# First attempt at loading the export, assuming a semicolon-delimited table.
# With sep=';' the frame comes back with a single "Filters" column (see the
# info() output), which signals that the file is not a plain semicolon table
# and needs a closer look at its raw text.
df = pd.read_csv("Time series of resistance to antibiotics (2018-2023)_All-BLOOD.csv", sep=';')
# Only the last expression of a cell is rendered automatically; a bare
# df.head() in the middle of the cell would be computed and thrown away,
# so the preview is sent through display() explicitly.
display(df.head())
# info() prints its summary itself
df.info()
# Last expression: rendered as the cell output
df.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Filters  146 non-null    object
dtypes: object(1)
memory usage: 1.3+ KB


Unnamed: 0,Filters
count,146
unique,146
top,Region filter: All
freq,1


In [None]:
# This helps us see the raw structure of the file
from itertools import islice

with open(
    "Time series of resistance to antibiotics (2018-2023)_All-BLOOD.csv",
    "r",
    encoding="utf-8"
) as f:
    # Look only at the first 30 lines of the file.
    # This is enough to see:
    # - metadata / filters
    # - header rows
    # - actual column names
    # islice stops at the end of the file, so a file shorter than 30 lines
    # does not produce a tail of empty numbered lines.
    for i, line in enumerate(islice(f, 30)):
        # Each line still carries its own newline; strip it so print() does
        # not emit a blank line after every row.
        text = line.rstrip("\r\n")
        # The 0-based number in front of each line is the physical line index,
        # which is what is needed to work out where the real data starts.
        print(f"{i}: {text}")


0: Filters

1: Region filter: All

2: Infection type filter: Bloodstream

3: Bacterial pathogen filter: Acinetobacter spp.

4: Antibiotic filter: Amikacin

5: 

6: Data for time series

7: "Year","Specimen","PathogenName","AbTargets","Min","Q1","Median","Q3","Max"

8: 2018,"BLOOD","Acinetobacter spp.","Amikacin",0,23.07830182,46.499079378,66.346688926,92.366412214

9: 2019,"BLOOD","Acinetobacter spp.","Amikacin",0,9.097096189,40.547945205,70.042078057,91.111111111

10: 2020,"BLOOD","Acinetobacter spp.","Amikacin",0,13.333333333,56.221198157,76.61097852,94.827586207

11: 2021,"BLOOD","Acinetobacter spp.","Amikacin",0,16.75,46.366353743,73.485989126,98.113207547

12: 2022,"BLOOD","Acinetobacter spp.","Amikacin",0,13.443830571,49.570166799,74.852941176,100

13: 2023,"BLOOD","Acinetobacter spp.","Amikacin",0,12.220580435,47.823678379,66.596818454,94.418604651

14: 

15: 

16: Data for line plots for individual CTAs

17: "Iso3","CountryTerritoryArea","WHORegionName","Year","Specimen","Patho

In [None]:
import pandas as pd
# Read the CSV again, this time starting at the per-country table.
# - skip the first 17 lines (metadata + notes); in the raw-text preview of the
#   file the header row beginning with "Iso3" sits on line 17 (0-based),
#   so it becomes the first line pandas parses
# - use comma as separator
df = pd.read_csv(
    "Time series of resistance to antibiotics (2018-2023)_All-BLOOD.csv",
    sep=",",
    skiprows=17
)
# Only the last expression of a cell is rendered automatically, so the
# preview of the freshly loaded frame has to go through display().
display(df.head())
df.info()
# Drop columns that are not useful for ML
# NOTE(review): presumably the count columns are removed because
# PercentResistant could be derived from them - confirm with the author.
df = df.drop(
    columns=[
        "Iso3",
        "Specimen",
        "TotalSpecimenIsolates",
        "InterpretableAST",
        "Resistant"
    ]
)

# Check remaining columns
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Iso3                   132 non-null    object 
 1   CountryTerritoryArea   132 non-null    object 
 2   WHORegionName          132 non-null    object 
 3   Year                   132 non-null    int64  
 4   Specimen               132 non-null    object 
 5   PathogenName           132 non-null    object 
 6   AbTargets              132 non-null    object 
 7   TotalSpecimenIsolates  132 non-null    int64  
 8   InterpretableAST       132 non-null    int64  
 9   Resistant              132 non-null    int64  
 10  PercentResistant       132 non-null    float64
dtypes: float64(1), int64(4), object(6)
memory usage: 11.5+ KB


Unnamed: 0,CountryTerritoryArea,WHORegionName,Year,PathogenName,AbTargets,PercentResistant
0,Argentina,Region of the Americas,2018,Acinetobacter spp.,Amikacin,22.478386
1,Bosnia and Herzegovina,European Region,2018,Acinetobacter spp.,Amikacin,92.366412
2,Brazil,Region of the Americas,2018,Acinetobacter spp.,Amikacin,40.625
3,Ethiopia,African Region,2018,Acinetobacter spp.,Amikacin,26.666667
4,Georgia,European Region,2018,Acinetobacter spp.,Amikacin,71.428571


In [None]:
# Function to convert percentage to resistance category
def resistance_level(percent, low_threshold=30, high_threshold=60):
    """Map a resistance percentage to a category label.

    Parameters
    ----------
    percent : float or None
        Percentage of resistant isolates.
    low_threshold : float, default 30
        Values strictly below this are labelled "Low".
    high_threshold : float, default 60
        Values up to and including this (and not "Low") are labelled
        "Medium"; anything above is labelled "High".

    Returns
    -------
    str or None
        "Low", "Medium" or "High"; None when ``percent`` is missing
        (None or NaN).
    """
    # NaN compares False against every number, so without this guard a missing
    # value would fail both comparisons below and be labelled "High".
    # `percent != percent` is True only for NaN.
    if percent is None or percent != percent:
        return None
    if percent < low_threshold:
        return "Low"
    if percent <= high_threshold:
        return "Medium"
    return "High"

# Build the target column by labelling every percentage value
resistance_labels = df["PercentResistant"].map(resistance_level)
df["ResistanceLevel"] = resistance_labels
# Show the source value next to its label to check the mapping
df.loc[:, ["PercentResistant", "ResistanceLevel"]].head()


Unnamed: 0,PercentResistant,ResistanceLevel
0,22.478386,Low
1,92.366412,High
2,40.625,Medium
3,26.666667,Low
4,71.428571,High


In [None]:
# ResistanceLevel was computed from PercentResistant, so keeping the raw
# percentage among the features would leak the target into the inputs.
# errors='ignore' lets the cell be re-run once the column is already gone.
leaky_columns = ["PercentResistant"]
df = df.drop(labels=leaky_columns, axis="columns", errors='ignore')
# Confirm which columns remain
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   CountryTerritoryArea  132 non-null    object
 1   WHORegionName         132 non-null    object
 2   Year                  132 non-null    int64 
 3   PathogenName          132 non-null    object
 4   AbTargets             132 non-null    object
 5   ResistanceLevel       132 non-null    object
dtypes: int64(1), object(5)
memory usage: 6.3+ KB


In [None]:
# Separate the label column from the predictor columns
target_column = "ResistanceLevel"
# Target (output)
y = df[target_column]
# Features (inputs): every column except the target
X = df.drop(columns=[target_column])
# Both shapes should report the same number of rows
(X.shape, y.shape)


((132, 5), (132,))

In [None]:
#one hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column groups: the text-valued columns are one-hot encoded,
# the numeric year is forwarded unchanged.
numeric_cols = ["Year"]
categorical_cols = ["CountryTerritoryArea", "WHORegionName", "PathogenName", "AbTargets"]

# handle_unknown="ignore": a category that was not seen during fit is encoded
# as all zeros at transform time instead of raising an error.
one_hot = OneHotEncoder(handle_unknown="ignore")
transformer_steps = [
    ("cat", one_hot, categorical_cols),
    ("num", "passthrough", numeric_cols),
]
preprocessor = ColumnTransformer(transformers=transformer_steps)

# Fit the transformer on the feature frame and encode it in one step.
# NOTE(review): this fits on all rows, i.e. before the train/test split
# performed later in the notebook.
X_encoded = preprocessor.fit_transform(X)
X_encoded.shape


(132, 31)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# The position of a label in classes_ is the integer it was assigned
list(label_encoder.classes_)


['High', 'Low', 'Medium']

In [None]:
# Report the shape of the encoded features and of the encoded target
for caption, encoded in (
    ("Encoded feature shape:", X_encoded),
    ("Encoded target shape:", y_encoded),
):
    print(caption, encoded.shape)
# Peek at the first ten encoded labels
y_encoded[:10]


Encoded feature shape: (132, 31)
Encoded target shape: (132,)


array([1, 0, 2, 1, 0, 0, 1, 2, 1, 0])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
# - stratify on the encoded labels so both parts keep a similar class balance
# - random_state fixes the shuffle so the split is reproducible
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y_encoded,
}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, **split_options)
# Shapes of the four pieces, in the order they were returned
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((105, 31), (27, 31), (105,), (27,))

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize the model
# The `multi_class` argument is intentionally not passed: it is deprecated in
# recent scikit-learn releases (FutureWarning, scheduled for removal).
# With the lbfgs solver and more than two classes the default behaviour is
# already the multinomial (softmax) formulation, so the fitted model is the
# same 3-class classifier.
# max_iter is raised above the library default to give lbfgs room to converge;
# random_state is kept for reproducibility.
model = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    random_state=42
)

# Train the model
model.fit(X_train, y_train)




In [None]:
# Run the fitted classifier on the held-out features
y_pred = model.predict(X_test)

# Show the first few predicted class ids as a sanity check
preview_size = 10
y_pred[:preview_size]


array([1, 0, 0, 2, 2, 2, 0, 2, 0, 2])

In [None]:
# Evaluation on the held-out test set
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Integer class ids in encoder order (LabelEncoder numbers classes_ 0..n-1).
# Passing them explicitly keeps the report rows and the confusion-matrix axes
# aligned with label_encoder.classes_ even if some class happens to be absent
# from y_test / y_pred; without `labels`, classification_report raises when
# the number of observed classes differs from len(target_names).
class_ids = list(range(len(label_encoder.classes_)))

# 1. Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# 2. Classification report (per-class precision / recall / f1)
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_
    )
)

# 3. Confusion matrix: rows are true classes, columns are predicted classes,
#    both in the order of label_encoder.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
conf_matrix


Test Accuracy: 0.7037037037037037
              precision    recall  f1-score   support

        High       0.82      0.75      0.78        12
         Low       1.00      0.57      0.73         7
      Medium       0.50      0.75      0.60         8

    accuracy                           0.70        27
   macro avg       0.77      0.69      0.70        27
weighted avg       0.77      0.70      0.71        27



array([[9, 0, 3],
       [0, 4, 3],
       [2, 0, 6]])

In [None]:
import joblib

# Persist everything needed to score new data later:
# the trained model plus the fitted preprocessing objects.
artifacts = [
    (model, "amr_resistance_model.pkl"),
    (preprocessor, "amr_preprocessor.pkl"),
    (label_encoder, "amr_label_encoder.pkl"),
]
# joblib.dump returns the list of file names it wrote for each object
written_files = [joblib.dump(artifact, filename) for artifact, filename in artifacts]
# Show the result of the final dump as the cell output
written_files[-1]


['amr_label_encoder.pkl']