<a href="https://colab.research.google.com/github/shivesh2334-ai/Antimicrobial-resistance-predictor/blob/main/Antimicrobial_resistance_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Resistance prediction workflow using scikit-learn and XGBoost

## Step 1: Generate Dummy Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef
from xgboost import XGBClassifier

# Seed the global NumPy RNG so the synthetic cohort is identical on every fresh run.
np.random.seed(42)

n = 1000  # sample size
species = ['E. coli', 'Klebsiella spp.', 'Proteus spp.', 'Pseudomonas spp.', 'Acinetobacter spp.']
diseases = ['CHF', 'CKD', 'Tumor', 'Diabetes']

# Synthetic data: columns are drawn one after another, in a fixed order, so the
# sequence of RNG calls (and therefore the generated values) stays stable.
columns = {}
columns['Age'] = np.random.randint(18, 90, n)
columns['Gender'] = np.random.choice(['Male', 'Female'], n)
columns['Species'] = np.random.choice(species, n)
columns['Rectal_CPE_Pos'] = np.random.choice([0, 1], n, p=[0.7, 0.3])
columns['Setting'] = np.random.choice(['ICU', 'Internal Medicine'], n)
columns['Acquisition'] = np.random.choice(['Community', 'Hospital'], n)
columns['BSI_Source'] = np.random.choice(['Primary', 'Lung', 'Abdomen', 'UTI'], n)
# Binary comorbidity / risk flags, each an independent 0/1 draw.
for flag in diseases + ['Immunosuppressed']:
    columns[flag] = np.random.choice([0, 1], n)

df = pd.DataFrame(columns)

# Simulated resistance labels, derived deterministically from the features
# (swap in different rules here to change the simulated biology).
is_klebsiella = df['Species'].eq('Klebsiella spp.')
is_pseudomonas = df['Species'].eq('Pseudomonas spp.')
cpe_positive = df['Rectal_CPE_Pos'].eq(1)

df['CR'] = (is_klebsiella & cpe_positive).astype(int)
df['BLBLI_R'] = (is_pseudomonas | df['CKD'].eq(1)).astype(int)
df['FQR'] = (cpe_positive & df['Immunosuppressed'].eq(1)).astype(int)
df['3GC_R'] = df['Age'].gt(65).astype(int)

In [None]:
# Optional: show the first ten rows as a captioned table with shaded, bold headers.
header_style = {
    'selector': 'th',
    'props': [('background-color', '#f2f2f2'), ('font-weight', 'bold')],
}
preview = df.head(10).style
preview = preview.set_caption("Sample Rows from Resistance Prediction Dataset")
# The styler is the cell's last expression so the notebook renders it.
preview.set_table_styles([header_style])

In [None]:
# Resistance labels to model; every remaining column of `df` is a predictor.
y_vars = ['CR', 'BLBLI_R', 'FQR', '3GC_R']
X = df.drop(columns=['CR', 'BLBLI_R', 'FQR', '3GC_R'])

# One-hot encode categorical variables (first level dropped as the reference).
X_encoded = pd.get_dummies(X, drop_first=True)

# label -> {'AUROC', 'F1-score', 'MCC'} measured on the held-out 20% split.
results = {}

for y_col in y_vars:
    y = df[y_col]
    # Pin random_state: without it the split is drawn from the global NumPy RNG,
    # so re-running this cell (or running it after other random draws) silently
    # changes the train/test partition and every reported metric.
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y, test_size=0.2, stratify=y, random_state=42
    )

    # NOTE(review): `use_label_encoder` appears to be deprecated/ignored in recent
    # XGBoost releases — confirm against the installed version before removing it.
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)

    preds = model.predict(X_test)                 # hard 0/1 predictions
    probs = model.predict_proba(X_test)[:, 1]     # probability of the positive class

    results[y_col] = {
        'AUROC': roc_auc_score(y_test, probs),
        'F1-score': f1_score(y_test, preds),
        'MCC': matthews_corrcoef(y_test, preds)
    }

In [None]:
# One row per resistance label, one column per metric.
results_df = pd.DataFrame(results).transpose()
rounded_results = results_df.round(3)
print(rounded_results)

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, roc_auc_score, f1_score, matthews_corrcoef
import numpy as np
import pandas as pd

# --- Relies on `df` built in the data-generation cell ---
y_column = 'CR'  # You can loop over ['CR', 'FQR', '3GC_R', 'BLBLI_R']
y = df[y_column]
X = df.drop(columns=['CR', 'FQR', '3GC_R', 'BLBLI_R'])

# --- Split predictors by dtype so each group gets its own transformer ---
categorical = list(X.select_dtypes(include='object').columns)
numerical = list(X.select_dtypes(include=['int', 'float']).columns)

# --- Preprocessing: scale numeric columns, one-hot encode categorical ones ---
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ]
)

# --- Candidate classifiers ---
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# --- Metrics collected on every fold ---
scoring = {
    'AUROC': 'roc_auc',
    'F1': make_scorer(f1_score),
    'MCC': make_scorer(matthews_corrcoef),
}

# --- Stratified 5-fold cross-validation, same seeded folds for every model ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {}

for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    scores = cross_validate(pipeline, X, y, cv=cv, scoring=scoring)

    # Average each metric over the folds; cross_validate prefixes keys with 'test_'.
    results[name] = {metric: np.mean(scores[f'test_{metric}']) for metric in scoring}

# --- Results Summary ---
results_df = pd.DataFrame(results).T.round(3)
print("📊 Model Performance on Predicting", y_column)
print(results_df)

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, roc_auc_score, f1_score, matthews_corrcoef
import numpy as np
import pandas as pd

# Metric definitions (kept for reference; the nested CV below scores on AUROC only)
scoring = {
    'AUROC': 'roc_auc',
    'F1': make_scorer(f1_score),
    'MCC': make_scorer(matthews_corrcoef),
}

# Predictors are everything except the four resistance labels
y_targets = ['CR', 'FQR', '3GC_R', 'BLBLI_R']
X = df.drop(columns=y_targets)
categorical = list(X.select_dtypes(include='object').columns)
numerical = list(X.select_dtypes(include=['int', 'float']).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ]
)

# Each entry pairs an estimator with the hyper-parameter grid searched for it;
# the 'classifier__' prefix targets the pipeline step of that name.
model_grids = {
    'LogisticRegression': (
        LogisticRegression(max_iter=1000),
        {'classifier__C': [0.01, 0.1, 1, 10]},
    ),
    'RandomForest': (
        RandomForestClassifier(random_state=42),
        {
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [5, 10, None],
        },
    ),
}

# Outer loop of the nested cross-validation
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

nested_results = {}

# One nested-CV run per (label, model) pair
for label in y_targets:
    y = df[label]
    per_model = nested_results[label] = {}

    for name, (model, param_grid) in model_grids.items():
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

        # Inner 3-fold grid search tunes hyper-parameters; the outer folds score the tuned search.
        grid = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=3)
        scores = cross_val_score(grid, X, y, cv=outer_cv, scoring='roc_auc')

        per_model[name] = {
            'Mean AUROC': round(scores.mean(), 3),
            'Std Dev': round(scores.std(), 3),
        }

In [None]:
# Show the nested cross-validation summary built in the previous cell:
# a mapping of resistance label -> model name -> {'Mean AUROC', 'Std Dev'}.
nested_results

In [None]:
!pip install -q gradio

import gradio as gr
import pandas as pd
import numpy as np

from sklearn.base import clone

# Requires 'df', 'X', 'y_targets', 'preprocessor', 'model_grids' and 'Pipeline'
# from the previous cells.

# Cross-validation only fits models on subsets of the data, so for the Gradio app
# one pipeline per (label, model) pair is refit here on the full dataset.
#
# Each pipeline gets its OWN clone of the preprocessor and classifier.
# Pipeline.fit() fits its steps in place, so reusing the shared instances from
# `model_grids` would leave every stored pipeline pointing at the same classifier,
# fitted on whichever label was processed last.
#
# For simplicity the estimators keep the hyper-parameters they were constructed
# with; `param_grid` is not used here. In a real scenario, use the best
# parameters found by GridSearchCV instead.
trained_models = {}
for label in y_targets:
    y = df[label]
    for name, (model, param_grid) in model_grids.items():
        pipeline = Pipeline([
            ('preprocessor', clone(preprocessor)),
            ('classifier', clone(model))
        ])
        pipeline.fit(X, y)
        trained_models[f'{label}_{name}'] = pipeline


def predict_resistance(age, gender, species, rectal_cpe_pos, setting, acquisition, bsi_source, chf, ckd, tumor, diabetes, immunosuppressed):
    """Predict resistance for one patient with every trained (label, model) pipeline.

    Parameters
    ----------
    age, gender, species, rectal_cpe_pos, setting, acquisition, bsi_source,
    chf, ckd, tumor, diabetes, immunosuppressed
        Single values for the correspondingly named training columns
        ('Age', 'Gender', 'Species', 'Rectal_CPE_Pos', ...).

    Returns
    -------
    str
        A "Resistance Predictions:" header followed by one
        "- <label>_<model>: Resistant|Sensitive" line per trained pipeline,
        or a message naming the missing fields when any input is None.
    """
    # Column names must match the frame the pipelines were fitted on.
    features = {
        'Age': age,
        'Gender': gender,
        'Species': species,
        'Rectal_CPE_Pos': rectal_cpe_pos,
        'Setting': setting,
        'Acquisition': acquisition,
        'BSI_Source': bsi_source,
        'CHF': chf,
        'CKD': ckd,
        'Tumor': tumor,
        'Diabetes': diabetes,
        'Immunosuppressed': immunosuppressed,
    }

    # An unselected UI component arrives as None; report it instead of letting
    # the pipeline fail with an opaque error on missing values.
    missing = [column for column, value in features.items() if value is None]
    if missing:
        return "Missing input for: " + ", ".join(missing) + ". Please fill in every field."

    # Single-row DataFrame in the layout the pipelines expect.
    input_data = pd.DataFrame({column: [value] for column, value in features.items()})

    predictions = {}
    for label in y_targets:
        for name in model_grids:
            key = f'{label}_{name}'
            # Use the pipeline trained on the full dataset for this label/model pair.
            pred = trained_models[key].predict(input_data)[0]
            predictions[key] = "Resistant" if pred == 1 else "Sensitive"

    # Format the output
    output_string = "Resistance Predictions:\n"
    for label_name, prediction in predictions.items():
        output_string += f"- {label_name}: {prediction}\n"

    return output_string

# Dropdown options are the distinct values observed in the original dataframe
gender_choices, species_choices, setting_choices, acquisition_choices, bsi_source_choices = (
    df[column].unique().tolist()
    for column in ('Gender', 'Species', 'Setting', 'Acquisition', 'BSI_Source')
)

# Input components, listed in the same order as predict_resistance's parameters
input_components = [
    gr.Slider(minimum=18, maximum=90, step=1, label="Age"),
    gr.Dropdown(choices=gender_choices, label="Gender"),
    gr.Dropdown(choices=species_choices, label="Species"),
    gr.Radio(choices=[0, 1], label="Rectal CPE Pos"),
    gr.Dropdown(choices=setting_choices, label="Setting"),
    gr.Dropdown(choices=acquisition_choices, label="Acquisition"),
    gr.Dropdown(choices=bsi_source_choices, label="BSI Source"),
]
# The remaining inputs are all 0/1 radio buttons
input_components += [
    gr.Radio(choices=[0, 1], label=flag_label)
    for flag_label in ("CHF", "CKD", "Tumor", "Diabetes", "Immunosuppressed")
]

# Create the Gradio interface
interface = gr.Interface(
    fn=predict_resistance,
    inputs=input_components,
    outputs="text",
    title="Antibiotic Resistance Prediction",
    description="Enter patient data to predict antibiotic resistance.",
)

# Launch the interface (debug=True keeps the cell running so errors and logs stay visible)
interface.launch(debug=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://3dae9ad9f7bf1dda8b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
