In [4]:
!pip install biopython xgboost umap-learn matplotlib seaborn scikit-learn

Collecting biopython
  Downloading biopython-1.86-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Downloading biopython-1.86-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m109.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scikit-learn, biopython
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-l

In [5]:
!pip install fair-esm biopython transformers

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0


In [6]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO

import torch
import esm

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import plotly.express as px


In [7]:
import pandas as pd

metadata_path = "/kaggle/input/card-data/aro_index.tsv"

df = pd.read_csv(metadata_path, sep="\t")
print(df.columns)

Index(['ARO Accession', 'CVTERM ID', 'Model Sequence ID', 'Model ID',
       'Model Name', 'ARO Name', 'Protein Accession', 'DNA Accession',
       'AMR Gene Family', 'Drug Class', 'Resistance Mechanism',
       'CARD Short Name'],
      dtype='object')


In [8]:
metadata_path = "/kaggle/input/card-data/aro_index.tsv"

df_meta = pd.read_csv(metadata_path, sep="\t", usecols=[
    "ARO Accession",
    "ARO Name",
    "Protein Accession",
    "AMR Gene Family",
    "Drug Class",
    "Resistance Mechanism"
])

# Convert ARO Accession into pure numeric ID (extract digits)
df_meta["ARO Accession"] = (
    df_meta["ARO Accession"]
    .astype(str)
    .str.extract(r"(\d+)")
)

df_meta.dropna(subset=["ARO Accession"], inplace=True)
df_meta["ARO Accession"] = df_meta["ARO Accession"].astype(str)


In [9]:
fasta_path = "/kaggle/input/card-data/protein_fasta_protein_homolog_model.fasta"

records = list(SeqIO.parse(fasta_path, "fasta"))

import re

seqs = []

for r in records:
    # Extract ARO numeric ID from header
    match = re.search(r"ARO:(\d+)", r.description)
    if match:
        aro = match.group(1)
        seqs.append({"ARO Accession": aro, "sequence": str(r.seq)})

df_seq = pd.DataFrame(seqs)

df = df_meta.merge(df_seq, on="ARO Accession", how="inner")
df.head()
print(df.shape)



(6054, 7)


In [10]:
from Bio import SeqIO

fasta_path = "/kaggle/input/card-data/protein_fasta_protein_homolog_model.fasta"

for r in SeqIO.parse(fasta_path, "fasta"):
    print(r.id)
    print(r.description)
    break


gb|ACT97415.1|ARO:3002999|CblA-1
gb|ACT97415.1|ARO:3002999|CblA-1 [mixed culture bacterium AX_gF3SD01_15]


In [11]:
print(df_meta["ARO Accession"].head())
print(df_meta["ARO Accession"].dtype)


0    3005099
1    3002523
2    3002524
3    3002525
4    3002526
Name: ARO Accession, dtype: object
object


In [12]:
import re

records = list(SeqIO.parse(fasta_path, "fasta"))

seqs = []

for r in records:
    desc = r.description

    # Try extracting different possible ARO accession formats
    aro_numeric = re.findall(r"ARO:?_?(\d+)", desc)
    aro_pipe = re.findall(r"(\d+)\|", desc)
    aro_before_space = r.id.split(".")[0]  # part before dot

    if aro_numeric:
        aro = aro_numeric[0]
    elif aro_pipe:
        aro = aro_pipe[0]
    else:
        aro = aro_before_space

    seqs.append({"ARO Accession": aro, "sequence": str(r.seq)})

df_seq = pd.DataFrame(seqs)

print(df_seq.head())
print(df_seq.shape)


  ARO Accession                                           sequence
0       3002999  MKAYFIAILTLFTCIATVVRAQQMSELENRIDSLLNGKKATVGIAV...
1       3001109  MRYIRLCIISLLAALPLAVHASPQPLEQIKQSESQLSGRVGMIEMD...
2       3002867  MIGLIVARSKNNVIGKNGNIPWKIKGEQKQFRELTTGNVVIMGRKS...
3       3001989  MVTKRVQRMMFAAAACIPLLLGSAPLYAQTSAVQQKLAALEKSSGG...
4       3002356  MELPNIMHPVAKLSTALAAALMLSGCMPGEIRPTIGQQMETGDQRF...
(6052, 2)


In [13]:
df = df_meta.merge(df_seq, on="ARO Accession", how="inner")

print(df.shape)
df.head()


(6053, 7)


Unnamed: 0,ARO Accession,ARO Name,Protein Accession,AMR Gene Family,Drug Class,Resistance Mechanism,sequence
0,3005099,23S rRNA (adenine(2058)-N(6))-methyltransferas...,AAB60941.1,Erm 23S ribosomal RNA methyltransferase,lincosamide antibiotic;macrolide antibiotic;st...,antibiotic target alteration,MKQKNPKNTQNFITSKKHVKEILKYTNINKQDKIIEIGSGKGHFTK...
1,3002523,AAC(2')-Ia,AAA03550.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,MGIEYRSLHTSQLTLSEKEALYDLLIEGFEGDFSHDDFAHTLGGMH...
2,3002524,AAC(2')-Ib,AAC44793.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,MPFQDVSAPVRGGILHTARLVHTSDLDQETREGARRMVIEAFEGDF...
3,3002525,AAC(2')-Ic,CCP42991.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,MHTQVHTARLVHTADLDSETRQDIRQMVTGAFAGDFTETDWEHTLG...
4,3002526,AAC(2')-Id,AAB41701.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,MLTQHVSEARTRGAIHTARLIHTSDLDQETRDGARRMVIEAFRDPS...


In [28]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
model = model.to(device)
batch_converter = alphabet.get_batch_converter()

embeddings = []

BATCH_SIZE = 16  # adjust based on GPU memory

sequence_list = list(df["sequence"])

for i in range(0, len(sequence_list), BATCH_SIZE):
    batch_seqs = sequence_list[i : i + BATCH_SIZE]
    batch = [("protein", seq) for seq in batch_seqs]

    labels, strs, tokens = batch_converter(batch)
    tokens = tokens.to(device)

    with torch.no_grad():
        results = model(tokens, repr_layers=[33])

    reps = results["representations"][33].cpu().numpy()
    reps_mean = reps.mean(axis=1)  # mean pool

    embeddings.extend(reps_mean)

df["embedding"] = embeddings
df.to_pickle("card_embeddings.pkl")

print("✅ Saved embeddings to card_embeddings.pkl")


✅ Saved embeddings to card_embeddings.pkl


In [29]:
df.head()


Unnamed: 0,ARO Accession,ARO Name,Protein Accession,AMR Gene Family,Drug Class,Resistance Mechanism,sequence,embedding
0,3005099,23S rRNA (adenine(2058)-N(6))-methyltransferas...,AAB60941.1,Erm 23S ribosomal RNA methyltransferase,lincosamide antibiotic;macrolide antibiotic;st...,antibiotic target alteration,MKQKNPKNTQNFITSKKHVKEILKYTNINKQDKIIEIGSGKGHFTK...,"[-0.0021094177, -0.03240475, -0.0039766748, 0...."
1,3002523,AAC(2')-Ia,AAA03550.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,MGIEYRSLHTSQLTLSEKEALYDLLIEGFEGDFSHDDFAHTLGGMH...,"[0.0049595386, -0.07888554, 0.09245716, 0.0065..."
2,3002524,AAC(2')-Ib,AAC44793.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,MPFQDVSAPVRGGILHTARLVHTSDLDQETREGARRMVIEAFEGDF...,"[0.016762396, -0.048971687, 0.057476453, -0.02..."
3,3002525,AAC(2')-Ic,CCP42991.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,MHTQVHTARLVHTADLDSETRQDIRQMVTGAFAGDFTETDWEHTLG...,"[-0.016126482, -0.06911608, 0.04928137, -0.044..."
4,3002526,AAC(2')-Id,AAB41701.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,MLTQHVSEARTRGAIHTARLIHTSDLDQETRDGARRMVIEAFRDPS...,"[-0.0071755834, -0.011755409, 0.05042654, -0.0..."


In [None]:
kmeans = KMeans(n_clusters=4, random_state=42)
df["cluster"] = kmeans.fit_predict(embed_matrix)

In [None]:
pca = PCA(2)
embed_2d = pca.fit_transform(embed_matrix)

df["PC1"] = embed_2d[:,0]
df["PC2"] = embed_2d[:,1]

px.scatter(
    df, x="PC1", y="PC2",
    color="cluster",
    hover_data=["ARO Name", "Drug Class"]
)

In [None]:
df["target"] = LabelEncoder().fit_transform(df["Model Type"])

X_train, X_test, y_train, y_test = train_test_split(
    embed_matrix, df["target"], test_size=0.2, random_state=42
)

model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)

pred = model_rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))


In [35]:
df.to_csv("ARG_mobility_results.csv", index=False)
print("✅ Saved all results.")

✅ Saved all results.
