# Transformers on SMILES

In [13]:
from transformers import pipeline
import pandas as pd
import torch
from sklearn.metrics import cohen_kappa_score

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from torch.utils.data import Dataset
from transformers import RobertaTokenizer

from transformers import RobertaForSequenceClassification

from transformers import TrainingArguments, Trainer

from sklearn.metrics import classification_report

In [None]:
# Read the full training table and pull out the SMILES strings and class labels.
data = pd.read_csv("data/train.csv")
smiles, y = data["smiles"], data["class"]

### Split data into training and testing sets

In [None]:
# The split is loaded from two CSV files instead of being recomputed here
# (presumably an 80% / 20% split, judging by the file names -- TODO confirm).
# Reference split call, kept commented out:
# train_smiles, test_smiles, train_labels, test_labels = train_test_split(smiles, y, test_size=0.2, random_state=42)

train80 = pd.read_csv("train_data_80.csv")
train2O = pd.read_csv("train_data_20.csv")  # NOTE: name ends with the letter "O", not a zero

# Training inputs/targets come from the first file, hold-out inputs/targets from the second.
train_smiles, train_labels = train80["smiles"], train80["class"]
test_smiles, test_labels = train2O["smiles"], train2O["class"]

### Convert to a dictionary format for Hugging Face's Trainer

In [None]:
# train_data = {"smiles": train_smiles.tolist(), "labels": train_labels.tolist()}
# test_data = {"smiles": test_smiles.tolist(), "labels": test_labels.tolist()}

### Metric computation function (`compute_metrics`)

In [None]:
def compute_metrics(pred):
    """Compute classification metrics from a prediction object.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``
        and ``cohen_kappa``. Precision, recall and F1 use
        ``average="binary"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    # The fourth returned element (support) is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average="binary")
    for metric_name, value in zip(("precision", "recall", "f1"), prf[:3]):
        scores[metric_name] = value
    scores["cohen_kappa"] = cohen_kappa_score(y_true, y_pred)
    return scores

### SMILESDataset class and tokenizer

In [None]:
# Load the tokenizer published with the "seyonec/ChemBERTa-zinc-base-v1" checkpoint
# (the same checkpoint name is used below for the classification model).
tokenizer = RobertaTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")


class SMILESDataset(Dataset):
    """Torch dataset that tokenizes SMILES strings on access.

    Args:
        smiles_list: pandas Series of SMILES strings. Its index is reset so
            items can be addressed by position ``0..len-1``.
        labels: pandas Series of integer class labels aligned row-by-row with
            ``smiles_list``, or ``None`` for unlabeled (inference-only) data.
        tokenizer: callable invoked as ``tokenizer(smiles, max_length=...,
            truncation=True, padding="max_length", return_tensors="pt")`` and
            returning a mapping of tensors with a leading batch dimension.
        max_length: length every sequence is padded/truncated to
            (default 64, the value previously hard-coded).

    Raises:
        ValueError: if ``labels`` is given and its length differs from
            ``smiles_list``.
    """

    def __init__(self, smiles_list, labels, tokenizer, max_length=64):
        self.smiles_list = smiles_list.reset_index(drop=True)
        self.labels = None if labels is None else labels.reset_index(drop=True)
        if self.labels is not None and len(self.labels) != len(self.smiles_list):
            raise ValueError(
                f"smiles_list and labels must have the same length, "
                f"got {len(self.smiles_list)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.smiles_list[idx],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a leading batch dimension of 1; drop it.
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


In [None]:
# Wrap the training and hold-out SMILES/label Series so they can be passed
# to the Trainer as train_dataset / eval_dataset.
train_dataset = SMILESDataset(train_smiles, train_labels, tokenizer)
test_dataset = SMILESDataset(test_smiles, test_labels, tokenizer)

### Load pre-trained ChemBERTa and configure it for classification

In [None]:
# Load the ChemBERTa checkpoint as a sequence classifier with two output labels
# (compute_metrics uses average="binary", which matches a two-class setup).
model = RobertaForSequenceClassification.from_pretrained(
    "seyonec/ChemBERTa-zinc-base-v1",
    num_labels=2
)

### Set up the trainer

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.005,
    logging_dir="./logs",
    logging_steps=50,
)


In [None]:
# Bundle the model, training configuration, datasets and metric function.
# The hold-out set is registered as eval_dataset; the tokenizer argument is
# intentionally left disabled (the dataset already returns padded tensors).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    #tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### Train the model

In [None]:
# Run fine-tuning on train_dataset with the settings from training_args.
trainer.train()

## Evaluate and calculate metrics

### Calculate metrics


In [None]:
# Predict on the hold-out dataset; the predicted class is the argmax over the
# last axis of the returned scores.
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(-1)

# Compare against the hold-out labels.
# NOTE(review): assumes predictions come back in the same row order as test_labels -- confirm.
print(classification_report(test_labels, predicted_labels))
print("Cohen's Kappa Score:", cohen_kappa_score(test_labels, predicted_labels))


### Predicted probabilities on the hold-out split (`train_data_20.csv`)


In [None]:
import numpy as np

# Scores returned by trainer.predict for the hold-out set: one row per sample,
# one column per class. They are treated as raw logits (not probabilities).
logits = torch.tensor(predictions.predictions)
if logits.ndim != 2 or logits.shape[1] < 2:
    raise ValueError(
        f"Expected scores of shape (n_samples, n_classes >= 2), got {tuple(logits.shape)}"
    )

# Always normalise with softmax over the class axis and keep the class-1 column.
# The previous fallback branch exported the raw column 1 unchanged whenever the
# number of columns was not 2, which would have written logits as if they were
# probabilities.
proba_toxicite = torch.softmax(logits, dim=1).numpy()[:, 1]

# One row per hold-out molecule, in the same order as test_smiles.
df_proba = pd.DataFrame({
    "smiles": test_smiles.values,
    "proba_toxicite": proba_toxicite
})

df_proba.to_csv("smiles_proba.csv", index=False)

In [None]:
# Sanity check: the exported SMILES column matches the hold-out SMILES order.
# NOTE(review): df_proba["smiles"] was built from test_smiles.values, so this
# holds by construction and does not independently verify prediction order.
assert all(df_proba["smiles"].values == test_smiles.values)

### Prediction on test_1.csv 

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, cohen_kappa_score

# === 1. Define the dataset for SMILES strings ===
# Same dataset as in the training section; presumably repeated here so this
# cell can run on its own (it also re-imports everything it needs).
class SMILESDataset(Dataset):
    """Torch dataset that tokenizes SMILES strings on access.

    Args:
        smiles_list: pandas Series of SMILES strings. Its index is reset so
            items can be addressed by position ``0..len-1``.
        labels: pandas Series of integer class labels aligned row-by-row with
            ``smiles_list``, or ``None`` for unlabeled (inference-only) data.
        tokenizer: callable invoked as ``tokenizer(smiles, max_length=...,
            truncation=True, padding="max_length", return_tensors="pt")`` and
            returning a mapping of tensors with a leading batch dimension.
        max_length: length every sequence is padded/truncated to
            (default 64, the value previously hard-coded).

    Raises:
        ValueError: if ``labels`` is given and its length differs from
            ``smiles_list``.
    """

    def __init__(self, smiles_list, labels, tokenizer, max_length=64):
        self.smiles_list = smiles_list.reset_index(drop=True)
        self.labels = None if labels is None else labels.reset_index(drop=True)
        if self.labels is not None and len(self.labels) != len(self.smiles_list):
            raise ValueError(
                f"smiles_list and labels must have the same length, "
                f"got {len(self.smiles_list)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.smiles_list[idx],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a leading batch dimension of 1; drop it.
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# === 2. Charger le tokenizer et le modèle entraîné ===
# === 2. Load the tokenizer and the fine-tuned model ===
tokenizer = RobertaTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
# NOTE(review): the checkpoint step (590) is hard-coded; it must match a
# checkpoint directory actually written under ./results by the training run.
model = RobertaForSequenceClassification.from_pretrained("./results/checkpoint-590")


# === 3. Prepare the test_1.csv file ===
new_data = pd.read_csv("data/test_1.csv")
new_smiles = new_data["smiles"]

# Placeholder labels (all zeros): they are only supplied because the dataset
# is constructed with a labels Series; they carry no information.
dummy_labels = pd.Series([0] * len(new_smiles))
new_dataset = SMILESDataset(new_smiles, dummy_labels, tokenizer)

# === 4. Configure the Trainer for inference only ===
training_args = TrainingArguments(
    output_dir="./results",  # Arbitrary path
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    do_train=False,  # No training
    do_eval=False
)

trainer = Trainer(
    model=model,
    args=training_args,
)

# === 5. Predictions ===
predictions = trainer.predict(new_dataset)
logits = predictions.predictions

# Convert the scores to probabilities. Softmax is always applied over the
# class axis; the previous fallback branch exported the raw column 1 unchanged
# whenever the number of columns was not 2, i.e. logits labelled as probabilities.
logits_tensor = torch.tensor(logits)
if logits_tensor.ndim != 2 or logits_tensor.shape[1] < 2:
    raise ValueError(
        f"Expected scores of shape (n_samples, n_classes >= 2), got {tuple(logits_tensor.shape)}"
    )
proba_toxicite = torch.softmax(logits_tensor, dim=1).numpy()[:, 1]

# === 6. Export the results ===
df_resultats = pd.DataFrame({
    "smiles": new_smiles,
    "proba_toxicite": proba_toxicite
})

df_resultats.to_csv("test_1_predictions.csv", index=False)
print("✅ Prédictions sauvegardées dans test_1_predictions.csv")




✅ Prédictions sauvegardées dans test_1_predictions.csv
