In [None]:
import torch
import pandas as pd
from torch import nn, optim
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
model_name = "seyonec/ChemBERTa-zinc-base-v1"
#model_name = "entropy/gpt2_zinc_87m"
#model_name = "ncfrey/ChemGPT-4.7M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/501 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/9.43k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/179M [00:00<?, ?B/s]

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [None]:
class Discriminator(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden_dim=256):
        super(Discriminator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()  # Outputs probability
        )

    def forward(self, x):
        # x: Batch of tokenized SMILES strings
        emb = self.embedding(x)  # Convert tokens to embeddings
        _, (hidden, _) = self.lstm(emb)  # Use the final hidden state
        output = self.fc(hidden[-1])  # Pass through FC layers
        return output

In [None]:
discriminator = Discriminator(vocab_size=len(tokenizer)) # Discriminator
d_optimizer = optim.Adam(discriminator.parameters(), lr=0.01) # Dicriminator Optimizer
g_optimizer = optim.Adam(generator.parameters(), lr=0.1) # Generator Optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss

# Load real data
real_data = pd.read_csv('Processed_Anti_Inflammatory_Compounds.csv')
real_data = real_data[real_data['anti_inflammatory'] == 1]
real_smiles = real_data['isosmiles'].tolist()

# Tokenize real SMILES
real_tokens = tokenizer(real_smiles, return_tensors="pt",
                        padding=True, truncation=True, max_length=100)['input_ids']


In [None]:
import time

# Training loop
epochs = 50
for epoch in range(epochs):
    start_time = time.time()
    temperature = max(1.0 - 0.02 * epoch, 0.5)  # Gradually reduce temperature

    # 1. Generate SMILES with the generator
    inputs = torch.tensor([[tokenizer.bos_token_id]])  # Start token
    num_fake_samples = min(16, real_tokens.size(0))
    generated = generator.generate(
          inputs,
          do_sample=True,
          max_length=100,
          temperature=0.7,
          pad_token_id=tokenizer.pad_token_id,
          num_return_sequences=num_fake_samples
          )
    fake_smiles = tokenizer.batch_decode(generated, skip_special_tokens=True)

    # Tokenize fake SMILES
    fake_tokens = tokenizer(fake_smiles, return_tensors="pt",
                            padding=True, truncation=True, max_length=100)['input_ids']

    # Labels
    real_labels = torch.ones(real_tokens.size(0), 1)
    fake_labels = torch.zeros(fake_tokens.size(0), 1)

    # 2. Train Discriminator
    discriminator.train()
    d_optimizer.zero_grad() # Reset the gradients to zero

    # Forward pass for real and fake data
    real_preds = discriminator(real_tokens)
    fake_preds = discriminator(fake_tokens.detach())

    # Compute discriminator loss
    d_loss_real = criterion(real_preds, real_labels)
    d_loss_fake = criterion(fake_preds, fake_labels)
    d_loss = d_loss_real + d_loss_fake

    d_loss.backward()
    d_optimizer.step()

    # 3. Train Generator
    generator.train()
    g_optimizer.zero_grad()

    # Generator tries to fool the discriminator
    fake_preds = discriminator(fake_tokens)
    g_loss = criterion(
        fake_preds, real_labels[:fake_preds.size(0)])  # Match shapes

    g_loss.backward()
    g_optimizer.step()
    end_time = time.time()
    epoch_duration = end_time - start_time
    # 4. Log Progress
    print(
        f"Epoch {epoch+1}/{epochs}, Temperature: {temperature:.2f}, D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}, Time: {epoch_duration:.2f} seconds"
    )

Epoch 1/50, Temperature: 1.00, D Loss: 1.3306, G Loss: 0.9290, Time: 142.92 seconds
Epoch 2/50, Temperature: 0.98, D Loss: 1.3901, G Loss: 0.6795, Time: 149.79 seconds
Epoch 3/50, Temperature: 0.96, D Loss: 1.3432, G Loss: 0.8210, Time: 149.26 seconds
Epoch 4/50, Temperature: 0.94, D Loss: 1.2352, G Loss: 1.1676, Time: 151.39 seconds
Epoch 5/50, Temperature: 0.92, D Loss: 1.2601, G Loss: 1.1601, Time: 150.80 seconds
Epoch 6/50, Temperature: 0.90, D Loss: 1.1939, G Loss: 1.4228, Time: 148.55 seconds
Epoch 7/50, Temperature: 0.88, D Loss: 1.0478, G Loss: 1.8436, Time: 150.10 seconds
Epoch 8/50, Temperature: 0.86, D Loss: 0.9811, G Loss: 2.1851, Time: 149.53 seconds
Epoch 9/50, Temperature: 0.84, D Loss: 0.8500, G Loss: 2.1959, Time: 147.01 seconds
Epoch 10/50, Temperature: 0.82, D Loss: 0.6477, G Loss: 2.2748, Time: 145.63 seconds
Epoch 11/50, Temperature: 0.80, D Loss: 0.7431, G Loss: 1.9815, Time: 153.44 seconds
Epoch 12/50, Temperature: 0.78, D Loss: 0.3850, G Loss: 3.1622, Time: 147.

In [None]:
generator.eval()

RobertaForCausalLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(767, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [None]:
# Paramètres
num_fake_samples = 5  # Nombre d'exemples à générer
max_length = 100  # Longueur maximale des séquences générées
temperature = 0.7  # Température à utiliser pendant la génération (contrôle la diversité)

# 1. Générer avec le générateur
inputs = torch.tensor([[tokenizer.bos_token_id]])  # Le token de début (Bos token)
generated = generator.generate(
    inputs,
    do_sample=True,  # Utilisation de l'échantillonnage pour la génération
    max_length=max_length,  # Longueur maximale des séquences
    temperature=temperature,  # Température pour l'échantillonnage
    pad_token_id=tokenizer.pad_token_id,  # Id du token de padding
    num_return_sequences=num_fake_samples  # Nombre de séquences à générer
)

# 2. Décoder les tokens générés en chaînes de texte SMILES
fake_smiles = tokenizer.batch_decode(generated, skip_special_tokens=True)

# 3. Afficher les SMILES générés
for i, smile in enumerate(fake_smiles):
    print(f"Generated SMILES {i+1}: {smile}")


Generated SMILES 1: CCC CC[[NHNH222cccccccccc22)))(CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC))))))))CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
Generated SMILES 2: 3cc cc3cc22ncncncnc333333333cccccccsc33nn22--22cccccccccc1111111111-ncnccccc((((CC)))))ccc+]+]+]cccccccccccccccccccccccccccccc
Generated SMILES 3: ][CCCc Nc==CC11==NCNC===NCNCNCNCNCNCNC44))))))SCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
Generated SMILES 4: )[() CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC))))))))))CCCCCCCCCCCCCCCCCCCCCCCCC
Generated SMILES 5: 33c cc11ccccccc[OCcOCc+]+]22cccccccccc222)))))))cc11cnncnn22ccccc33cccccccccc))))-22233333cc((CCCccccccccccccccccccccccccccccccccc


In [None]:
# Etape 1: Tokenization des SMILES générés
# Vous avez déjà généré vos SMILES avec `generator.generate(...)`

# Tokenisation des SMILES générés pour les envoyer au discriminateur
fake_tokens = tokenizer(fake_smiles, return_tensors="pt", padding=True, truncation=True, max_length=100)['input_ids']

# Etape 2: Passer les tokens générés dans le discriminateur pour obtenir des prédictions
discriminator.eval()  # Passer le discriminateur en mode évaluation pour éviter le dropout
with torch.no_grad():  # Pas de calcul de gradient pour la prédiction
    fake_preds = discriminator(fake_tokens)

X_result = pd.DataFrame({
    'isosmiles': fake_smiles
})
# Etape 3: Interpréter les résultats
# Les résultats sont des probabilités entre 0 et 1
for i, pred in enumerate(fake_preds):
    print(f"SMILES {i+1}: {fake_smiles[i]}")
    print(f"Prediction (Real/Fake probability): {pred.item():.4f}")


SMILES 1: CCC CC[[NHNH222cccccccccc22)))(CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC))))))))CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
Prediction (Real/Fake probability): 0.0002
SMILES 2: 3cc cc3cc22ncncncnc333333333cccccccsc33nn22--22cccccccccc1111111111-ncnccccc((((CC)))))ccc+]+]+]cccccccccccccccccccccccccccccc
Prediction (Real/Fake probability): 0.0000
SMILES 3: ][CCCc Nc==CC11==NCNC===NCNCNCNCNCNCNC44))))))SCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
Prediction (Real/Fake probability): 0.0021
SMILES 4: )[() CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC))))))))))CCCCCCCCCCCCCCCCCCCCCCCCC
Prediction (Real/Fake probability): 0.0012
SMILES 5: 33c cc11ccccccc[OCcOCc+]+]22cccccccccc222)))))))cc11cnncnn22ccccc33cccccccccc))))-22233333cc((CCCccccccccccccccccccccccccccccccccc
Prediction (Real/Fake probability): 0.0000


In [None]:
!pip install rdkit

from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors
from rdkit import Chem


# Function to calculate molecular descriptors from SMILES

def calculate_molecular_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)

    if mol is None:
        return None  # If the SMILES string is invalid

    descriptors = {}

    # Molecular weight (mw)
    descriptors["mw"] = Descriptors.MolWt(mol)

    # LogP (hydrophobicity, xlogp)
    descriptors["xlogp"] = Crippen.MolLogP(mol)

    # Polar surface area (polararea)
    descriptors["polararea"] = rdMolDescriptors.CalcTPSA(mol)

    # Number of rotatable bonds (rotbonds)
    descriptors["rotbonds"] = Descriptors.NumRotatableBonds(mol)

    # Hydrogen bond donors (hbonddonor)
    descriptors["hbonddonor"] = Descriptors.NumHDonors(mol)

    # Hydrogen bond acceptors (hbondacc)
    descriptors["hbondacc"] = Descriptors.NumHAcceptors(mol)

    return descriptors

Collecting rdkit
  Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.6


In [None]:
def extract_descriptors(row):
    descriptors = calculate_molecular_descriptors(row['isosmiles'])
    if descriptors is not None:
        return pd.Series(descriptors)
    else:
        # Retourner None si la molécule est invalide
        return pd.Series({
            "mw": None,
            "xlogp": None,
            "polararea": None,
            "rotbonds": None,
            "hbonddonor": None,
            "hbondacc": None,
        })

# Appliquer la fonction à chaque ligne et concaténer les résultats avec X_result
descriptors_df = X_result.apply(extract_descriptors, axis=1)
X_result = pd.concat([X_result, descriptors_df], axis=1)

# Afficher les premières lignes pour vérification
X_result.head()

[19:10:26] SMILES Parse Error: syntax error while parsing: 3cc
[19:10:26] SMILES Parse Error: Failed parsing SMILES '3cc' for input: '3cc'
[19:10:26] SMILES Parse Error: syntax error while parsing: ][CCCc
[19:10:26] SMILES Parse Error: Failed parsing SMILES '][CCCc' for input: '][CCCc'
[19:10:26] SMILES Parse Error: syntax error while parsing: )[()
[19:10:26] SMILES Parse Error: Failed parsing SMILES ')[()' for input: ')[()'
[19:10:26] SMILES Parse Error: syntax error while parsing: 33c
[19:10:26] SMILES Parse Error: Failed parsing SMILES '33c' for input: '33c'


Unnamed: 0,isosmiles,mw,xlogp,polararea,rotbonds,hbonddonor,hbondacc
0,CCC CC[[NHNH222cccccccccc22)))(CCCCCCCCCCCCCCC...,44.097,1.4163,0.0,0.0,0.0,0.0
1,3cc cc3cc22ncncncnc333333333cccccccsc33nn22--2...,,,,,,
2,][CCCc Nc==CC11==NCNC===NCNCNCNCNCNCNC44))))))...,,,,,,
3,)[() CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,,,,,
4,33c cc11ccccccc[OCcOCc+]+]22cccccccccc222)))))...,,,,,,


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import joblib


# Step 1: Load the dataset
data = pd.read_csv("Processed_Anti_Inflammatory_Compounds.csv")

# Step 2: Separate features and target
X = data[["mw", "xlogp", "polararea", "rotbonds",
          "hbonddonor", "hbondacc"]]
y = data["anti_inflammatory"]

# Step 3: Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Step 5a: Train Random Forest Classifier
# Train the optimized Random Forest model
optimized_rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42
)

optimized_rf_model.fit(X_train, y_train)


In [None]:
X_test = X_result[["mw", "xlogp", "polararea", "rotbonds",
          "hbonddonor", "hbondacc"]]
probs = optimized_rf_model.predict_proba(X_test)[:, 1]
predictions = (probs >= 0.6).astype(int)
predictions



array([0, 1, 1, 1, 1])

In [None]:
import torch
import numpy as np
from rdkit import Chem
from rdkit.Chem import QED
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem.Fingerprints import FingerprintMols
from sklearn.metrics import pairwise_distances
import pandas as pd

# Fonction pour calculer la validité
def calculate_validity(molecules):
    valid_count = 0
    for mol in molecules:
        if Chem.MolFromSmiles(mol) is not None:
            valid_count += 1
    return valid_count / len(molecules)

# Fonction pour calculer l'originalité
def calculate_originality(generated_molecules, original_molecules):
    unique_molecules = set(generated_molecules) - set(original_molecules)
    return len(unique_molecules) / len(generated_molecules)

# Fonction pour calculer la diversité
def calculate_diversity(molecules):
    fps = [FingerprintMols.FingerprintMol(Chem.MolFromSmiles(mol)) for mol in molecules if Chem.MolFromSmiles(mol) is not None]
    pairwise_similarities = [TanimotoSimilarity(fps[i], fps[j]) for i in range(len(fps)) for j in range(i + 1, len(fps))]
    return 1 - np.mean(pairwise_similarities)

# Fonction pour calculer le drug-likeliness (via QED)
def calculate_drug_likeliness(molecules):
    druglike_count = 0
    for mol in molecules:
        mol_obj = Chem.MolFromSmiles(mol)
        if mol_obj is not None:
            qed_score = QED.qed(mol_obj)
            if qed_score > 0.5:  # Seuil arbitraire pour un drug-likeness raisonnable
                druglike_count += 1
    return druglike_count / len(molecules)


inputs = torch.tensor([[tokenizer.bos_token_id]])  # Le token de début (Bos token)
generated = generator.generate(
    inputs,
    do_sample=True,  # Utilisation de l'échantillonnage pour la génération
    max_length=max_length,  # Longueur maximale des séquences
    temperature=temperature,  # Température pour l'échantillonnage
    pad_token_id=tokenizer.pad_token_id,  # Id du token de padding
    num_return_sequences=num_fake_samples  # Nombre de séquences à générer
)

# 2. Décoder les tokens générés en chaînes de texte SMILES
fake_smiles = tokenizer.batch_decode(generated, skip_special_tokens=True)

# Calculer les métriques
validity = calculate_validity(generated)
diversity = calculate_diversity(generated)
drug_likeliness = calculate_drug_likeliness(generated)

# Afficher les résultats
print(f"Validité: {validity:.2f}")
print(f"Diversité: {diversity:.2f}")
print(f"Drug-likeliness: {drug_likeliness:.2f}")


TypeError: No registered converter was able to produce a C++ rvalue of type std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > from this Python object of type Tensor

In [None]:
from rdkit import Chem
from rdkit.Chem import QED
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import AllChem

# Helper function: Check validity of SMILES
def is_valid_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None

# Helper function: Calculate Tanimoto similarity
def calculate_tanimoto(smiles1, smiles2):
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    if mol1 is None or mol2 is None:
        return 0.0
    fp1 = FingerprintMols.FingerprintMol(mol1)
    fp2 = FingerprintMols.FingerprintMol(mol2)
    return TanimotoSimilarity(fp1, fp2)

# Helper function: Filter molecules by Lipinski's rule of 5
def passes_lipinski(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False
    mw = AllChem.CalcExactMolWt(mol)
    hbd = Chem.Lipinski.NumHDonors(mol)
    hba = Chem.Lipinski.NumHAcceptors(mol)
    logp = Chem.Crippen.MolLogP(mol)
    return mw <= 500 and hbd <= 5 and hba <= 10 and logp <= 5

# Optimize generated molecules
def optimize_synthetic_data(synthetic_smiles, existing_smiles):
    optimized_smiles = []
    diversity_threshold = 0.7  # Minimum Tanimoto similarity to consider a molecule unique

    for smiles in synthetic_smiles:
        if is_valid_smiles(smiles) and passes_lipinski(smiles):
            unique = True
            for existing in existing_smiles:
                similarity = calculate_tanimoto(smiles, existing)
                if similarity >= diversity_threshold:
                    unique = False
                    break
            if unique:
                optimized_smiles.append(smiles)
    return optimized_smiles

# Step 1: Filter and optimize generated molecules
synthetic_compounds = ["C1=CC=C(C=C1)C(=O)O", "CCO", "invalid_smiles", "CCN(CC)CCO"]
existing_compounds = list(compounds)  # Original dataset

optimized_compounds = optimize_synthetic_data(synthetic_compounds, existing_compounds)

print(f"Original synthetic compounds: {len(synthetic_compounds)}")
print(f"Optimized compounds: {len(optimized_compounds)}")

# Step 2: Predicted activity
# Use a pre-trained model to predict the properties of the optimized molecules
# Here, an example placeholder for predictions
def predict_activity(smiles_list, chemberta_model, tokenizer):
    predictions = []
    for smiles in smiles_list:
        inputs = tokenizer(smiles, max_length=MAX_LEN, truncation=True, padding="max_length", return_tensors="pt")
        with torch.no_grad():
            outputs = chemberta_model(**inputs)
        predictions.append(outputs.logits.item())
    return predictions

activity_scores = predict_activity(optimized_compounds, chemberta_model, tokenizer)

# Select molecules with high predicted activity
threshold = 0.8  # Example threshold for predicted activity
final_compounds = [smiles for smiles, score in zip(optimized_compounds, activity_scores) if score >= threshold]

print(f"Final compounds after activity prediction: {len(final_compounds)}")