# **Code1**

In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.6


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

# Define your SMILESDataset class
class SMILESDataset(Dataset):
    def __init__(self, data_file):
        self.data = pd.read_csv(data_file, header=None, names=['smiles'])
        self.data['mol'] = self.data['smiles'].apply(Chem.MolFromSmiles)
        self.data[['logp', 'rotb', 'molwt', 'qed', 'hba', 'hbd']] = self.data['mol'].apply(compute_descriptors)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        smiles = self.data.iloc[idx]['smiles']
        features = self.data.iloc[idx][['logp', 'rotb', 'molwt', 'qed', 'hba', 'hbd']].values.astype(np.float32)
        return smiles, features

# Compute molecular descriptors
def compute_descriptors(mol):
    if mol is None:
        # Return default values if molecule is None
        return pd.Series([0, 0, 0, 0, 0, 0])
    else:
        molwt = Descriptors.MolWt(mol)
        logp = Descriptors.MolLogP(mol)
        hbd = rdMolDescriptors.CalcNumHBD(mol)
        hba = rdMolDescriptors.CalcNumHBA(mol)
        rotb = Descriptors.NumRotatableBonds(mol)
        return pd.Series([logp, rotb, molwt, Descriptors.qed(mol), hba, hbd])



dataset = SMILESDataset('train.smi')


[12:09:03] SMILES Parse Error: extra open parentheses for input: 'Cn1c(N(Cc2ccc(C(=O)N=c3nn[nH][nH]3)cc2)C2CCC(C(C)(C)C)CC2)nc2cc(O'


In [None]:
len(dataset)

63533

In [None]:

# Define the char_to_index dictionary
char_to_index = {
    'C': 0,
    'O': 1,
    'N': 2,
    'H': 3,
    '(': 4,
    ')': 5,
    '=': 6,
    '#': 7,
    '1': 8,
    '2': 9,
    '3': 10,
    '4': 11,
    '5': 12,
    '6': 13,
    '7': 14,
    '8': 15,
    '9': 16,
    '0': 17,
}

# Create index_to_char dictionary by reversing char_to_index
index_to_char = {v: k for k, v in char_to_index.items()}

In [None]:
import numpy as np

def preprocess_smiles(smiles, max_length=100):
    # Initialize an array for the padded numerical representation
    padded_representation = np.full(max_length, -1)  # Use '_' for padding

    # Tokenize SMILES string and convert to numerical representation
    numerical_representation = [char_to_index.get(c, -1) for c in smiles]

    # Truncate or pad numerical representation to the fixed length
    num_tokens = min(len(numerical_representation), max_length)
    padded_representation[:num_tokens] = numerical_representation[:num_tokens]

    return padded_representation


def preprocess_features(features):
    # Normalize features if necessary
    # Convert features to tensors
    features_list = [f for f in features]

    # Normalize features
    features_array = np.array(features_list)
    normalized_features = (features_array - np.mean(features_array)) / np.std(features_array)

    # Convert features to tensors
    features_tensor = torch.tensor(normalized_features, dtype=torch.float32)

    return features_tensor

class Generator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Generator, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.rnn = nn.LSTM(self.input_size, self.hidden_size, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x):
        # Reshape input tensor to have the sequence length dimension
        x = x.unsqueeze(1)
        batch_size = x.size(0)
        init_hidden = (torch.zeros(1, batch_size, self.hidden_size),
                       torch.zeros(1, batch_size, self.hidden_size))
        out, _ = self.rnn(x, init_hidden)
        out = self.fc(out[:, -1, :])  # Select the last output in the sequence
        #print(out.shape)
        return out


# Define Discriminator architecture
class Discriminator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        #print(x.shape)
        x_tensor = torch.tensor(x, dtype=torch.float32)
        #print(x_tensor.shape)
        out = torch.relu(self.fc1(x_tensor))
        #print(1)
        out = torch.sigmoid(self.fc2(out))
        return out

# Training loop
def train(generator, discriminator, train_loader, criterion, g_optimizer, d_optimizer, num_epochs):
    for epoch in range(num_epochs):
        for smiles, features in train_loader:
            # Preprocess SMILES and features
            processed_features = preprocess_features(features)
            processed_smiles = preprocess_smiles(smiles)

            # Train Discriminator
            d_optimizer.zero_grad()
            real_output = discriminator(processed_smiles)
            #print(0)
            fake_output = discriminator(generator(processed_features))
            d_loss = criterion(real_output, torch.ones_like(real_output)) + criterion(fake_output, torch.zeros_like(fake_output))
            d_loss.backward()
            d_optimizer.step()

            # Train Generator
            g_optimizer.zero_grad()
            fake_output = discriminator(generator(processed_features))
            g_loss = criterion(fake_output, torch.ones_like(fake_output))
            g_loss.backward()
            g_optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], G_Loss: {g_loss.item()}, D_Loss: {d_loss.item()}')

# Set hyperparameters
input_size = 6  # Number of molecular descriptors
hidden_size = 128
output_size = 100  # Size of vocabulary for SMILES tokens
num_epochs = 10
learning_rate = 0.001

# Load your dataset

train_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Initialize Generator and Discriminator
generator = Generator(input_size=6, hidden_size=128, output_size=100)
discriminator = Discriminator(input_size=100, hidden_size=128, output_size=1)  # Assuming features have the same size

# Define loss function and optimizers
criterion = nn.BCELoss()
g_optimizer = optim.Adam(generator.parameters(), lr=learning_rate)
d_optimizer = optim.Adam(discriminator.parameters(), lr=learning_rate)

# Train the models
train(generator, discriminator, train_loader, criterion, g_optimizer, d_optimizer, num_epochs)


In [None]:
# Define a function to convert numerical tokens to SMILES
def convert_to_smiles(output_tensor, char_to_index):
    # Convert tensor to numpy array
    output_array = output_tensor.squeeze().detach().numpy()
    # Map numerical tokens to characters in SMILES vocabulary
    smiles_chars = [char for token in output_array for char, index in char_to_index.items() if index == int(token)]
    # Join characters to form SMILES string
    smiles_string = ''.join(smiles_chars)
    return smiles_string

# Test the generator
def test_generator(generator, feature_vector, char_to_index):
    # Convert feature vector to tensor
    feature_tensor = torch.tensor(feature_vector, dtype=torch.float32)
    # Generate SMILES tokens
    generated_tokens = generator(feature_tensor.unsqueeze(0))*100
    print(generated_tokens)
    # Convert tokens to SMILES string
    generated_smiles = convert_to_smiles(generated_tokens, char_to_index)
    return generated_smiles

# Example feature vector
example_feature = [5.5633998e+00, 6.0000000e+00, 6.0275702e+02, 3.0880994e-01,
        8.0000000e+00, 1.0000000e+00]
example_feature = preprocess_features(example_feature)


# Test the generator
generated_smiles = test_generator(generator, example_feature, char_to_index)
print("Generated SMILES:", generated_smiles)

tensor([[ -1.9386,   7.7949,   4.6882,   1.2334,  -7.6688,  -1.7499,  -7.9764,
          -0.4336,  -2.1415,   1.3494,   0.4190,   9.9071,  -6.3024,  -1.0888,
          -4.9484,  -2.6139,   0.5625,   5.8099,  -6.9510,  -6.1652,   0.7955,
           9.6862,   8.4107,  -0.6118,  -7.7706,   8.0619,   8.5937,  -8.2207,
          -5.7854,  10.1466,   8.5191,   2.0062,  10.8956,  -2.4045,   5.3824,
          -9.9102,   4.3019,   0.0130,  -5.5973,   8.7250,   4.9875,  -1.5318,
          -7.9905, -11.6901,  -1.7776,   1.3973,   8.2043,  -9.0098,   9.8728,
          -3.4986,   5.9645,  -1.9910,  -7.2768,   6.4773,  -2.8570,   0.6650,
          -5.4184,  -1.7903,  -7.9534,  -0.1328,  -6.2866,  -7.6002,  -0.2442,
           4.0442,   5.5332,  -1.7552,  -3.1745,  -2.6822,  -2.1126,   9.8187,
          -6.7511,  -2.6546,  -1.7342,  -4.7961,   1.0328,   0.4165,  -4.3086,
           7.3225,   6.2127,  10.1198,  -0.7599,  -2.1322,   2.4347,  -6.6218,
          -4.4297,  -6.2012,   8.3669,  -0.7065,  -2

  feature_tensor = torch.tensor(feature_vector, dtype=torch.float32)


In [None]:
dataset[0]

('COc1ccc(CN2CC(C)C(OC)CN(C)C(=O)c3cc(NC(=O)c4nc5ccccc5s4)ccc3OCC2C)cc1',
 array([5.5633998e+00, 6.0000000e+00, 6.0275702e+02, 3.0880994e-01,
        8.0000000e+00, 1.0000000e+00], dtype=float32))

In [None]:
xy = preprocess_smiles("NCOCNCC=OCCO=OCCNCOCOOCNCN(COCNO(CNNOCCC)N)COONCCOCN(CC()CCOCOCCNCON#C")

In [None]:
xy

array([2., 0., 1., 0., 2., 0., 0., 5., 1., 0., 0., 1., 5., 1., 0., 0., 2.,
       0., 1., 0., 1., 1., 0., 2., 0., 2., 3., 0., 1., 0., 2., 1., 3., 0.,
       2., 2., 1., 0., 0., 0., 4., 2., 4., 0., 1., 1., 2., 0., 0., 1., 0.,
       2., 3., 0., 0., 3., 4., 0., 0., 1., 0., 1., 0., 0., 2., 0., 1., 2.,
       6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
len(xy)

100

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw

def visualize_molecule(smiles):
    # Convert SMILES string to RDKit molecule object
    mol = Chem.MolFromSmiles(smiles)

    # Check if the conversion was successful
    if mol is not None:
        # Generate a 2D depiction of the molecule
        img = Draw.MolToImage(mol)
        # Display the image
        img.show()
    else:
        print("Invalid SMILES string:", smiles)

# Example SMILES string
smiles = "(OCOC2C)C21C1131N3)(C1(O12)=CCC2OC=3CN1C1CO)H"
# Visualize the molecule
visualize_molecule(smiles)

Invalid SMILES string: (OCOC2C)C21C1131N3)(C1(O12)=CCC2OC=3CN1C1CO)H


[12:32:44] SMILES Parse Error: syntax error while parsing: (OCOC2C)C21C1131N3)(C1(O12)=CCC2OC=3CN1C1CO)H
[12:32:44] SMILES Parse Error: Failed parsing SMILES '(OCOC2C)C21C1131N3)(C1(O12)=CCC2OC=3CN1C1CO)H' for input: '(OCOC2C)C21C1131N3)(C1(O12)=CCC2OC=3CN1C1CO)H'


In [None]:
from rdkit import Chem
from rdkit.Chem import Draw

def visualize_molecule(smiles, filename):
    # Convert SMILES string to RDKit molecule object
    mol = Chem.MolFromSmiles(smiles)

    # Check if the conversion was successful
    if mol is not None:
        # Generate a 2D depiction of the molecule
        img = Draw.MolToImage(mol)
        # Save the image to a file
        img.save(filename)
    else:
        print("Invalid SMILES string:", smiles)

# Example SMILES string
smiles = "COc1ccc(CN2CC(C)C(OC)CN(C)C(=O)c3cc(NC(=O)c4nc5ccccc5s4)ccc3OCC2C)cc1"
# Specify the filename to save the image
filename = "molecule.png"
# Visualize the molecule and save the image
visualize_molecule(smiles, filename)


In [None]:
from rdkit import Chem

def correct_smiles(smiles):
    # Correct mismatched parentheses
    num_open_parentheses = smiles.count('(')
    num_close_parentheses = smiles.count(')')
    if num_open_parentheses > num_close_parentheses:
        smiles += ')' * (num_open_parentheses - num_close_parentheses)
    elif num_open_parentheses < num_close_parentheses:
        smiles = '(' * (num_close_parentheses - num_open_parentheses) + smiles

    # Replace invalid bonds and correct other syntax errors
    invalid_bonds = ['=', '#', '/', '\\', ':', '.']
    for bond in invalid_bonds:
        smiles = smiles.replace(' ' + bond + ' ', bond)
    smiles = smiles.replace('C1C', 'C1')
    smiles = smiles.replace('1C', 'C1')

    # Handle additional syntax corrections based on common issues

    return smiles

# Example usage
incorrect_smiles = "(OCOC2C)C21C1131N3)(C1(O12)=CCC2OC=3CN1C1CO)H"
corrected_smiles = correct_smiles(incorrect_smiles)
print("Corrected SMILES:", corrected_smiles)

# Convert corrected SMILES to RDKit molecule object
mol = Chem.MolFromSmiles(corrected_smiles)
if mol is not None:
    # Generate a 2D depiction of the molecule
    Chem.Draw.MolToImage(mol).show()
else:
    print("Failed to parse corrected SMILES.")


Corrected SMILES: ((OCOC2C)C2C11131N3)(C1(O12)=CCC2OC=3CNC11O)H
Failed to parse corrected SMILES.


[12:38:15] SMILES Parse Error: syntax error while parsing: ((OCOC2C)C2C11131N3)(C1(O12)=CCC2OC=3CNC11O)H
[12:38:15] SMILES Parse Error: Failed parsing SMILES '((OCOC2C)C2C11131N3)(C1(O12)=CCC2OC=3CNC11O)H' for input: '((OCOC2C)C2C11131N3)(C1(O12)=CCC2OC=3CNC11O)H'


# **code2**

In [2]:
# Assuming 'train.smi' contains your dataset
with open('train.smi', 'r') as f:
    smiles_strings = f.readlines()

# Collect unique characters from all SMILES strings
smiles_vocab = set()
for smiles in smiles_strings:
    smiles_vocab.update(set(smiles.strip()))

# Print the size of the SMILES vocabulary
print("Size of SMILES vocabulary:", len(smiles_vocab))


Size of SMILES vocabulary: 32


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

# Define your SMILESDataset class
class SMILESDataset(Dataset):
    def __init__(self, data_file):
        self.data = pd.read_csv(data_file, header=None, names=['smiles'])
        self.data['fingerprint'] = self.data['smiles'].apply(generate_morgan_fingerprint)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        fingerprint = self.data.iloc[idx]['fingerprint']
        if fingerprint is None:
            # Handle invalid molecule
            return None
        return fingerprint

# Generate Morgan fingerprints
def generate_morgan_fingerprint(smiles, radius=2, num_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
    return np.array(fingerprint, dtype=np.float32)

dataset = SMILESDataset('train.smi')



RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x2048 and 32x256)

In [7]:
# Define the Generator architecture
class Generator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Generator, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.fc = nn.Linear(self.input_size, self.hidden_size)
        self.rnn = nn.LSTM(self.hidden_size, self.hidden_size, num_layers=2, batch_first=True)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x):
        x = torch.relu(self.fc(x))
        x = x.unsqueeze(1)
        batch_size = x.size(0)
        init_hidden = (torch.zeros(2, batch_size, self.hidden_size),
                       torch.zeros(2, batch_size, self.hidden_size))
        out, _ = self.rnn(x, init_hidden)
        out = self.out(out[:, -1, :])
        out = nn.functional.softmax(out, dim=1)  # Apply softmax to convert logits to probabilities
        return out

# Define the Discriminator architecture
class Discriminator(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Discriminator, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.fc1 = nn.Linear(self.input_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, 1)

    def forward(self, x):
        print(x.shape)
        x = torch.relu(self.fc1(x))
        print(x.shape)
        x = torch.sigmoid(self.fc2(x))
        return x

# Training loop
def train(generator, discriminator, train_loader, g_criterion, d_criterion, g_optimizer, d_optimizer, num_epochs, clip_value=1.0):
    for epoch in range(num_epochs):
        generator.train()
        discriminator.train()
        for fingerprints in train_loader:
            if fingerprints is None:
                continue

            # Train Discriminator
            d_optimizer.zero_grad()
            real_labels = torch.ones(fingerprints.size(0), 1)
            fake_labels = torch.zeros(fingerprints.size(0), 1)
            real_outputs = discriminator(fingerprints)
            fake_fingerprints = generator(fingerprints)
            fake_outputs = discriminator(fake_fingerprints.detach())
            d_loss_real = d_criterion(real_outputs, real_labels)
            d_loss_fake = d_criterion(fake_outputs, fake_labels)
            d_loss = d_loss_real + d_loss_fake
            d_loss.backward()
            d_optimizer.step()

            # Train Generator
            g_optimizer.zero_grad()
            fake_outputs = discriminator(fake_fingerprints)
            g_loss = g_criterion(fake_outputs, real_labels)
            g_loss.backward()
            g_optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], G_Loss: {g_loss.item()}, D_Loss: {d_loss.item()}')

# Set hyperparameters
input_size = 2048  # Size of Morgan fingerprint
hidden_size = 256
output_size = 32  # Size of SMILES vocabulary
num_epochs = 10
learning_rate = 0.001

# Load your dataset

train_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Initialize Generator and Discriminator
generator = Generator(input_size=input_size, hidden_size=hidden_size, output_size=output_size)
discriminator = Discriminator(input_size=output_size, hidden_size=hidden_size)

# Define loss functions and optimizers
g_criterion = nn.BCELoss()
d_criterion = nn.BCELoss()
g_optimizer = optim.Adam(generator.parameters(), lr=learning_rate)
d_optimizer = optim.Adam(discriminator.parameters(), lr=learning_rate)

# Train the models
train(generator, discriminator, train_loader, g_criterion, d_criterion, g_optimizer, d_optimizer, num_epochs)


torch.Size([64, 2048])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x2048 and 32x256)