In [None]:
import gc
import os
import pickle
import random
import joblib
import pandas as pd
# import polars as pd
from tqdm import tqdm

import numpy as np
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
# from torch import nn, einsum
# from einops import rearrange, repeat
# from einops.layers.torch import Rearrange
import pytorch_lightning as pl
from sklearn.model_selection import StratifiedKFold
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor

from sklearn.metrics import average_precision_score

In [2]:
class CFG:

    PREPROCESS = False
    SHRINKING = False
    SHRINKING_SIZE = 0.3
    EPOCHS = 15 #20
    BATCH_SIZE = 4096
    LR = 1e-4
    WD = 1e-6

    NBR_FOLDS = 2
    SELECTED_FOLDS = [0]

    SEED = 42

In [3]:
# import tensorflow as tf
import torch
def set_seeds(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    #tf.random.set_seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

set_seeds(seed=CFG.SEED)

In [4]:
if CFG.PREPROCESS:
    enc = {'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13,
           '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25, '=': 26,
           '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, ']': 32, 'N': 33, '7': 34, 'n': 35, '-': 36}
    train_raw = pd.read_parquet('data/train.parquet')
    smiles = train_raw[train_raw['protein_name']=='BRD4']['molecule_smiles'].values
    assert (smiles!=train_raw[train_raw['protein_name']=='HSA']['molecule_smiles'].values).sum() == 0
    assert (smiles!=train_raw[train_raw['protein_name']=='sEH']['molecule_smiles'].values).sum() == 0
    def encode_smile(smile):
        tmp = [enc[i] for i in smile]
        tmp = tmp + [0]*(142-len(tmp))
        return np.array(tmp).astype(np.uint8)

    smiles_enc = joblib.Parallel(n_jobs=60)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
    smiles_enc = np.stack(smiles_enc)
    train = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(142)])
    train['bind1'] = train_raw[train_raw['protein_name']=='BRD4']['binds'].values
    train['bind2'] = train_raw[train_raw['protein_name']=='HSA']['binds'].values
    train['bind3'] = train_raw[train_raw['protein_name']=='sEH']['binds'].values
    train.to_parquet('output/train_enc.parquet')

    test_raw = pd.read_parquet('data/test.parquet')
    smiles = test_raw['molecule_smiles'].values

    smiles_enc = joblib.Parallel(n_jobs=60)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
    smiles_enc = np.stack(smiles_enc)
    test = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(142)])
    test.to_parquet('output/test_enc.parquet')

else:
    if CFG.SHRINKING:
        train = pd.read_parquet('output/train_enc_sampled.parquet')
        test = pd.read_parquet('output/test_enc.parquet')

    else:
        train = pd.read_parquet('output/train_enc.parquet')
        test = pd.read_parquet('output/test_enc.parquet')


In [5]:
# class Residual(nn.Module):
#     def __init__(self, fn):
#         super().__init__()
#         self.fn = fn

#     def forward(self, x, **kwargs):
#         return self.fn(x, **kwargs) + x


# class PreNorm(nn.Module):
#     def __init__(self, dim, fn):
#         super().__init__()
#         self.norm = nn.LayerNorm(dim)
#         self.fn = fn

#     def forward(self, x, **kwargs):
#         return self.fn(self.norm(x), **kwargs)


# class FeedForward(nn.Module):
#     def __init__(self, dim, hidden_dim, dropout=0.):
#         super().__init__()
#         self.net = nn.Sequential(
#             nn.Linear(dim, hidden_dim),
#             nn.GELU(),
#             nn.Dropout(dropout),
#             nn.Linear(hidden_dim, dim),
#             nn.Dropout(dropout))

#     def forward(self, x):
#         return self.net(x)


# class Attention(nn.Module):
#     def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
#         super().__init__()
#         inner_dim = dim_head * heads
#         project_out = not (heads == 1 and dim_head == dim)

#         self.heads = heads
#         self.scale = dim_head ** -0.5

#         self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

#         self.to_out = nn.Sequential(
#             nn.Linear(inner_dim, dim),
#             nn.Dropout(dropout)
#         ) if project_out else nn.Identity()

#     def forward(self, x, mask=None):
#         b, n, _, h = *x.shape, self.heads
#         qkv = self.to_qkv(x).chunk(3, dim=-1)
#         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), qkv)

#         dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
#         mask_value = -torch.finfo(dots.dtype).max

#         if mask is not None:
#             mask = F.pad(mask.flatten(1), (1, 0), value=True)
#             assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
#             mask = rearrange(mask, 'b i -> b () i ()') * rearrange(mask, 'b j -> b () () j')
#             dots.masked_fill_(~mask, mask_value)
#             del mask

#         attn = dots.softmax(dim=-1)

#         out = einsum('b h i j, b h j d -> b h i d', attn, v)
#         out = rearrange(out, 'b h n d -> b n (h d)')
#         out = self.to_out(out)
#         return out


# class Transformer(nn.Module):
#     def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
#         super().__init__()
#         self.layers = nn.ModuleList([])
#         for _ in range(depth):
#             self.layers.append(nn.ModuleList([
#                 Residual(PreNorm(dim, Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout))),
#                 Residual(PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout)))]))

#     def forward(self, x, mask=None):
#         for attn, ff in self.layers:
#             x = attn(x, mask=mask)
#             x = ff(x)
#         return x

In [6]:
def init_weights(m):
    if isinstance(m, nn.Linear) or isinstance(m, nn.Conv1d):
        nn.init.xavier_normal_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
class CNNTransformerModel(pl.LightningModule):
    def __init__(self, input_dim=142, input_dim_embedding=37, hidden_dim=128, num_filters=32, output_dim=3, lr=1e-4, weight_decay=1e-6):
        super(CNNTransformerModel, self).__init__()
        self.save_hyperparameters()
        # 在模型中应用初始化
        self.apply(init_weights)
        # Embedding layer
        self.embedding = nn.Embedding(num_embeddings=self.hparams.input_dim_embedding, embedding_dim=self.hparams.hidden_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(in_channels=self.hparams.hidden_dim, out_channels=self.hparams.num_filters, kernel_size=3, stride=1, padding=0)
        self.conv2 = nn.Conv1d(in_channels=self.hparams.num_filters, out_channels=self.hparams.num_filters*2, kernel_size=3, stride=1, padding=0)
        self.conv3 = nn.Conv1d(in_channels=self.hparams.num_filters*2, out_channels=self.hparams.num_filters*3, kernel_size=3, stride=1, padding=0)
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        # Transformer layers
        # Create a transformer encoder layer and a transformer encoder
        self.transformer_layer = nn.TransformerEncoderLayer(d_model=self.hparams.num_filters*3, nhead=4, dropout=0.1, norm_first=True)  # 4 heads for multi-head attention
        self.transformer = nn.TransformerEncoder(self.transformer_layer, num_layers=2) 
        # self.transformer_patch = Transformer(dim=self.hparams.num_filters*3, depth=2, heads=4, dim_head=128, mlp_dim=self.hparams.num_filters*3, dropout=0.1)

        # Fully connected layers
        # self.fc1 = nn.Linear(self.hparams.num_filters * self.hparams.input_dim, 512)
        self.fc1 = nn.Linear(self.hparams.num_filters*3, 48)
        # self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(48, 24)
        self.output = nn.Linear(24, self.hparams.output_dim)

        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        # mask=None
        # Embedding layer
        x = self.embedding(x).permute(0,2,1)
        # Add a batch dimension (since CNN expects 4D input, including batch size)
        # x = x.unsqueeze(1)  # (batch_size, 1, seq_len, hidden_dim)
        
        # CNN layer
        x = F.relu(self.conv1(x))  # Apply 1st Conv layer
        x = F.relu(self.conv2(x))  # Apply 2nd Conv layer
        x = F.relu(self.conv3(x))
        x = self.global_max_pool(x).squeeze(2)
        x = nn.LayerNorm(x)

        # Transformer layer
        # Transformer expects input of shape (seq_len, batch_size, feature_dim)
        x = x.unsqueeze(0)  # Add batch dimension to match Transformer input shape: (1, batch_size, num_filters*3)
        
        # x = self.transformer_patch(x, mask)  # Apply Transformer Encoder
        x = self.transformer(x)
        x = nn.LayerNorm(x)
        x = x.squeeze(0)  # Remove the sequence dimension after transformer
        # Flatten the output from transformer
        x = x.flatten(1)  # Flatten (batch_size, seq_len * num_filters)

        # Fully connected layers with ReLU activations
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        # x = F.relu(self.fc2(x))
        # x = self.dropout(x)
        x = F.relu(self.fc3(x))
        x = self.dropout(x)

        

        x = self.output(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.binary_cross_entropy_with_logits(logits, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.binary_cross_entropy_with_logits(logits, y)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        return optimizer
    


In [None]:
from sklearn.model_selection import train_test_split
# 使用正常训练
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
FEATURES = [f'enc{i}' for i in range(142)]
TARGETS = ['bind1', 'bind2', 'bind3']
all_preds = []


# Train-validation split
train_idx, valid_idx = train_test_split(train.index, test_size=0.2, random_state=42)

# Convert pandas dataframes to PyTorch tensors
X_train = torch.tensor(train.loc[train_idx, FEATURES].values, dtype=torch.int)
y_train = torch.tensor(train.loc[train_idx, TARGETS].values, dtype=torch.float16)
X_val = torch.tensor(train.loc[valid_idx, FEATURES].values, dtype=torch.int)
y_val = torch.tensor(train.loc[valid_idx, TARGETS].values, dtype=torch.float16)

# # Check if there are NaN values in the input features or target values
# print("NaN in X_train:", torch.isnan(X_train).sum().item())
# print("NaN in y_train:", torch.isnan(y_train).sum().item())
# print("NaN in X_val:", torch.isnan(X_val).sum().item())
# print("NaN in y_val:", torch.isnan(y_val).sum().item())

# Create TensorDatasets
train_dataset = TensorDataset(X_train, y_train)
valid_dataset = TensorDataset(X_val, y_val)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=CFG.BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=CFG.BATCH_SIZE)

# Initialize the model
model = CNNTransformerModel(lr=CFG.LR, weight_decay=CFG.WD)

# Define callbacks
early_stop_callback = EarlyStopping(monitor="val_loss", mode="min", patience=5, verbose=True)
checkpoint_callback = ModelCheckpoint(monitor="val_loss", dirpath="./ckpoint/Transformer", filename="model", save_top_k=1, mode="min")
lr_monitor = LearningRateMonitor(logging_interval='epoch')

# Trainer setup
trainer = pl.Trainer(
    max_epochs=CFG.EPOCHS,
    callbacks=[early_stop_callback, checkpoint_callback, lr_monitor],
    devices=1,
    accelerator="gpu",  # Adjust based on your hardware
    enable_progress_bar=True,
)

# Train the model
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=valid_loader)


# Load model onto the GPU
# model = CNNTransformerModel.load_from_checkpoint("./ckpoint/Transformer/model.ckpt").to(device)
model = CNNTransformerModel.load_from_checkpoint(checkpoint_callback.best_model_path).to(device)
print("load ok")
model.eval()  # Set the model to evaluation mode

# Perform batched inference with data also on GPU
all_preds = []
all_targets = []

with torch.no_grad():  # Disable gradient computation for inference
    for batch in tqdm(valid_loader, desc="Validation Progress"):
        # Move data to GPU
        batch_X, batch_y = batch[0].to(device), batch[1].to(device)
        
        # Predict for the batch
        preds = model(batch_X)
        
        # Move predictions and targets to CPU for concatenation
        all_preds.append(preds.cpu())
        all_targets.append(batch_y.cpu())

# Concatenate all batches into single tensors
all_preds = torch.cat(all_preds, dim=0)
all_targets = torch.cat(all_targets, dim=0)

# Compute the score
score = average_precision_score(all_targets.numpy(), all_preds.numpy(), average='micro')
print('Score =', score)




GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision





d:\jupyter notebook\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:654: Checkpoint directory D:\zyh\bddm\protein_predict\ckpoint\Transformer exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name              | Type                    | Params | Mode 
-----------------------------------------------------------------------
0  | embedding         | Embedding               | 4.7 K  | train
1  | conv1             | Conv1d                  | 12.3 K | train
2  | conv2             | Conv1d                  | 6.2 K  | train
3  | conv3             | Conv1d                  | 18.5 K | train
4  | global_max_pool   | AdaptiveMaxPool1d       | 0      | train
5  | transformer_layer | TransformerEncoderLayer | 432 K  | train
6  | transformer       | TransformerEncoder      | 865 K  | train
7  | fc1               | Linear                  | 4.7 K  | train
8  | fc3               | Linear                  | 1.2 K  | train
9  | output            | Linear         

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

d:\jupyter notebook\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, torch.memory_format memory_format, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


: 

In [None]:
# Step 2: Convert the DataFrame to a NumPy array and then to a Tensor
test_tensor = torch.tensor(test.loc[:,FEATURES].values, dtype=torch.int) # Use the appropriate dtype for your model

test_dataset = TensorDataset(test_tensor)
test_loader = DataLoader(test_dataset,batch_size=CFG.BATCH_SIZE,shuffle=False)
# preds = model(test_tensor)
# all_preds.append(preds)

# Perform batched inference
all_preds_test = []

model.eval()  # Set model to evaluation mode

with torch.no_grad():  # Disable gradient computation for inference
    for batch in tqdm(test_loader, desc="Test Data Inference Progress"):
        # Move batch to GPU
        batch_X = batch[0].to(device)
        
        # Predict for the batch
        preds = model(batch_X)
        
        # Move predictions to CPU and append to results
        all_preds_test.append(preds.cpu())  

# Concatenate all predictions into a single tensor
final_preds = torch.cat(all_preds_test, dim=0)

# Min-Max Normalization to scale predictions between 0 and 1
min_val = final_preds.min()
max_val = final_preds.max()
final_preds = (final_preds - min_val) / (max_val - min_val)

# Convert to NumPy for further processing
final_preds = final_preds.numpy()

# Output the final predictions
print("Inference completed. Predictions shape:", final_preds.shape)

Test Data Inference Progress: 100%|██████████| 409/409 [00:08<00:00, 47.40it/s]

Inference completed. Predictions shape: (1674896, 3)





In [None]:
tst = pd.read_parquet('data/test.parquet')
tst['binds'] = 0
tst.loc[tst['protein_name']=='BRD4', 'binds'] = final_preds[(tst['protein_name']=='BRD4').values, 0]
tst.loc[tst['protein_name']=='HSA', 'binds'] = final_preds[(tst['protein_name']=='HSA').values, 1]
tst.loc[tst['protein_name']=='sEH', 'binds'] = final_preds[(tst['protein_name']=='sEH').values, 2]
tst[['id', 'binds']].to_csv('submission_transformer.csv', index = False)

  tst.loc[tst['protein_name']=='BRD4', 'binds'] = final_preds[(tst['protein_name']=='BRD4').values, 0]
