In [1]:
import gc
import os
import pickle
import random
import joblib
import pandas as pd
# import polars as pd
from tqdm import tqdm

import numpy as np
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from sklearn.model_selection import StratifiedKFold
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor

from sklearn.metrics import average_precision_score
from KAN import KANLinear

In [2]:
class CFG:

    PREPROCESS = False
    SHRINKING = False
    SHRINKING_SIZE = 0.3
    EPOCHS = 20 #20
    BATCH_SIZE = 4096
    LR = 1e-3
    WD = 1e-6

    NBR_FOLDS = 2
    SELECTED_FOLDS = [0]

    SEED = 42

In [3]:
# import tensorflow as tf
import torch
def set_seeds(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    #tf.random.set_seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

set_seeds(seed=CFG.SEED)

In [4]:
# 缩小数据集
if CFG.PREPROCESS:
    enc = {'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13,
           '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25, '=': 26,
           '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, ']': 32, 'N': 33, '7': 34, 'n': 35, '-': 36}
    train_raw = pd.read_parquet('data/train.parquet')

    if CFG.SHRINKING:
        # 按 'molecule_smiles' 分组，进行采样
        molecules = train_raw['molecule_smiles'].unique()
        sampled_molecules = np.random.choice(molecules, size=int(len(molecules) * CFG.SHRINKING_SIZE), replace=False)
        
        # 根据采样的分子过滤原数据
        train_raw = train_raw[train_raw['molecule_smiles'].isin(sampled_molecules)]

    smiles = train_raw[train_raw['protein_name']=='BRD4']['molecule_smiles'].values
    assert (smiles!=train_raw[train_raw['protein_name']=='HSA']['molecule_smiles'].values).sum() == 0
    assert (smiles!=train_raw[train_raw['protein_name']=='sEH']['molecule_smiles'].values).sum() == 0
    def encode_smile(smile):
        tmp = [enc[i] for i in smile]
        tmp = tmp + [0]*(142-len(tmp))
        return np.array(tmp).astype(np.uint8)

    smiles_enc = joblib.Parallel(n_jobs=60)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
    smiles_enc = np.stack(smiles_enc)
    train = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(142)])
    train['bind1'] = train_raw[train_raw['protein_name']=='BRD4']['binds'].values
    train['bind2'] = train_raw[train_raw['protein_name']=='HSA']['binds'].values
    train['bind3'] = train_raw[train_raw['protein_name']=='sEH']['binds'].values
    train.to_parquet('output/train_enc_sampled.parquet')


In [5]:
if CFG.PREPROCESS:
    enc = {'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13,
           '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25, '=': 26,
           '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, ']': 32, 'N': 33, '7': 34, 'n': 35, '-': 36}
    train_raw = pd.read_parquet('data/train.parquet')
    smiles = train_raw[train_raw['protein_name']=='BRD4']['molecule_smiles'].values
    assert (smiles!=train_raw[train_raw['protein_name']=='HSA']['molecule_smiles'].values).sum() == 0
    assert (smiles!=train_raw[train_raw['protein_name']=='sEH']['molecule_smiles'].values).sum() == 0
    def encode_smile(smile):
        tmp = [enc[i] for i in smile]
        tmp = tmp + [0]*(142-len(tmp))
        return np.array(tmp).astype(np.uint8)

    smiles_enc = joblib.Parallel(n_jobs=60)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
    smiles_enc = np.stack(smiles_enc)
    train = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(142)])
    train['bind1'] = train_raw[train_raw['protein_name']=='BRD4']['binds'].values
    train['bind2'] = train_raw[train_raw['protein_name']=='HSA']['binds'].values
    train['bind3'] = train_raw[train_raw['protein_name']=='sEH']['binds'].values
    train.to_parquet('output/train_enc.parquet')

    test_raw = pd.read_parquet('data/test.parquet')
    smiles = test_raw['molecule_smiles'].values

    smiles_enc = joblib.Parallel(n_jobs=60)(joblib.delayed(encode_smile)(smile) for smile in tqdm(smiles))
    smiles_enc = np.stack(smiles_enc)
    test = pd.DataFrame(smiles_enc, columns = [f'enc{i}' for i in range(142)])
    test.to_parquet('output/test_enc.parquet')

else:
    if CFG.SHRINKING:
        train = pd.read_parquet('output/train_enc_sampled.parquet')
        test = pd.read_parquet('output/test_enc.parquet')

    else:
        train = pd.read_parquet('output/train_enc.parquet')
        test = pd.read_parquet('output/test_enc.parquet')

In [6]:
class MyModel(pl.LightningModule):
    def __init__(self, input_dim=142, input_dim_embedding=37, hidden_dim=128, num_filters=32, output_dim=3, lr=1e-3, weight_decay=1e-6):
        super(MyModel, self).__init__()
        self.save_hyperparameters()

        # Embedding layer
        self.embedding = nn.Embedding(num_embeddings=self.hparams.input_dim_embedding, embedding_dim=self.hparams.hidden_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(in_channels=self.hparams.hidden_dim, out_channels=self.hparams.num_filters, kernel_size=3, stride=1, padding=0)
        self.conv2 = nn.Conv1d(in_channels=self.hparams.num_filters, out_channels=self.hparams.num_filters*2, kernel_size=3, stride=1, padding=0)
        self.conv3 = nn.Conv1d(in_channels=self.hparams.num_filters*2, out_channels=self.hparams.num_filters*3, kernel_size=3, stride=1, padding=0)
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        # Fully connected layers
        self.fc1 = KANLinear(self.hparams.num_filters*3, 1024)
        self.fc2 = KANLinear(1024, 1024)
        self.fc3 = KANLinear(1024, 512)
        self.output = KANLinear(512, self.hparams.output_dim)

        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        x = self.embedding(x).permute(0,2,1)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.global_max_pool(x).squeeze(2)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.dropout(x)
        x = self.output(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.binary_cross_entropy_with_logits(logits, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.binary_cross_entropy_with_logits(logits, y)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        return optimizer

In [7]:
from sklearn.model_selection import train_test_split
# 使用正常训练
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
FEATURES = [f'enc{i}' for i in range(142)]
TARGETS = ['bind1', 'bind2', 'bind3']
all_preds = []


# Train-validation split
train_idx, valid_idx = train_test_split(train.index, test_size=0.2, random_state=42)

# Convert pandas dataframes to PyTorch tensors
X_train = torch.tensor(train.loc[train_idx, FEATURES].values, dtype=torch.int)
y_train = torch.tensor(train.loc[train_idx, TARGETS].values, dtype=torch.float16)
X_val = torch.tensor(train.loc[valid_idx, FEATURES].values, dtype=torch.int)
y_val = torch.tensor(train.loc[valid_idx, TARGETS].values, dtype=torch.float16)

# Create TensorDatasets
train_dataset = TensorDataset(X_train, y_train)
valid_dataset = TensorDataset(X_val, y_val)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=CFG.BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=CFG.BATCH_SIZE)

# Initialize the model
model = MyModel(lr=CFG.LR, weight_decay=CFG.WD)

# Define callbacks
early_stop_callback = EarlyStopping(monitor="val_loss", mode="min", patience=5, verbose=True)
checkpoint_callback = ModelCheckpoint(monitor="val_loss", dirpath="./ckpoint/KANLinear", filename="model", save_top_k=1, mode="min")
lr_monitor = LearningRateMonitor(logging_interval='epoch')

# Trainer setup
trainer = pl.Trainer(
    max_epochs=CFG.EPOCHS,
    callbacks=[early_stop_callback, checkpoint_callback, lr_monitor],
    devices=1,
    accelerator="gpu",  # Adjust based on your hardware
    enable_progress_bar=True,
)

# Train the model
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=valid_loader)

# checkpoint_callback.best_model_path
# Load model onto the GPU
model = MyModel.load_from_checkpoint(checkpoint_callback.best_model_path).to(device)
# model = MyModel.load_from_checkpoint("./ckpoint/KANLinear/model.ckpt").to(device)

print("load ok")
model.eval()  # Set the model to evaluation mode

# Perform batched inference with data also on GPU
all_preds = []
all_targets = []

with torch.no_grad():  # Disable gradient computation for inference
    for batch in tqdm(valid_loader, desc="Validation Progress"):
        # Move data to GPU
        batch_X, batch_y = batch[0].to(device), batch[1].to(device)
        
        # Predict for the batch
        preds = model(batch_X)
        
        # Move predictions and targets to CPU for concatenation
        all_preds.append(preds.cpu())
        all_targets.append(batch_y.cpu())

# Concatenate all batches into single tensors
all_preds = torch.cat(all_preds, dim=0)
all_targets = torch.cat(all_targets, dim=0)

# Compute the score
score = average_precision_score(all_targets.numpy(), all_preds.numpy(), average='micro')
print('Score =', score)




GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision





d:\jupyter notebook\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:654: Checkpoint directory D:\zyh\bddm\protein_predict\ckpoint\KANLinear exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type              | Params | Mode 
--------------------------------------------------------------
0 | embedding       | Embedding         | 4.7 K  | train
1 | conv1           | Conv1d            | 12.3 K | train
2 | conv2           | Conv1d            | 6.2 K  | train
3 | conv3           | Conv1d            | 18.5 K | train
4 | global_max_pool | AdaptiveMaxPool1d | 0      | train
5 | fc1             | KANLinear         | 983 K  | train
6 | fc2             | KANLinear         | 10.5 M | train
7 | fc3             | KANLinear         | 5.2 M  | train
8 | output          | KANLinear         | 15.4 K | train
9 | dropout         | Dropout           | 0      | train
--------------------------------------------------------------
16.8 M    Trainable p

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

d:\jupyter notebook\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
d:\jupyter notebook\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.014


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.013


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.013


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.013


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.013


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.013


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.012


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: 0.012. Signaling Trainer to stop.


load ok


Validation Progress: 100%|██████████| 4806/4806 [04:08<00:00, 19.37it/s]


Score = 0.6526841793782533


In [8]:
# Step 2: Convert the DataFrame to a NumPy array and then to a Tensor
test_tensor = torch.tensor(test.loc[:,FEATURES].values, dtype=torch.int) # Use the appropriate dtype for your model

test_dataset = TensorDataset(test_tensor)
test_loader = DataLoader(test_dataset,batch_size=CFG.BATCH_SIZE,shuffle=False)
# preds = model(test_tensor)
# all_preds.append(preds)

# Perform batched inference
all_preds_test = []

model.eval()  # Set model to evaluation mode

with torch.no_grad():  # Disable gradient computation for inference
    for batch in tqdm(test_loader, desc="Test Data Inference Progress"):
        # Move batch to GPU
        batch_X = batch[0].to(device)
        
        # Predict for the batch
        preds = model(batch_X)
        
        # Move predictions to CPU and append to results
        all_preds_test.append(preds.cpu())  

# Concatenate all predictions into a single tensor
final_preds = torch.cat(all_preds_test, dim=0)

# Min-Max Normalization to scale predictions between 0 and 1
min_val = final_preds.min()
max_val = final_preds.max()
final_preds = (final_preds - min_val) / (max_val - min_val)

# Convert to NumPy for further processing
final_preds = final_preds.numpy()

# Output the final predictions
print("Inference completed. Predictions shape:", final_preds.shape)

Test Data Inference Progress: 100%|██████████| 409/409 [00:17<00:00, 22.83it/s]

Inference completed. Predictions shape: (1674896, 3)





In [9]:
tst = pd.read_parquet('data/test.parquet')
tst['binds'] = 0
tst.loc[tst['protein_name']=='BRD4', 'binds'] = final_preds[(tst['protein_name']=='BRD4').values, 0]
tst.loc[tst['protein_name']=='HSA', 'binds'] = final_preds[(tst['protein_name']=='HSA').values, 1]
tst.loc[tst['protein_name']=='sEH', 'binds'] = final_preds[(tst['protein_name']=='sEH').values, 2]
tst[['id', 'binds']].to_csv('submission_KANLinear.csv', index = False)

  tst.loc[tst['protein_name']=='BRD4', 'binds'] = final_preds[(tst['protein_name']=='BRD4').values, 0]
