In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from evaluation_reg import *
import matplotlib.pyplot as plt
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, roc_curve, precision_score, f1_score,make_scorer
from sklearn.metrics import matthews_corrcoef
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [3]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [30]:
# Read the whole table into memory. In this notebook column 1 is compared
# against 0/1 as the class label and columns 2+ are used as model features.
dataset_csv = 'MAD_dataset_12.csv'
my_data = pd.read_csv(dataset_csv)

In [31]:
# Sanity check: print (n_rows, n_columns) of the loaded frame.
print(my_data.shape)

(371782, 1684)


In [32]:
# Build a class-balanced training set and use every remaining row for testing.
np.random.seed(42)

# Column 1 holds the binary label; split the frame by class.
label_col = my_data.iloc[:, 1]
df_0 = my_data[label_col == 0]
df_1 = my_data[label_col == 1]

# Take 90% of the label-1 row count from *each* class.
n_samples_per_class = round((len(df_1) / 10) * 9)
train_0, train_1 = (
    class_rows.sample(n=n_samples_per_class, random_state=42)
    for class_rows in (df_0, df_1)
)
train = pd.concat([train_0, train_1])

# Everything not drawn for training becomes the test set
# (it is therefore not class-balanced).
test = my_data.drop(train.index)

# Features start at column 2; column 1 is the target.
X_train, y_train = train.iloc[:, 2:], train.iloc[:, 1]
X_test, y_test = test.iloc[:, 2:], test.iloc[:, 1]

In [33]:
class DiabetesDataset(Dataset):
    """Map-style dataset wrapping a pandas feature table and its target column.

    Both inputs are converted to tensors once, at construction time, via
    ``torch.from_numpy`` on their ``.values`` arrays (so the tensor dtype
    follows the underlying NumPy dtype).
    """

    def __init__(self, X, y):
        """Store ``X`` (rows = samples) and ``y`` (one target per row) as tensors."""
        features = X.values
        targets = y.values
        self.x_data = torch.from_numpy(features)
        self.y_data = torch.from_numpy(targets)
        # Sample count is taken from the feature table.
        self.len = X.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        sample = self.x_data[index]
        target = self.y_data[index]
        return sample, target

# Training data: reshuffled every epoch; the trailing partial batch is dropped
# so every optimisation step sees exactly 128 samples.
train_dataset = DiabetesDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True, num_workers=2, drop_last=True)

# Held-out data: every sample must be scored, so the last (possibly smaller)
# batch is kept. A fixed order also makes the evaluation repeatable, since no
# shuffling is needed when only predictions are collected.
test_dataset = DiabetesDataset(X_test, y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False, num_workers=2, drop_last=False)


In [34]:
class GlycoproteinProphet(nn.Module):
    """Two-branch network that outputs one sigmoid score per input row.

    Input layout (see ``forward``): the first 1280 columns feed the protein
    branch, all remaining columns feed the glycan branch, whose linear layers
    expect 402 features (1682 columns in total).

    * Protein branch: Linear(1280, 64) -> Dropout -> BatchNorm ->
      Linear(64, 32) -> Dropout -> BatchNorm.
    * Glycan branch: query/key/value projections (402 -> 64), a
      (batch x batch) score matrix, L2-normalised rows used as weights, the
      weighted values concatenated with the raw glycan features (402 + 64 =
      466) and projected to 32 features.
    * Head: concat(32, 32) -> 64 -> 32 -> 16 -> 1 with ReLU, then sigmoid.

    Note that the attention scores are computed between the rows of one
    mini-batch, so a sample's output depends on which other samples share
    its batch.

    Several layers (``activation_fn``, ``glycan_fc1``, ``glycan_lstm``,
    ``conv1``, ``glycan_rnn``, ``glycan_f2``, ``bn_glycan1``, ``softmax``)
    are registered but not used by ``forward``. They are kept, in their
    original order, so that ``state_dict`` keys and the sequence of random
    weight initialisations stay unchanged.
    """

    def __init__(self):
        super(GlycoproteinProphet, self).__init__()
        # --- protein branch ---
        self.prot_fc1 = nn.Linear(1280, 64)
        self.prot_fc2 = nn.Linear(64, 32)
        self.prot_dropout1 = nn.Dropout(0.3)
        self.prot_dropout2 = nn.Dropout(0.2)
        self.bn_prot1 = nn.BatchNorm1d(64)
        self.bn_prot2 = nn.BatchNorm1d(32)
        self.activation_fn = nn.GELU()  # not used in forward

        # --- glycan layers that forward does not use ---
        self.glycan_fc1 = nn.Linear(402, 64)
        self.glycan_lstm = nn.LSTM(128, 64, 2, batch_first=True)
        self.conv1 = nn.Conv1d(128, 64, 1)
        self.glycan_rnn = nn.RNN(64, 64, 2)
        self.glycan_f2 = nn.Linear(402, 32)
        self.bn_glycan1 = nn.BatchNorm1d(32)

        # --- classification head ("bn_" prefix is historical; these are Linear/ReLU) ---
        self.bn_fc1 = nn.Linear(64, 32)
        self.bn_fc2 = nn.Linear(32, 16)
        self.bn_fc3 = nn.Linear(16, 1)
        self.bn_relu = nn.ReLU()

        # --- attention over glycan features ---
        self.W_query = nn.Linear(402, 64)
        self.W_key = nn.Linear(402, 64)
        self.W_value = nn.Linear(402, 64)
        self.softmax = nn.Softmax(dim=1)  # not used in forward
        self.attention_glycan_fc1 = nn.Linear(466, 32)

    def forward(self, input):
        """Score a batch of feature rows.

        Args:
            input: tensor of shape (batch, 1280 + 402); any dtype/device, it
                is moved to the module's device and cast to float32.

        Returns:
            Tensor of shape (batch, 1) with sigmoid outputs in [0, 1].
        """
        # Use the device the weights live on instead of a notebook-level
        # global, so the module works wherever it has been moved.
        model_device = self.prot_fc1.weight.device
        features = input.to(model_device).float()
        prot_x = features[:, :1280]
        # Everything after the protein block; the linear layers below require
        # this to be exactly 402 columns wide.
        glycan_x = features[:, 1280:]

        # Protein branch (no non-linearity between the linear layers).
        prot1 = self.bn_prot1(self.prot_dropout1(self.prot_fc1(prot_x)))
        prot2 = self.bn_prot2(self.prot_dropout2(self.prot_fc2(prot1)))

        # Attention: query is (batch, 64) and key^T is (64, batch), so
        # `scores` is (batch, batch) - rows of the same mini-batch attend to
        # each other.
        query = self.W_query(glycan_x)
        key = self.W_key(glycan_x).transpose(0, 1)
        value = self.W_value(glycan_x)
        scores = torch.matmul(query, key)
        # Weights are the score rows scaled to unit L2 norm (not a softmax).
        attention_weights = F.normalize(scores, p=2, dim=1)
        weighted_values = torch.matmul(attention_weights, value)

        # Raw glycan features (402) + attended values (64) -> 32.
        glycan_feature = torch.cat((glycan_x, weighted_values), dim=1)
        glycan = self.attention_glycan_fc1(glycan_feature)

        # Fuse both branches and classify.
        h_n = torch.cat((prot2, glycan), 1)
        hidden = self.bn_relu(self.bn_fc1(h_n))
        hidden = self.bn_relu(self.bn_fc2(hidden))
        return torch.sigmoid(self.bn_fc3(hidden))

In [35]:
# Seed torch's RNG so weight initialisation and the DataLoader shuffle order
# repeat when this cell and the training cell are re-run in sequence.
torch.manual_seed(42)

# This cell rebinds the class name `GlycoproteinProphet` to a model instance
# (the other cells call the instance through that name). Recover the class
# from the instance when the cell is run a second time, so a re-run creates a
# fresh model instead of failing by calling the existing instance with no input.
_model_cls = GlycoproteinProphet if isinstance(GlycoproteinProphet, type) else type(GlycoproteinProphet)
GlycoproteinProphet = _model_cls().to(device)

# Loss function and optimizer.
# BCELoss expects probabilities, which matches the sigmoid output of the model.
criterion = nn.BCELoss().to(device)
optimizer = optim.Adam(GlycoproteinProphet.parameters(), lr=0.001)

# Learning-rate scheduler: halves the learning rate every 100 calls to
# `scheduler.step()`. It only takes effect if the training loop steps it.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.5)

# Number of passes over the training set.
num_epochs = 200

In [36]:
# Mean training loss of each completed epoch (one entry per epoch).
train = []
for epoch in range(num_epochs):
    train_losses = []  # per-batch losses of the current epoch
    GlycoproteinProphet.train()
    for inputs, labels in train_loader:
        # BCELoss needs float targets on the same device as the predictions.
        labels = labels.float().to(device)
        optimizer.zero_grad()
        # The model returns (batch, 1); drop only the trailing axis so the
        # result always stays 1-D and lines up with `labels`.
        outputs = GlycoproteinProphet(inputs).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
    # NOTE: `scheduler` is never stepped in this loop (the original calls were
    # commented out), so the learning rate stays at the optimizer's initial
    # value for all epochs.
    epoch_loss = np.mean(train_losses)
    # Record the history once per epoch, not once per batch.
    train.append(epoch_loss)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')

Epoch [10/200], Loss: 0.3537
Epoch [20/200], Loss: 0.3286
Epoch [30/200], Loss: 0.3144
Epoch [40/200], Loss: 0.3063
Epoch [50/200], Loss: 0.2998
Epoch [60/200], Loss: 0.2941
Epoch [70/200], Loss: 0.2885
Epoch [80/200], Loss: 0.2834
Epoch [90/200], Loss: 0.2804
Epoch [100/200], Loss: 0.2764
Epoch [110/200], Loss: 0.2741
Epoch [120/200], Loss: 0.2711
Epoch [130/200], Loss: 0.2694
Epoch [140/200], Loss: 0.2660
Epoch [150/200], Loss: 0.2630
Epoch [160/200], Loss: 0.2620
Epoch [170/200], Loss: 0.2605
Epoch [180/200], Loss: 0.2583
Epoch [190/200], Loss: 0.2563
Epoch [200/200], Loss: 0.2548


In [37]:
# Collect predicted probabilities and ground-truth labels, one list per batch.
result_test = []
lables_test = []
GlycoproteinProphet.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        # reshape(-1) turns the (batch, 1) output into a 1-D tensor even when
        # the batch holds a single sample, so `.tolist()` always yields a list
        # (a bare squeeze() would give a scalar there and break the flatten
        # step below).
        outputs = GlycoproteinProphet(inputs).reshape(-1)
        result_test.append(outputs.tolist())
        # Labels are only converted to Python floats; no device transfer needed.
        lables_test.append(labels.float().tolist())

# Flatten the per-batch lists into one prediction / label per sample.
my_result = [prob for batch in result_test for prob in batch]
my_labels = [target for batch in lables_test for target in batch]

In [38]:
# Binarise the predicted probabilities: strictly above 0.5 -> class 1, else class 0.
my_test_pred_test = [1 if prob > 0.5 else 0 for prob in my_result]