In [54]:
import pandas as pd
import pickle
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

In [55]:
ccs_df = pd.read_csv('/home/robbe/DeepLCCS/data/peprec_CCS.csv')
X_train = pickle.load(open('/home/robbe/DeepLCCS/data/X_train_full.pickle', 'rb'))
global_feats_train = pickle.load(open('/home/robbe/DeepLCCS/data/global_feats_train_full.pickle', 'rb'))
X_test = pickle.load(open('/home/robbe/DeepLCCS/data/X_test_full.pickle', 'rb'))
global_feats_test = pickle.load(open('/home/robbe/DeepLCCS/data/global_feats_test_full.pickle', 'rb'))
ccs_df_train = pickle.load(open('/home/robbe/DeepLCCS/data/ccs_df_train_full.pickle', 'rb'))
ccs_df_test = pickle.load(open('/home/robbe/DeepLCCS/data/ccs_df_test_full.pickle', 'rb'))

In [56]:
print(X_train.shape)
X_train_reshaped = X_train.transpose(0, 2, 1)
X_train_reshaped.shape

(651338, 6, 60)


(651338, 60, 6)

In [57]:
y_train = ccs_df_train.loc[:, 'tr']
y_train_reshaped = y_train.values.reshape(-1, 1)
y_train_reshaped.shape

(651338, 1)

In [58]:
global_feats_train.shape

(651338, 19)

In [59]:
device = (
    "cuda" if torch.cuda.is_available() else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [60]:
class MyModel(nn.Module):
    def __init__(self, input1_size, input2_size, lstm_hidden_size, global_dense_hidden_size, concat_dense_hidden_size, output_size, num_layers, activation):
        super(MyModel, self).__init__()

        # Bidirectional LSTM layer
        self.lstm = nn.LSTM(input1_size, lstm_hidden_size, num_layers=num_layers, bidirectional=True, batch_first=True)

        # Dense network for input 2
        self.global_dense = nn.Sequential(
            nn.Linear(input2_size, global_dense_hidden_size),
            activation
        )

        # Concatenated size for the fully connected layer
        concat_size = 2 * lstm_hidden_size + global_dense_hidden_size

        # Fully connected layer
        self.fc = nn.Sequential(
            nn.Linear(concat_size, concat_dense_hidden_size[0]),
            activation,
            nn.Linear(concat_dense_hidden_size[0], concat_dense_hidden_size[1]),
            activation,
            nn.Linear(concat_dense_hidden_size[1], output_size),
            activation
        )

    def forward(self, input1, input2):
        # Input 1 through Bidirectional LSTM
        lstm_output, _ = self.lstm(input1)

        # Get the last output of the LSTM
        lstm_output = lstm_output[:, -1, :]

        # Input 2 through Dense Network
        dense_output = self.global_dense(input2)

        # Concatenate the outputs
        concatenated = torch.cat((lstm_output, dense_output), dim=1)

        # Fully connected layer
        output = self.fc(concatenated)

        return output

# Define the parameters
input1_size = 6
input2_size = 19
lstm_hidden_size = 256
global_dense_hidden_size = 16
concat_dense_hidden_size = [64, 32]
output_size = 1  # Scalar value output
num_layers = 1
activation = nn.ReLU()  # You can adjust activation function here

# Create an instance of the model
model = MyModel(input1_size, input2_size, lstm_hidden_size, global_dense_hidden_size, concat_dense_hidden_size, output_size, num_layers, activation)

# Print the model architecture
print(model)
model.to(device)

MyModel(
  (lstm): LSTM(6, 256, batch_first=True, bidirectional=True)
  (global_dense): Sequential(
    (0): Linear(in_features=19, out_features=16, bias=True)
    (1): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=528, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=1, bias=True)
    (5): ReLU()
  )
)


MyModel(
  (lstm): LSTM(6, 256, batch_first=True, bidirectional=True)
  (global_dense): Sequential(
    (0): Linear(in_features=19, out_features=16, bias=True)
    (1): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=528, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=1, bias=True)
    (5): ReLU()
  )
)

In [61]:
import torch.optim as optim
# Define your loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)  # You can adjust the learning rate

In [62]:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Define your training data (assuming you have it in numpy arrays)
# Replace X_train_input1, X_train_input2, y_train with your actual training data
X_train_input1 = torch.tensor(X_train_reshaped, dtype=torch.float32)  # Shape: (num_samples, sequence_length, input1_size)
X_train_input2 = torch.tensor(global_feats_train, dtype=torch.float32)  # Shape: (num_samples, input2_size)
y_train = torch.tensor(y_train_reshaped, dtype=torch.float32)         # Shape: (num_samples, output_size)
X_train_input1 = X_train_input1.to(device)
X_train_input2 = X_train_input2.to(device)
y_train = y_train.to(device)
# Split the data into training and validation sets
X_train_input1, X_val_input1, X_train_input2, X_val_input2, y_train, y_val = train_test_split(
    X_train_input1, X_train_input2, y_train, test_size=0.2, random_state=42)

# Create TensorDatasets for training and validation
train_dataset = TensorDataset(X_train_input1, X_train_input2, y_train)
val_dataset = TensorDataset(X_val_input1, X_val_input2, y_val)

# Define DataLoader for training and validation sets
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [63]:
import numpy as np
def mean_absolute_error(targets, predictions):
    """
    Calculate the mean absolute error (MAE).
    """
    return np.mean(np.abs(targets - predictions))

In [64]:
def train_model(model, criterion, optimizer, train_loader, valid_loader, num_epochs=10):
    mae_values = []
    val_mae_values = []
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode

        for inputs1_batch, inputs2_batch, targets_batch in train_loader:
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs1_batch, inputs2_batch)

            # Compute the loss
            loss = criterion(outputs, targets_batch)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        # Validate the model after each epoch
        validation_loss = validate_model(model, criterion, valid_loader)
        with torch.no_grad():
            train_predictions = []
            train_targets = []
            val_predictions = []
            val_targets = []
            for inputs1_batch, inputs2_batch, targets_batch in train_loader:
                outputs = model(inputs1_batch, inputs2_batch)
                train_predictions.extend(outputs.cpu().numpy())
                train_targets.extend(targets_batch.cpu().numpy())

            for inputs1_batch, inputs2_batch, targets_batch in valid_loader:
                outputs = model(inputs1_batch, inputs2_batch)
                val_predictions.extend(outputs.cpu().numpy())
                val_targets.extend(targets_batch.cpu().numpy())

            mae = mean_absolute_error(np.array(train_targets), np.array(train_predictions))
            mae_values.append(mae)
            val_mae = mean_absolute_error(np.array(val_targets), np.array(val_predictions))
            val_mae_values.append(val_mae)

        print(f'Epoch [{epoch+1}/{num_epochs}]: Loss: {loss:.4f}, MAE: {mae:.4f}, Validation Loss: {validation_loss:.4f}, Validation MAE: {val_mae:.4f}')

    print('Training finished!')


def validate_model(model, criterion, valid_loader):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0

    with torch.no_grad():
        for inputs1_batch, inputs2_batch, targets_batch in valid_loader:
            # Forward pass
            outputs = model(inputs1_batch, inputs2_batch)

            # Compute the loss
            loss = criterion(outputs, targets_batch)

            total_loss += loss.item()

    # Calculate average validation loss
    avg_loss = total_loss / len(valid_loader)

    return avg_loss


In [65]:
train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=10)

Epoch [1/10]: Loss: 1437.8615, MAE: 27.5497, Validation Loss: 1607.9731, Validation MAE: 27.7050
Epoch [2/10]: Loss: 1223.6923, MAE: 26.5822, Validation Loss: 1480.8574, Validation MAE: 26.7169
Epoch [3/10]: Loss: 1199.4352, MAE: 24.7058, Validation Loss: 1315.6968, Validation MAE: 24.8192
Epoch [4/10]: Loss: 1071.8512, MAE: 22.3273, Validation Loss: 1070.1194, Validation MAE: 22.4132
Epoch [5/10]: Loss: 809.8674, MAE: 17.8874, Validation Loss: 705.8277, Validation MAE: 17.9754
Epoch [6/10]: Loss: 731.6390, MAE: 14.8015, Validation Loss: 468.2140, Validation MAE: 14.8728
Epoch [7/10]: Loss: 463.9671, MAE: 13.4790, Validation Loss: 398.5749, Validation MAE: 13.5530
Epoch [8/10]: Loss: 493.9843, MAE: 13.1905, Validation Loss: 383.2412, Validation MAE: 13.2620
Epoch [9/10]: Loss: 485.4931, MAE: 13.2293, Validation Loss: 384.8172, Validation MAE: 13.2911
Epoch [10/10]: Loss: 332.3246, MAE: 12.7505, Validation Loss: 367.5258, Validation MAE: 12.8340
Training finished!


In [66]:
import numpy as np
# Assuming `model` is your trained PyTorch model and `val_loader` is your DataLoader for validation data

# Set the model to evaluation mode
model.eval()

# List to store validation predictions
val_predictions = []

# Iterate over the validation dataset
with torch.no_grad():
    for inputs1, inputs2, targets in val_loader:
        # Move inputs and targets to the same device as the model
        inputs1 = inputs1.to(device)
        inputs2 = inputs2.to(device)
        targets = targets.to(device)

        # Forward pass
        outputs = model(inputs1, inputs2)

        # Convert predictions to CPU and append to list
        val_predictions.extend(outputs.cpu().numpy())  # Assuming your targets are numpy arrays

# Convert the list of predictions to a numpy array
val_predictions = np.array(val_predictions)

# Now you can use val_predictions for further analysis or visualization

In [67]:
val_predictions

array([[370.2365 ],
       [496.42847],
       [457.5915 ],
       ...,
       [509.1925 ],
       [384.73877],
       [555.34143]], dtype=float32)

In [68]:
y_val

tensor([[363.9794],
        [505.2887],
        [454.0214],
        ...,
        [541.1950],
        [388.6454],
        [516.8959]], device='cuda:0')

In [3]:
import pickle
global_feats_train = pickle.load(
        open("/home/robbe/DeepLCCS/data/global_feats_train_full-onlyDeepLCtrainset.pickle", "rb")
    )

In [5]:
global_feats_train.shape

(651338, 8)

In [2]:
import itertools

values = [32, 64, 128, 256, 512, 1024]
combinations = list(itertools.product(values, repeat=3))

print("Number of combinations:", len(combinations))
print("Combinations:", combinations)

for i in combinations:
    print(list(i))

Number of combinations: 216
Combinations: [(32, 32, 32), (32, 32, 64), (32, 32, 128), (32, 32, 256), (32, 32, 512), (32, 32, 1024), (32, 64, 32), (32, 64, 64), (32, 64, 128), (32, 64, 256), (32, 64, 512), (32, 64, 1024), (32, 128, 32), (32, 128, 64), (32, 128, 128), (32, 128, 256), (32, 128, 512), (32, 128, 1024), (32, 256, 32), (32, 256, 64), (32, 256, 128), (32, 256, 256), (32, 256, 512), (32, 256, 1024), (32, 512, 32), (32, 512, 64), (32, 512, 128), (32, 512, 256), (32, 512, 512), (32, 512, 1024), (32, 1024, 32), (32, 1024, 64), (32, 1024, 128), (32, 1024, 256), (32, 1024, 512), (32, 1024, 1024), (64, 32, 32), (64, 32, 64), (64, 32, 128), (64, 32, 256), (64, 32, 512), (64, 32, 1024), (64, 64, 32), (64, 64, 64), (64, 64, 128), (64, 64, 256), (64, 64, 512), (64, 64, 1024), (64, 128, 32), (64, 128, 64), (64, 128, 128), (64, 128, 256), (64, 128, 512), (64, 128, 1024), (64, 256, 32), (64, 256, 64), (64, 256, 128), (64, 256, 256), (64, 256, 512), (64, 256, 1024), (64, 512, 32), (64, 512, 