# Train a deep neural net to predict EC number 

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF


In [29]:
df = pd.read_csv("all_prepareddata_for_EzyPredict.csv")
columns_to_drop = ['UniprotID', 'EC number']
X = df.drop(columns=columns_to_drop)
y = df['EC number']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
y_test.unique()

array([2.3 , 3.5 , 2.5 , 1.6 , 2.7 , 4.1 , 3.1 , 2.6 , 3.6 , 1.2 , 6.1 ,
       3.2 , 4.2 , 1.1 , 1.11, 6.3 , 1.3 , 1.14, 1.5 , 2.4 , 4.3 , 6.2 ,
       2.1 , 7.1 , 5.4 , 1.13, 5.3 , 3.7 , 1.8 , 2.8 , 4.99, 1.4 ])

In [30]:
# Convert the numbers of EC to labels
label_mapping = {label: i for i, label in enumerate(y_train.unique())}
y_train_mapped = y_train.replace(label_mapping)
y_test_mapped = y_test.replace(label_mapping)

# convert into pytorch tensorts
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_mapped.values, dtype=torch.int64).squeeze()
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_mapped.values, dtype=torch.int64).squeeze()


## Make a Neural Network

In [33]:

class SimpleNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 100)  # Example hidden layer of size 100
        self.fc2 = nn.Linear(100, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = SimpleNN(X_train.shape[1], len(label_mapping))



In [35]:
criterion = nn.CrossEntropyLoss() ## google told me this was good for classifcation
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [1/20], Loss: 3.1097
Epoch [2/20], Loss: 3.0859
Epoch [3/20], Loss: 3.0680
Epoch [4/20], Loss: 3.0514
Epoch [5/20], Loss: 3.0348
Epoch [6/20], Loss: 3.0181
Epoch [7/20], Loss: 3.0012
Epoch [8/20], Loss: 2.9842
Epoch [9/20], Loss: 2.9671
Epoch [10/20], Loss: 2.9499
Epoch [11/20], Loss: 2.9326
Epoch [12/20], Loss: 2.9151
Epoch [13/20], Loss: 2.8974
Epoch [14/20], Loss: 2.8794
Epoch [15/20], Loss: 2.8611
Epoch [16/20], Loss: 2.8427
Epoch [17/20], Loss: 2.8239
Epoch [18/20], Loss: 2.8049
Epoch [19/20], Loss: 2.7857
Epoch [20/20], Loss: 2.7662


In [37]:
# Model evaluation
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    _, predicted = torch.max(test_outputs, 1)
    accuracy = accuracy_score(y_test_mapped, predicted.numpy())
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

ValueError: Classification metrics can't handle a mix of continuous and multiclass targets

In [39]:
print("Unique values in y_test_mapped:", np.unique(y_test_mapped))


Unique values in y_test_mapped: [ 0.    1.    2.    3.    3.7   4.    4.99  5.    6.    7.    8.    9.
 10.   11.   12.   13.   14.   15.   19.   20.   21.   22.   23.   24.
 25.   27.   29.   31.   33.   34.   35.   36.  ]


## Hyperparameter Tuning using bayesian optimisaton 
#### Modelling the loss function between predicted EC class and the true one using a gaussian process