# Train a deep neural net to predict EC number 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

In [2]:
## enzyme class labels

# Top-level EC class number -> enzyme class name
class_dict = {
    1: "Oxidoreductase",
    2: "Transferase",
    3: "Hydrolase",
    4: "Lyase",
    5: "Isomerase",  # previously misspelled "Isomerage"
    6: "Ligase",
    7: "Translocase"
}

def map_to_class(value):
    """Turn an EC number into a "<main class name> <subclass>" label.

    Args:
        value: EC number whose first two dot-separated fields are the main
            class and the subclass, e.g. the float ``3.4`` or the string
            ``"3.4.21.5"``. It is converted with ``str`` before splitting.

    Returns:
        The main class name from ``class_dict`` followed by a space and the
        second field, e.g. ``"Hydrolase 4"``.

    Raises:
        KeyError: if the main class number is not a key of ``class_dict``.
        IndexError: if ``value`` has no second ``.``-separated field.
        ValueError: if the first field is not an integer.
    """
    # NOTE(review): if the column is parsed as float, a subclass such as
    # "1.10" presumably collapses to 1.1 and becomes indistinguishable from
    # "1.1" -- confirm how the CSV stores EC numbers.
    parts = str(value).split('.')
    main_class = class_dict[int(parts[0])]
    # Join the main class name with the second field
    return main_class + " " + parts[1]



In [3]:
# Load the prepared table; every column except the identifier and the label
# ends up in the feature matrix X.
df = pd.read_csv("all_prepareddata_for_EzyPredict.csv")
columns_to_drop = ['UniprotID', 'EC number']
y = df['EC number']
X = df.drop(columns=columns_to_drop)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Replace the raw EC numbers in BOTH splits with readable class labels
y_train = list(map(map_to_class, y_train))
y_test = list(map(map_to_class, y_test))

In [4]:
## encode labels

# Fit a single encoder on the labels of both splits so that train and test
# share one label -> integer mapping
combined_y = np.concatenate([y_train, y_test])
encoder = LabelEncoder()
combined_y_mapped = encoder.fit_transform(combined_y)

# The first len(y_train) codes belong to the training split, the rest to test
n_train = len(y_train)
y_train_mapped, y_test_mapped = combined_y_mapped[:n_train], combined_y_mapped[n_train:]

# Integer class indices as int64 tensors
y_train_tensor = torch.tensor(y_train_mapped, dtype=torch.int64)
y_test_tensor = torch.tensor(y_test_mapped, dtype=torch.int64)

In [5]:
# Feature frames -> float32 tensors (via their underlying numpy arrays)
X_train_tensor, X_test_tensor = (
    torch.tensor(frame.values, dtype=torch.float32)
    for frame in (X_train, X_test)
)



In [6]:
## Count the distinct encoded labels -- this is the size of the network's output layer
n_classes = np.unique(combined_y_mapped).size
print(f"Number of classes: {n_classes}")


Number of classes: 45


## Make a Neural Network

In [7]:
## Define one neural network that can take input and be used again and again

class SimpleNN(nn.Module):
    """Fully connected classifier with a single ReLU hidden layer.

    Args:
        input_dim: Number of input features (size of the last dimension of
            the tensors passed to ``forward``).
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output logits, one per class. When omitted, the
            notebook-level ``n_classes`` is looked up at construction time.
    """

    def __init__(self, input_dim, hidden_dim=100, output_dim=None):
        super().__init__()
        if output_dim is None:
            # Resolve the global lazily: a default of `n_classes` in the
            # signature would be frozen when this cell runs and go stale if
            # the label set is recomputed afterwards.
            output_dim = n_classes
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw class logits for ``x`` (no softmax is applied)."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)


In [8]:
# Seed PyTorch's RNG so the weight initialisation (and therefore the reported
# loss/accuracy) is the same on every Restart & Run All
torch.manual_seed(42)
model = SimpleNN(X_train.shape[1], hidden_dim=100, output_dim=n_classes)


In [9]:
# CrossEntropyLoss consumes the raw logits returned by the model together with
# the integer class indices
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Full-batch training: every epoch is one gradient step on the whole training set
num_epochs = 500
for epoch in range(num_epochs):
    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [1/500], Loss: 3.8094
Epoch [2/500], Loss: 3.7494
Epoch [3/500], Loss: 3.6987
Epoch [4/500], Loss: 3.6511
Epoch [5/500], Loss: 3.6053
Epoch [6/500], Loss: 3.5606
Epoch [7/500], Loss: 3.5157
Epoch [8/500], Loss: 3.4713
Epoch [9/500], Loss: 3.4290
Epoch [10/500], Loss: 3.3898
Epoch [11/500], Loss: 3.3543
Epoch [12/500], Loss: 3.3225
Epoch [13/500], Loss: 3.2938
Epoch [14/500], Loss: 3.2677
Epoch [15/500], Loss: 3.2435
Epoch [16/500], Loss: 3.2210
Epoch [17/500], Loss: 3.2002
Epoch [18/500], Loss: 3.1814
Epoch [19/500], Loss: 3.1644
Epoch [20/500], Loss: 3.1495
Epoch [21/500], Loss: 3.1368
Epoch [22/500], Loss: 3.1253
Epoch [23/500], Loss: 3.1140
Epoch [24/500], Loss: 3.1022
Epoch [25/500], Loss: 3.0899
Epoch [26/500], Loss: 3.0769
Epoch [27/500], Loss: 3.0634
Epoch [28/500], Loss: 3.0495
Epoch [29/500], Loss: 3.0355
Epoch [30/500], Loss: 3.0218
Epoch [31/500], Loss: 3.0085
Epoch [32/500], Loss: 2.9955
Epoch [33/500], Loss: 2.9829
Epoch [34/500], Loss: 2.9704
Epoch [35/500], Loss: 2

Epoch [281/500], Loss: 0.2153
Epoch [282/500], Loss: 0.2133
Epoch [283/500], Loss: 0.2113
Epoch [284/500], Loss: 0.2093
Epoch [285/500], Loss: 0.2074
Epoch [286/500], Loss: 0.2055
Epoch [287/500], Loss: 0.2037
Epoch [288/500], Loss: 0.2018
Epoch [289/500], Loss: 0.2000
Epoch [290/500], Loss: 0.1983
Epoch [291/500], Loss: 0.1965
Epoch [292/500], Loss: 0.1948
Epoch [293/500], Loss: 0.1931
Epoch [294/500], Loss: 0.1914
Epoch [295/500], Loss: 0.1898
Epoch [296/500], Loss: 0.1882
Epoch [297/500], Loss: 0.1866
Epoch [298/500], Loss: 0.1850
Epoch [299/500], Loss: 0.1835
Epoch [300/500], Loss: 0.1820
Epoch [301/500], Loss: 0.1805
Epoch [302/500], Loss: 0.1791
Epoch [303/500], Loss: 0.1776
Epoch [304/500], Loss: 0.1762
Epoch [305/500], Loss: 0.1748
Epoch [306/500], Loss: 0.1734
Epoch [307/500], Loss: 0.1721
Epoch [308/500], Loss: 0.1707
Epoch [309/500], Loss: 0.1694
Epoch [310/500], Loss: 0.1681
Epoch [311/500], Loss: 0.1669
Epoch [312/500], Loss: 0.1656
Epoch [313/500], Loss: 0.1644
Epoch [314

In [10]:
# Model evaluation on the held-out test split
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    # The index of the largest logit in each row is the predicted class
    predicted = torch.max(test_outputs, dim=1).indices

total = y_test_tensor.size(0)
correct = (predicted == y_test_tensor).sum().item()
accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")


Test Accuracy: 76.72%


In [None]:
# Display the predicted class indices computed by the evaluation cell
predicted

## I couldn't get above about 77% accuracy with the simple model, so now I'll make a more complicated NN


## Hyperparameter tuning using Bayesian optimisation
#### Minimizing the cross-entropy loss between the predicted EC class and the real EC class

In [13]:
#!pip install bayesian-optimization
from bayes_opt import BayesianOptimization


def optimize_nn(lr_log, hidden_dim, num_epochs=500):
    """Objective for the Bayesian optimiser: train a SimpleNN and score it.

    A fresh ``SimpleNN`` is trained with Adam on the notebook-level
    ``X_train_tensor`` / ``y_train_tensor`` using full-batch gradient steps.

    Args:
        lr_log: Base-10 logarithm of the learning rate (``lr = 10**lr_log``).
        hidden_dim: Hidden-layer width. The optimiser proposes floats, so the
            value is rounded to the nearest integer.
        num_epochs: Number of full-batch training steps (default 500).

    Returns:
        The negated cross-entropy training loss of the last step as a float,
        so that maximising the return value minimises the loss.

    Raises:
        ValueError: if ``num_epochs`` is less than 1.
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    lr = 10**lr_log
    # Round (rather than truncate) so the width trained here matches the
    # `int(round(...))` used when the final model is built from the best
    # parameters.
    width = int(round(hidden_dim))

    model = SimpleNN(X_train_tensor.shape[1], width, n_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # The mode does not change inside the loop, so set it once up front.
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

    # NOTE(review): the score is the loss on the training data itself, so it
    # does not measure generalisation -- consider scoring a validation split.
    return -loss.item()


In [21]:
# Search space handed to the Bayesian optimiser as (lower, upper) bounds
pbounds = dict(
    lr_log=(-10, 0),       # learning rate is 10**lr_log, i.e. 1e-10 .. 1
    hidden_dim=(20, 250),  # hidden-layer width (float; converted to int by the objective)
)

In [None]:
# Maximise optimize_nn (the negated training loss) over the pbounds search space
optimizer = BayesianOptimization(f=optimize_nn, pbounds=pbounds, random_state=42)

# 20 initial exploration points followed by 100 optimisation iterations
optimizer.maximize(init_points=20, n_iter=100)

|   iter    |  target   | hidden... |  lr_log   |
-------------------------------------------------
| [0m1        [0m | [0m-3.248   [0m | [0m106.1    [0m | [0m-0.4929  [0m |
| [95m2        [0m | [95m-1.701   [0m | [95m188.4    [0m | [95m-4.013   [0m |
| [0m3        [0m | [0m-3.82    [0m | [0m55.88    [0m | [0m-8.44    [0m |
| [95m4        [0m | [95m-0.07338 [0m | [95m33.36    [0m | [95m-1.338   [0m |
| [95m5        [0m | [95m-0.06249 [0m | [95m158.3    [0m | [95m-2.919   [0m |
| [0m6        [0m | [0m-3.248   [0m | [0m24.73    [0m | [0m-0.3009  [0m |
| [0m7        [0m | [0m-3.793   [0m | [0m211.5    [0m | [0m-7.877   [0m |
| [0m8        [0m | [0m-3.83    [0m | [0m61.82    [0m | [0m-8.166   [0m |
| [0m9        [0m | [0m-3.322   [0m | [0m89.98    [0m | [0m-4.752   [0m |
| [0m10       [0m | [0m-3.797   [0m | [0m119.3    [0m | [0m-7.088   [0m |
| [0m11       [0m | [0m-3.794   [0m | [0m160.7    [0m | [0m-8

In [None]:

# Every evaluated trial; each entry is read via its 'params' and 'target' keys when plotting
all_results = optimizer.res


In [None]:
# Unpack every evaluated trial into the three plot axes
trial_params = [trial['params'] for trial in all_results]
hidden_dims = [params['hidden_dim'] for params in trial_params]
lr_logs = [params['lr_log'] for params in trial_params]
targets = [trial['target'] for trial in all_results]

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# Triangulated surface through the sampled (hidden_dim, lr_log, target) points
surf = ax.plot_trisurf(hidden_dims, lr_logs, targets, cmap='viridis')
fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5)

ax.set_title('Bayesian Optimization Results')
ax.set_xlabel('Hidden Dimension Size')
ax.set_ylabel('Log Learning Rate')
ax.set_zlabel('Negative Loss (Performance)')

plt.show()


## Train model with optimized hyperparameters

In [None]:
# Show the optimiser's best trial; its 'params' entry supplies the final model's hyperparameters
print(optimizer.max)


In [None]:
# Read the best trial once. The Adam optimiser below gets its own name so that
# `optimizer` keeps referring to the BayesianOptimization object; otherwise
# this cell (and the cells reading `optimizer.res` / `optimizer.max`) would
# fail with an AttributeError when re-run.
best_params = optimizer.max['params']
optimal_hidden_dim = int(round(best_params['hidden_dim']))
optimal_lr = 10**best_params['lr_log']

model = SimpleNN(X_train_tensor.shape[1], optimal_hidden_dim, n_classes)
criterion = nn.CrossEntropyLoss()
final_optimizer = optim.Adam(model.parameters(), lr=optimal_lr)

# Training loop (full batch: one gradient step per epoch)
num_epochs = 500
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    # Backward pass and optimization
    final_optimizer.zero_grad()
    loss.backward()
    final_optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

   

In [None]:
# Score the tuned model on the held-out test split
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    # Predicted class = position of the largest logit in each row
    predicted = torch.max(test_outputs, dim=1).indices
    n_hits = (predicted == y_test_tensor).sum().item()

total = y_test_tensor.size(0)
correct = n_hits
accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")