In [9]:
# Dependencies

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from keras.utils import to_categorical


import transformers
from transformers import AutoModel, AutoTokenizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

In [10]:
class TransformerModel(nn.Module):
    def __init__(self, transformer, hidden_dim=16, n_classes=12):
        super(TransformerModel, self).__init__()
        self.transformer = transformer
        self.hidden_dim = hidden_dim
        self.n_classes = n_classes

        self.classifier = nn.Linear(transformer.config.hidden_size, n_classes)
    
    def forward(self, input_word_ids):
        transformer_output = self.transformer(input_word_ids)

        cls_token = transformer_output.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_token)
        probabilities = torch.nn.functional.softmax(logits, dim=1)

        return probabilities


In [11]:
def regular_encode(texts, tokenizer, maxlen=16):

    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [12]:
EPOCHS = 10
BATCH_SIZE = 10
MAX_LEN = 16

MODEL = 'bert-base-uncased'
transformer_layer = AutoModel.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)



In [14]:
df = pd.read_csv("../Data/processedData.csv")

label_encoder = LabelEncoder()

X_train, X_test, y_train, y_test = train_test_split(df[['processedMessage','fingers','tail']],
                                                    df['species_group'], random_state=33, test_size=0.2)

label_encoder = LabelEncoder()
label_encoder.fit(df['species_group'])

y_train = to_categorical(label_encoder.transform(y_train),num_classes=12)
y_test = to_categorical(label_encoder.transform(y_test),num_classes=12)

x_train_msg = regular_encode(list(X_train['processedMessage']), tokenizer, maxlen=16)
x_test_msg = regular_encode(list(X_test['processedMessage']), tokenizer, maxlen=16)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [15]:
class MessageDataset(Dataset):
    
    def __init__(self, messages, labels):
        self.messages = messages
        self.labels = labels

    def __len__(self):
        return len(self.messages)
    
    def __getitem__(self, index):
        message = torch.tensor(self.messages[index], dtype=torch.long)
        label = torch.tensor(self.labels[index], dtype=torch.float32)
        return message, label
    
msg_train_dataset = MessageDataset(x_train_msg, y_train)
msg_test_dataset = MessageDataset(x_test_msg, y_test)

train_loader = DataLoader(msg_train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(msg_test_dataset, batch_size=32, shuffle=False)

In [16]:
model = TransformerModel(transformer_layer)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        messages, labels = batch

        optimizer.zero_grad()
        outputs = model(messages)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1, Loss: 2.363334399003249
Epoch 2, Loss: 2.1423667210799
Epoch 3, Loss: 2.1130019976542544
Epoch 4, Loss: 2.1100034346947303
Epoch 5, Loss: 2.1178114230816183
Epoch 6, Loss: 2.087646612754235
Epoch 7, Loss: 2.078140744796166
Epoch 8, Loss: 2.0767063727745643
Epoch 9, Loss: 2.1023700604071984
Epoch 10, Loss: 2.032600916348971


In [17]:
all_probs = []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        messages, labels = batch

        probabilities = model(messages)
        all_probs.append(probabilities)

all_probs = torch.cat(all_probs, dim=0)

all_probs

tensor([[1.5057e-04, 1.4743e-04, 2.1722e-04,  ..., 8.2423e-05, 9.8102e-05,
         5.5481e-04],
        [1.9425e-04, 6.8821e-04, 4.1634e-04,  ..., 2.4839e-03, 9.8222e-05,
         8.0021e-04],
        [3.9960e-05, 8.7018e-05, 1.0069e-04,  ..., 9.9786e-01, 7.3613e-05,
         1.3256e-03],
        ...,
        [1.0109e-04, 9.9866e-01, 1.2894e-04,  ..., 1.2206e-04, 1.1233e-04,
         1.7141e-04],
        [2.0625e-04, 9.6805e-05, 4.5056e-04,  ..., 1.2190e-04, 1.4898e-04,
         3.8465e-04],
        [4.2544e-05, 9.5662e-05, 1.1015e-04,  ..., 9.9726e-01, 7.6879e-05,
         1.8229e-03]])

In [18]:
from sklearn.metrics import classification_report

y_pred = np.argmax(np.array(all_probs), axis=1)
y_test_labels = np.argmax(np.array(y_test), axis=1)

print(classification_report(np.array(y_pred), y_test_labels))

              precision    recall  f1-score   support

           0       0.12      1.00      0.22         1
           1       0.67      0.55      0.60        11
           2       0.00      0.00      0.00         0
           3       0.50      0.40      0.44        10
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       1.00      0.62      0.77        24
           7       1.00      0.71      0.83        17
           8       0.29      0.50      0.36         4
           9       1.00      0.68      0.81        22
          10       0.50      0.36      0.42        11
          11       0.00      0.00      0.00         0

    accuracy                           0.59       100
   macro avg       0.42      0.40      0.37       100
weighted avg       0.82      0.59      0.68       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
# Finger probability array

from sklearn.naive_bayes import GaussianNB

y_train_class = np.argmax(y_train,axis=1)
y_test_class = np.argmax(y_test,axis=1)

gnb_fingers = GaussianNB()
gnb_fingers.fit(X_train['fingers'].values.reshape(-1, 1),y_train_class)

y_pred_fingers = gnb_fingers.predict_proba(X_test['fingers'].values.reshape(-1, 1))
y_pred_fingers

array([[1.28688701e-01, 2.10309570e-02, 3.82114380e-34, ...,
        2.03014076e-01, 1.16654860e-01, 6.29651801e-02],
       [1.51433910e-01, 1.24724799e-01, 4.06273374e-16, ...,
        1.20660337e-01, 7.93765239e-02, 2.65726190e-02],
       [1.28688701e-01, 2.10309570e-02, 3.82114380e-34, ...,
        2.03014076e-01, 1.16654860e-01, 6.29651801e-02],
       ...,
       [3.08284066e-02, 2.92249699e-01, 5.20175990e-01, ...,
        1.19237968e-03, 4.69583723e-03, 9.93042492e-04],
       [7.00135816e-02, 2.23223595e-01, 3.35191456e-05, ...,
        1.62062653e-02, 2.02512148e-02, 4.96073193e-03],
       [1.51433910e-01, 1.24724799e-01, 4.06273374e-16, ...,
        1.20660337e-01, 7.93765239e-02, 2.65726190e-02]])

In [20]:
# Tail probability array

from sklearn.naive_bayes import MultinomialNB

label_encoder = LabelEncoder()
label_encoder.fit(df['tail'])

X_train_tail = (label_encoder.transform(X_train['tail'])).reshape(-1,1)
X_test_tail = label_encoder.transform(X_test['tail']).reshape(-1,1)

mnb_tail = MultinomialNB()
mnb_tail.fit(X_train_tail, y_train_class)

y_pred_tail = mnb_tail.predict_proba(X_test_tail)

y_pred_tail

array([[0.1   , 0.115 , 0.0375, ..., 0.1125, 0.09  , 0.1075],
       [0.1   , 0.115 , 0.0375, ..., 0.1125, 0.09  , 0.1075],
       [0.1   , 0.115 , 0.0375, ..., 0.1125, 0.09  , 0.1075],
       ...,
       [0.1   , 0.115 , 0.0375, ..., 0.1125, 0.09  , 0.1075],
       [0.1   , 0.115 , 0.0375, ..., 0.1125, 0.09  , 0.1075],
       [0.1   , 0.115 , 0.0375, ..., 0.1125, 0.09  , 0.1075]])

In [21]:
y_pred_msg_log = np.log(all_probs)
y_pred_fingers_log = np.log(y_pred_fingers)
y_pred_tail_log = np.log(y_pred_tail)

class_counts = df['species_group'].value_counts()
total_samples = len(df)

class_probabilities = np.array(class_counts / total_samples)

final_probabilities = y_pred_msg_log + y_pred_fingers_log + y_pred_tail_log - np.log(class_probabilities)

y_pred_final_classes = np.argmax(final_probabilities, axis=1)
print(classification_report(y_test_labels,y_pred_final_classes))

              precision    recall  f1-score   support

           0       1.00      0.12      0.22         8
           1       0.60      0.67      0.63         9
           2       0.00      0.00      0.00         5
           3       0.60      0.38      0.46         8
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         2
           6       0.56      1.00      0.71        15
           7       0.75      1.00      0.86        12
           8       0.40      0.29      0.33         7
           9       0.71      1.00      0.83        15
          10       0.33      0.50      0.40         8
          11       0.33      0.12      0.18         8

    accuracy                           0.59       100
   macro avg       0.44      0.42      0.39       100
weighted avg       0.54      0.59      0.52       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
