In [34]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [35]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [36]:
df = pd.read_csv("dataset/psc_severity_test.csv")
new_df = df[['def_text']].copy()
new_df.head()

Unnamed: 0,def_text
0,PscInspectionId: 1802364\n\nDeficiency/Finding...
1,PscInspectionId: 1736765\n\nDeficiency/Finding...
2,PscInspectionId: 1787907\n\nDeficiency/Finding...
3,PscInspectionId: 1691176\n\nDeficiency/Finding...
4,PscInspectionId: 1712454\n\nDeficiency/Finding...


In [37]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [38]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 3)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [39]:

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.def_text
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }

In [40]:
test_dataset=new_df
print("TEST Dataset: {}".format(test_dataset.shape))
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

TEST Dataset: (1101, 1)


In [41]:
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

testing_loader = DataLoader(testing_set, **test_params)

In [42]:
model.load_state_dict(torch.load("model.pth"))
model.eval()

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [43]:
test_df = pd.read_csv('dataset/psc_severity_test.csv')
test_df

Unnamed: 0,PscInspectionId,deficiency_code,def_text,InspectionDate,VesselId,PscAuthorityId,PortId,VesselGroup,age
0,1802364,14402,PscInspectionId: 1802364\n\nDeficiency/Finding...,2024-04-05,293691,9,936,Dry Bulk,9.593429
1,1736765,10199,PscInspectionId: 1736765\n\nDeficiency/Finding...,2023-08-17,272075,9,5237,Dry Bulk,25.210130
2,1787907,18204,PscInspectionId: 1787907\n\nDeficiency/Finding...,2024-02-15,302667,1,953,Dry Bulk,5.793292
3,1691176,14108,PscInspectionId: 1691176\n\nDeficiency/Finding...,2023-03-13,288591,7,1439,Oil,12.446270
4,1712454,5109,PscInspectionId: 1712454\n\nDeficiency/Finding...,2023-05-26,290457,2,1366,Dry Bulk,11.731691
...,...,...,...,...,...,...,...,...,...
1096,1721780,1139,PscInspectionId: 1721780\n\nDeficiency/Finding...,2023-06-26,277654,7,1797,Chemical,18.732375
1097,1674478,14604,PscInspectionId: 1674478\n\nDeficiency/Finding...,2023-01-11,303135,9,3152,Dry Bulk,2.529774
1098,1778352,18203,PscInspectionId: 1778352\n\nDeficiency/Finding...,2024-01-10,293467,7,1459,General Cargo,14.275154
1099,1643351,10112,PscInspectionId: 1643351\n\nDeficiency/Finding...,2022-09-15,293373,9,2135,Dry Bulk,10.176591


In [44]:
def validation(epoch):
    model.eval()
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            print('Number: ', _)
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            # print(fin_outputs)
            for i in range(len(fin_outputs)):
                fin_outputs[i] = np.array(fin_outputs[i]).argmax()
            # print(fin_outputs)
            # print(fin_targets)
    return fin_outputs

In [45]:
for epoch in range(EPOCHS):
    outputs = validation(epoch)
print(outputs)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Number:  0
Number:  1
Number:  2
Number:  3
Number:  4
Number:  5
Number:  6
Number:  7
Number:  8
Number:  9
Number:  10
Number:  11
Number:  12
Number:  13
Number:  14
Number:  15
Number:  16
Number:  17
Number:  18
Number:  19
Number:  20
Number:  21
Number:  22
Number:  23
Number:  24
Number:  25
Number:  26
Number:  27
Number:  28
Number:  29
Number:  30
Number:  31
Number:  32
Number:  33
Number:  34
Number:  35
Number:  36
Number:  37
Number:  38
Number:  39
Number:  40
Number:  41
Number:  42
Number:  43
Number:  44
Number:  45
Number:  46
Number:  47
Number:  48
Number:  49
Number:  50
Number:  51
Number:  52
Number:  53
Number:  54
Number:  55
Number:  56
Number:  57
Number:  58
Number:  59
Number:  60
Number:  61
Number:  62
Number:  63
Number:  64
Number:  65
Number:  66
Number:  67
Number:  68
Number:  69
Number:  70
Number:  71
Number:  72
Number:  73
Number:  74
Number:  75
Number:  76
Number:  77
Number:  78
Number:  79
Number:  80
Number:  81
Number:  82
Number:  83
Nu

In [46]:
test_df['predicted_severity'] = outputs

severity_to_int = {0 : "Low", 1 : "Medium", 2 : "High"}
test_df['predicted_severity'] = test_df['predicted_severity'].map(severity_to_int)

print(test_df)

      PscInspectionId  deficiency_code  \
0             1802364            14402   
1             1736765            10199   
2             1787907            18204   
3             1691176            14108   
4             1712454             5109   
...               ...              ...   
1096          1721780             1139   
1097          1674478            14604   
1098          1778352            18203   
1099          1643351            10112   
1100          1737032             5107   

                                               def_text InspectionDate  \
0     PscInspectionId: 1802364\n\nDeficiency/Finding...     2024-04-05   
1     PscInspectionId: 1736765\n\nDeficiency/Finding...     2023-08-17   
2     PscInspectionId: 1787907\n\nDeficiency/Finding...     2024-02-15   
3     PscInspectionId: 1691176\n\nDeficiency/Finding...     2023-03-13   
4     PscInspectionId: 1712454\n\nDeficiency/Finding...     2023-05-26   
...                                                

In [47]:
display(test_df)

Unnamed: 0,PscInspectionId,deficiency_code,def_text,InspectionDate,VesselId,PscAuthorityId,PortId,VesselGroup,age,predicted_severity
0,1802364,14402,PscInspectionId: 1802364\n\nDeficiency/Finding...,2024-04-05,293691,9,936,Dry Bulk,9.593429,Low
1,1736765,10199,PscInspectionId: 1736765\n\nDeficiency/Finding...,2023-08-17,272075,9,5237,Dry Bulk,25.210130,Low
2,1787907,18204,PscInspectionId: 1787907\n\nDeficiency/Finding...,2024-02-15,302667,1,953,Dry Bulk,5.793292,Low
3,1691176,14108,PscInspectionId: 1691176\n\nDeficiency/Finding...,2023-03-13,288591,7,1439,Oil,12.446270,Low
4,1712454,5109,PscInspectionId: 1712454\n\nDeficiency/Finding...,2023-05-26,290457,2,1366,Dry Bulk,11.731691,Low
...,...,...,...,...,...,...,...,...,...,...
1096,1721780,1139,PscInspectionId: 1721780\n\nDeficiency/Finding...,2023-06-26,277654,7,1797,Chemical,18.732375,Low
1097,1674478,14604,PscInspectionId: 1674478\n\nDeficiency/Finding...,2023-01-11,303135,9,3152,Dry Bulk,2.529774,Low
1098,1778352,18203,PscInspectionId: 1778352\n\nDeficiency/Finding...,2024-01-10,293467,7,1459,General Cargo,14.275154,Low
1099,1643351,10112,PscInspectionId: 1643351\n\nDeficiency/Finding...,2022-09-15,293373,9,2135,Dry Bulk,10.176591,Low


In [48]:
list(test_df['predicted_severity']).count('Low')

1100

In [None]:
test_df.to_csv('dataset/thunderbuddies.csv')