In [166]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [167]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [168]:
df = pd.read_csv("most_frequent_severity_with_def_text.csv")
severity_to_int = {"Low": 0, "Medium": 1, "High": 2}
df['Most_Frequent_Severity'] = df['Most_Frequent_Severity'].map(severity_to_int)
print(df.head())
new_df = df[['def_text', 'Most_Frequent_Severity']].copy()
new_df.head()

   PscInspectionId  deficiency_code  Most_Frequent_Severity  \
0          1622854            11119                       1   
1          1623161            18312                       2   
2          1623383             1124                       0   
3          1623387             3199                       1   
4          1623424            15102                       0   

                                            def_text  
0  PscInspectionId: 1622854\n\nDeficiency/Finding...  
1  PscInspectionId: 1623161\n\nDeficiency/Finding...  
2  PscInspectionId: 1623383\n\nDeficiency/Finding...  
3  PscInspectionId: 1623387\n\nDeficiency/Finding...  
4  PscInspectionId: 1623424\n\nDeficiency/Finding...  


Unnamed: 0,def_text,Most_Frequent_Severity
0,PscInspectionId: 1622854\n\nDeficiency/Finding...,1
1,PscInspectionId: 1623161\n\nDeficiency/Finding...,2
2,PscInspectionId: 1623383\n\nDeficiency/Finding...,0
3,PscInspectionId: 1623387\n\nDeficiency/Finding...,1
4,PscInspectionId: 1623424\n\nDeficiency/Finding...,0


In [169]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [170]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.def_text
        self.targets = self.data.Most_Frequent_Severity
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        self.targets = label_encoder.fit_transform(self.targets)

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [171]:
# Creating the dataset and dataloader for the neural network

train_size = 0.5
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (4402, 2)
TRAIN Dataset: (2201, 2)
TEST Dataset: (2201, 2)


In [172]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [173]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 3)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [174]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

In [175]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [176]:
import warnings
warnings.filterwarnings("ignore")
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}, NUMBER: {_}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [177]:
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.9273450970649719, NUMBER: 0
Epoch: 0, Loss:  1.521700382232666, NUMBER: 5
Epoch: 0, Loss:  0.9547253847122192, NUMBER: 10
Epoch: 0, Loss:  1.2100286483764648, NUMBER: 15
Epoch: 0, Loss:  1.1509758234024048, NUMBER: 20
Epoch: 0, Loss:  1.102487325668335, NUMBER: 25
Epoch: 0, Loss:  1.2332154512405396, NUMBER: 30
Epoch: 0, Loss:  1.4379403591156006, NUMBER: 35
Epoch: 0, Loss:  1.1783998012542725, NUMBER: 40
Epoch: 0, Loss:  1.021070957183838, NUMBER: 45
Epoch: 0, Loss:  1.152592420578003, NUMBER: 50
Epoch: 0, Loss:  1.1175484657287598, NUMBER: 55
Epoch: 0, Loss:  1.1315207481384277, NUMBER: 60
Epoch: 0, Loss:  1.226369857788086, NUMBER: 65
Epoch: 0, Loss:  1.0474326610565186, NUMBER: 70
Epoch: 0, Loss:  1.1256535053253174, NUMBER: 75
Epoch: 0, Loss:  1.0728332996368408, NUMBER: 80
Epoch: 0, Loss:  1.1453204154968262, NUMBER: 85
Epoch: 0, Loss:  1.087594747543335, NUMBER: 90
Epoch: 0, Loss:  1.1227164268493652, NUMBER: 95
Epoch: 0, Loss:  0.9865451455116272, NUMBER: 100

In [200]:
torch.save(model.state_dict(), "model.pth")

In [195]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            print('Number: ', _)
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            # print(fin_outputs)
            for i in range(len(fin_outputs)):
                fin_outputs[i] = np.array(fin_outputs[i]).argmax()
            # print(fin_outputs)
            # print(fin_targets)
    return fin_outputs, fin_targets

In [None]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    # outputs = np.array(outputs).round()
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Number:  0
Number:  1
Number:  2
Number:  3
Number:  4
Number:  5
Number:  6
Number:  7
Number:  8
Number:  9


KeyboardInterrupt: 

<a id='section07'></a>
### Saving the Trained Model Artifacts for inference

This is the final step in the process of fine tuning the model.

The model and its vocabulary are saved locally. These files are then used in the future to make inference on new inputs of news headlines.

Please remember that a trained neural network is only useful when used in actual inference after its training.

In the lifecycle of an ML projects this is only half the job done. We will leave the inference of these models for some other day.