In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

from transformers import BertPreTrainedModel

from transformers import BertModel
import numpy as np
from sklearn.utils import resample


In [None]:
# Load the bug-report dataset and keep only the two columns used below.
# The explicit .copy() makes `df` an independent frame instead of a selection
# of the raw frame, so the in-place rename and the column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning.
raw_df = pd.read_csv('priority_dataset2.csv', sep=',')
df = raw_df[['Priority', 'Short Description']].copy()
df.head()

Unnamed: 0,Priority,Short Description
0,P3,Include Afrikaans in the list of shipped langu...
1,P3,Include Afrikaans in the language options avai...
2,P3,There are no icons displayed in the address bo...
3,P3,Icons are not present in the search results of...
4,P3,Executing doHelpButton() does not provide assi...


In [None]:
# Switch to the generic column names that the rest of the notebook refers to.
column_renames = {'Priority': 'Category', 'Short Description': 'Message'}
df = df.rename(columns=column_renames)
df.head()

Unnamed: 0,Category,Message
0,P3,Include Afrikaans in the list of shipped langu...
1,P3,Include Afrikaans in the language options avai...
2,P3,There are no icons displayed in the address bo...
3,P3,Icons are not present in the search results of...
4,P3,Executing doHelpButton() does not provide assi...


In [None]:
# Remove surrounding whitespace so that e.g. 'P3 ' and 'P3' count as one class,
# then list the distinct classes that remain.
stripped_categories = df['Category'].str.strip()
df['Category'] = stripped_categories

print(stripped_categories.unique())

['P3' 'P4' 'P5' 'P1' 'P2']


In [None]:
# Force every message to a Python string so the tokenizer later receives text only.
# NOTE(review): missing values, if the CSV has any, would presumably become the
# literal string 'nan' here rather than being dropped - confirm against the data.
df['Message'] = df['Message'].astype(str)

In [None]:

df.groupby('Category').describe()


Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
P1,3336,3315,broken links in mozilla firefox,14
P2,3717,3714,Assertion failure: cx->bailExit,2
P3,5010,4988,The login page is unable to be accessed.,2
P4,3829,3732,Include TRANSITION_DOWNLOAD,5
P5,3612,3313,curved edges,13


In [None]:
# Give each distinct priority class an integer id, numbered in the order the
# classes are returned by unique().
possible_labels = df["Category"].unique()

label_dict = {class_name: class_id for class_id, class_name in enumerate(possible_labels)}
label_dict

{'P3': 0, 'P4': 1, 'P5': 2, 'P1': 3, 'P2': 4}

In [None]:
# Encode each priority string as its integer class id.
# Series.map performs a plain dictionary lookup and yields a numeric column.
# Series.replace on an object column only becomes numeric through implicit
# downcasting, which recent pandas versions deprecate; an object-typed label
# column would later break torch.tensor(df.label.values).
# Every category is a key of label_dict (it was built from this very column),
# so no value is left unmapped.
df['label'] = df['Category'].map(label_dict)
df.head()

Unnamed: 0,Category,Message,label
0,P3,Include Afrikaans in the list of shipped langu...,0
1,P3,Include Afrikaans in the language options avai...,0
2,P3,There are no icons displayed in the address bo...,0
3,P3,Icons are not present in the search results of...,0
4,P3,Executing doHelpButton() does not provide assi...,0


In [None]:
# Resampling strategy for the training split, read by the split cell below:
#   "-"  -> draw every class up to the largest class count (with replacement)
#   "u"  -> draw every class down to the smallest class count
#   "o"  -> oversample with sklearn.utils.resample after the split is flagged
# Any other value (such as "no") leaves the training split untouched.
sampling = "no"

In [None]:
from sklearn.model_selection import train_test_split


def _resample_train_indices(indices, labels, per_class, replace):
    """Draw `per_class` training rows from every class.

    Args:
        indices: row labels of `df` that belong to the training split.
        labels: class id of each entry in `indices`.
        per_class: number of rows to draw for each class.
        replace: sample with replacement (required when `per_class` is larger
            than a class).

    Returns:
        Tuple `(indices, labels)` of numpy arrays describing the resampled split.
    """
    train_df = pd.DataFrame({'Index': indices, 'Label': labels})
    # Classes are visited in value_counts() order and each draw uses the same
    # fixed seed, so the result is reproducible.
    sampled_parts = [
        train_df[train_df['Label'] == label].sample(per_class, replace=replace, random_state=42)
        for label in train_df['Label'].value_counts().index
    ]
    resampled_df = pd.concat(sampled_parts)
    return resampled_df['Index'].values, resampled_df['Label'].values


# Stratified 85/15 split performed on row labels, so that `df` itself can be
# flagged afterwards.
X_train, X_val, y_train, y_val = train_test_split(df.index.values,
                                                  df.label.values,
                                                  test_size=0.15,
                                                  random_state=42,
                                                  stratify=df.label.values)

if sampling == "-":
    # Oversampling on indices: every class is drawn (with replacement) up to
    # the size of the largest class.
    max_class_count = pd.Series(y_train).value_counts().max()
    print(max_class_count)
    X_train, y_train = _resample_train_indices(X_train, y_train, max_class_count, replace=True)
elif sampling == "u":
    # Undersampling on indices: every class is cut down (without replacement)
    # to the size of the smallest class.
    min_class_count = pd.Series(y_train).value_counts().min()
    X_train, y_train = _resample_train_indices(X_train, y_train, min_class_count, replace=False)

df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

if sampling == "-":
    # Flagging rows by label cannot represent a row that was drawn several
    # times: repeated indices collapse into a single 'train' flag. Materialise
    # the drawn rows (df.loc repeats a row for every repeated label) so the
    # oversampling actually reaches the training set. Rows that were never
    # drawn are left out, exactly as they would be by the data_type filter.
    df = pd.concat([df.loc[X_train], df.loc[X_val]]).reset_index(drop=True)

if sampling == "o":
    train_df = df[df['data_type'] == 'train']
    val_df = df[df['data_type'] == 'val']

    # Size of the largest class in the training split
    class_counts = train_df['label'].value_counts()
    max_class_count = max(class_counts)
    print(class_counts, max_class_count)
    print("max", max_class_count)

    # Bootstrap every class up to the size of the largest one
    oversampled_dfs = [
        resample(train_df[train_df['label'] == label],
                 replace=True,               # sample with replacement
                 n_samples=max_class_count,  # to match majority class
                 random_state=42)
        for label in class_counts.index
    ]

    # Shuffle so that the classes are not stored as contiguous runs
    oversampled_train_df = pd.concat(oversampled_dfs).sample(frac=1, random_state=42)

    # Re-attach the untouched validation rows and renumber the index
    df = pd.concat([oversampled_train_df, val_df]).reset_index(drop=True)

df.groupby(['Message', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Category
Message,label,data_type,Unnamed: 3_level_1
nsDTDContext::Pop' needs to be rephrased.,2,train,1
!JS_HAS_XML_SUPPORT should not exclude JSOP_CALLPROP from the interpreter.,0,val,1
"""\nThe ""Application"" category in the Applications preferences panel is experiencing issues.",0,train,1
"""\nThe URI is altered by the newChannel function in FeedProtocolHandler.",1,train,1
""" Add a new query"" should be renamed",4,train,1
...,...,...,...
|failed in test_removeDataFromDomain.js on (NS_ERROR_FILE_NOT_FOUND) [nsICacheService.evictEntries] nsPrivateBrowsingService.js :: PBS_removeDataFromDomain :: line 296|,0,train,1
"|typeof (0, undef)| does not throw ReferenceError",0,train,1
و Many nsIContent::List implementations have excessive assertions,2,val,1
“Open All in Tabs” option does not open “Favorites Toolbar” folder,4,train,1


In [None]:
df.head()

Unnamed: 0,Category,Message,label,data_type
0,P3,Include Afrikaans in the list of shipped langu...,0,train
1,P3,Include Afrikaans in the language options avai...,0,train
2,P3,There are no icons displayed in the address bo...,0,train
3,P3,Icons are not present in the search results of...,0,val
4,P3,Executing doHelpButton() does not provide assi...,0,val


In [None]:
len(X_train)

16578

In [None]:
# Training rows only, summarised column by column for each class id.
is_train_row = df["data_type"] == "train"
td = df.loc[is_train_row]
td.groupby('label').describe()

Unnamed: 0_level_0,Category,Category,Category,Category,Message,Message,Message,Message,data_type,data_type,data_type,data_type
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,4258,1,P3,4258,4258,4242,"Include a ""Revert to Saved"" option in the scra...",2,4258,1,train,4258
1,3255,1,P4,3255,3255,3186,Transfer the gfxSparseBitSet code to xpcom.,4,3255,1,train,3255
2,3070,1,P5,3070,3070,2840,Unable to access Thunderbird.,12,3070,1,train,3070
3,2836,1,P1,2836,2836,2819,broken links in mozilla firefox,12,2836,1,train,2836
4,3159,1,P2,3159,3159,3156,unexpected horizontal line repaint issue in pu...,2,3159,1,train,3159


In [None]:
# Per-class training counts, ordered by class id so that w[i] is the number of
# training rows with label i. The weighted model indexes this list by class id,
# so the order must not depend on how value_counts happens to arrange its result.
w = td['label'].value_counts().sort_index().to_list()
# A class missing from the training split would silently shift every later weight.
assert len(w) == len(label_dict), "every class must appear in the training split"
print(w)

[4258, 3255, 3070, 2836, 3159]


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

# Every message is padded or truncated to exactly this many tokens.
MAX_SEQ_LENGTH = 256


def _encode_messages(messages):
    """Tokenize an iterable of strings into fixed-length PyTorch tensors.

    Padding and truncation are requested explicitly. The deprecated
    `pad_to_max_length` flag and the implicit truncation fallback produce the
    same tensors but emit warnings.

    Args:
        messages: iterable of text strings.

    Returns:
        The tokenizer's batch encoding; 'input_ids' and 'attention_mask' are
        tensors with MAX_SEQ_LENGTH columns.
    """
    return tokenizer.batch_encode_plus(
        list(messages),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_tensors='pt'
    )


# Filter each split once and reuse it for both the text and the labels.
train_rows = df[df.data_type == 'train']
val_rows = df[df.data_type == 'val']

encoded_data_train = _encode_messages(train_rows.Message.values)
encoded_data_val = _encode_messages(val_rows.Message.values)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Pretrained BERT encoder with a classification head sized to the number of
# priority classes in label_dict. Attention maps and hidden states are not
# requested, so the forward pass returns only the loss (when labels are given)
# and the logits.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

# Training batches are drawn in random order; validation keeps dataset order
# so predictions line up with the validation rows.
train_sampler = RandomSampler(dataset_train)
val_sampler = SequentialSampler(dataset_val)

dataloader_train = DataLoader(dataset_train, sampler=train_sampler, batch_size=batch_size)
dataloader_validation = DataLoader(dataset_val, sampler=val_sampler, batch_size=batch_size)

In [None]:
from transformers import get_linear_schedule_with_warmup

# torch.optim.AdamW replaces transformers.AdamW, which is deprecated and no
# longer shipped by recent transformers releases. weight_decay=0.0 keeps the
# default of the optimizer that was used before (PyTorch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)

epochs = 5

# Linear decay of the learning rate to zero over all optimisation steps, no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)




In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Weighted F1 score of the arg-max predictions.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: array of true class ids.

    Returns:
        The support-weighted F1 score computed by sklearn's f1_score.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

def accuracy_per_class(preds, labels, label_names=None):
    """Print how many samples of each true class were predicted correctly.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: array of true class ids.
        label_names: optional mapping from class name to class id used to
            label the report. Defaults to the notebook-level `label_dict`.

    Returns:
        None. The report is written to stdout.
    """
    if label_names is None:
        label_names = label_dict
    label_dict_inverse = {v: k for k, v in label_names.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    t_pred = 0
    t_true = 0

    # Only classes that occur among the true labels are reported, so the
    # per-class denominator is never zero.
    for label in np.unique(labels_flat):
        of_this_class = labels_flat == label
        n_true = int(of_this_class.sum())
        n_correct = int((preds_flat[of_this_class] == label).sum())
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {n_correct}/{n_true}')
        print(f'Accuracy %: {(n_correct/n_true)*100}%\n')
        t_pred += n_correct
        t_true += n_true
    print(f"Total Accuracy: {t_pred}/{t_true}" )
    if t_true == 0:
        # No samples at all: report that instead of dividing by zero.
        print('Total Accuracy: n/a (no samples)\n')
    else:
        print(f'Total Accuracy: {(t_pred/t_true)*100}%\n')

In [None]:
import random

# Seed every random number generator used from here on: Python's `random`,
# NumPy, and PyTorch on CPU and on all CUDA devices.
# NOTE(review): `model` was already created in an earlier cell, so the random
# initialisation of its classification head happened before this seed was set.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model's parameters there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [None]:
from torch.nn import CrossEntropyLoss


def evaluate(dataloader_val):
    """Run the notebook-level `model` over a dataloader without gradient tracking.

    Args:
        dataloader_val: iterable of batches; the first three items of each
            batch are used as input ids, attention mask and labels.

    Returns:
        Tuple `(mean_loss, logits, labels)`: the loss averaged over batches,
        and the logits and true labels of all batches as numpy arrays
        concatenated along the first axis.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for raw_batch in dataloader_val:
        on_device = [tensor.to(device) for tensor in raw_batch]
        model_kwargs = dict(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=on_device[2])

        with torch.no_grad():
            result = model(**model_kwargs)

        # With labels supplied the model returns the loss first, then the logits.
        batch_loss, batch_logits = result[0], result[1]
        running_loss += batch_loss.item()

        logit_chunks.append(batch_logits.detach().cpu().numpy())
        label_chunks.append(model_kwargs['labels'].cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)
    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return mean_loss, all_logits, all_labels

# Fine-tune for `epochs` passes over the training data. After every epoch a
# checkpoint is written and the validation loss and weighted F1 are reported.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        # Read the scalar once; it is used for both the running total and the progress bar.
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # The loss is already averaged over the samples of the batch. `batch` is
        # the tuple (input_ids, attention_mask, labels), so dividing by
        # len(batch) would always divide by 3 and misreport the loss.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/5526 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.1244334914744132
Validation loss: 0.9036192150856177
F1 Score (Weighted): 0.5560266146596105


Epoch 2:   0%|          | 0/5526 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.778648940730625
Validation loss: 0.805639819563574
F1 Score (Weighted): 0.684797654292


Epoch 3:   0%|          | 0/5526 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.5988542070804693
Validation loss: 1.0191875936782215
F1 Score (Weighted): 0.689251584271966


Epoch 4:   0%|          | 0/5526 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.5101379937613364
Validation loss: 1.306250782900026
F1 Score (Weighted): 0.703126640160794


Epoch 5:   0%|          | 0/5526 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.4093695129971681
Validation loss: 1.488117015238589
F1 Score (Weighted): 0.7170262021578827


In [None]:

# Read the weights saved after the final epoch (onto the CPU first), rebuild
# the same architecture, restore the weights and move the model to `device`.
checkpoint_state = torch.load('finetuned_BERT_epoch_5.model', map_location=torch.device('cpu'))

model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.load_state_dict(checkpoint_state)
model.to(device)

# Score the restored model on the validation split.
_, predictions, true_vals = evaluate(dataloader_validation)
weighted_f1 = f1_score_func(predictions, true_vals)
print(f"F1 score: {weighted_f1}")
accuracy_per_class(predictions, true_vals)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1 score: 0.7170262021578827
Class: P3
Accuracy: 518/752
Accuracy %: 68.88297872340425%

Class: P4
Accuracy: 515/574
Accuracy %: 89.72125435540069%

Class: P5
Accuracy: 521/542
Accuracy %: 96.12546125461255%

Class: P1
Accuracy: 221/500
Accuracy %: 44.2%

Class: P2
Accuracy: 322/558
Accuracy %: 57.70609318996416%

Total Accuracy: 2097/2926
Total Accuracy: 71.66780587833219%



In [None]:
# Layer-by-layer summary of the stock BERT classifier.
# NOTE(review): this cell rebinds the notebook-level name `model`, replacing
# the fine-tuned model loaded above with a freshly downloaded one that uses the
# default number of labels; later cells see this new object.
# NOTE(review): the package is installed unpinned from inside the notebook.
!pip install torchinfo
import torch
from torchinfo import summary
from transformers import BertForSequenceClassification

# Load the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Create a dummy input tensor of the correct type (Long tensor)
# The size is [batch_size, sequence_length]
dummy_input = torch.LongTensor(1, 512).fill_(1)  # Filling with ones (or any token ID)

# Use the dummy input tensor in the summary
summary(model, input_data=dummy_input)

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Layer (type:depth-idx)                                  Output Shape              Param #
BertForSequenceClassification                           [1, 2]                    --
├─BertModel: 1-1                                        [1, 768]                  --
│    └─BertEmbeddings: 2-1                              [1, 512, 768]             --
│    │    └─Embedding: 3-1                              [1, 512, 768]             23,440,896
│    │    └─Embedding: 3-2                              [1, 512, 768]             1,536
│    │    └─Embedding: 3-3                              [1, 512, 768]             393,216
│    │    └─LayerNorm: 3-4                              [1, 512, 768]             1,536
│    │    └─Dropout: 3-5                                [1, 512, 768]             --
│    └─BertEncoder: 2-2                                 [1, 512, 768]             --
│    │    └─ModuleList: 3-6                             --                        85,054,464
│    └─BertPooler: 2-3           

In [None]:
model.num_parameters()

109483778

In [None]:
class MyBertForSequenceClassification(BertPreTrainedModel):
    """BERT sequence classifier with optional inverse-frequency class weighting.

    The architecture is a BERT encoder, dropout on the pooled output and one
    linear classification layer. When `weights` (per-class sample counts,
    indexed by class id) is supplied, the cross-entropy loss weights class `i`
    by `1 / weights[i]`, normalised so that the weights sum to one.
    """

    def __init__(self, config, weights=None):
        """Build the encoder and the classification head.

        Args:
            config: BERT configuration; `num_labels`, `hidden_size` and
                `hidden_dropout_prob` are read from it.
            weights: optional sequence of per-class sample counts, indexed by
                class id. `None` or an empty sequence disables class weighting.
        """
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        # Layers are referenced through `torch.nn` so the class does not depend
        # on an `nn` alias being imported by some other cell.
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)
        self.weights = weights
        self.init_weights()

    def _class_weight_tensor(self, reference):
        """Return normalised inverse-frequency class weights, or None if disabled.

        The tensor is created with the dtype and device of `reference` (the
        logits), so the loss works on CPU as well as on GPU.
        """
        # Explicit None/length test: the truth value of an array or tensor is ambiguous.
        if self.weights is None or len(self.weights) == 0:
            return None
        # A zero count raises ZeroDivisionError here instead of yielding inf.
        inverse_counts = [1.0 / float(count) for count in self.weights]
        total = sum(inverse_counts)
        return torch.tensor([value / total for value in inverse_counts],
                            dtype=reference.dtype,
                            device=reference.device)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        r"""Classify a batch of sequences and optionally compute the loss.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds, output_attentions, output_hidden_states:
                forwarded unchanged to the BERT encoder.
            labels: optional tensor of targets. With `config.num_labels == 1` a
                mean-squared-error (regression) loss is computed; otherwise a
                cross-entropy loss over class ids in
                `[0, ..., config.num_labels - 1]`, class-weighted when
                `weights` was supplied.

        Returns:
            A tuple `(loss, logits, *extras)` when `labels` is given, otherwise
            `(logits, *extras)`. `logits` has `config.num_labels` columns;
            `extras` is whatever the encoder returned after its pooled output
            (hidden states / attentions, when requested).
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        # Second encoder output is the pooled representation of the sequence.
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = torch.nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                # weight=None is CrossEntropyLoss's default, i.e. the unweighted loss.
                loss_fct = torch.nn.CrossEntropyLoss(weight=self._class_weight_tensor(logits))
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits,


In [None]:
# Build the class-weighted classifier on top of the pretrained BERT encoder.
# `w` holds the per-class training counts computed earlier and is handed to the
# model through the `weights` keyword.
# NOTE(review): this rebinds `model` and the new object is not moved to
# `device` in this cell - confirm that happens before training or evaluation.
import torch.nn as nn
model = MyBertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False,
                                                      weights=w
                                                      )

Some weights of MyBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

from torchinfo import summary

# One sequence of 512 token ids, all set to 1; the summary only needs an
# integer tensor of shape [batch_size, sequence_length] to trace the layers.
dummy_input = torch.ones((1, 512), dtype=torch.long)

summary(model, input_data=dummy_input)

Layer (type:depth-idx)                                  Output Shape              Param #
MyBertForSequenceClassification                         [1, 5]                    --
├─BertModel: 1-1                                        [1, 768]                  --
│    └─BertEmbeddings: 2-1                              [1, 512, 768]             --
│    │    └─Embedding: 3-1                              [1, 512, 768]             23,440,896
│    │    └─Embedding: 3-2                              [1, 512, 768]             1,536
│    │    └─Embedding: 3-3                              [1, 512, 768]             393,216
│    │    └─LayerNorm: 3-4                              [1, 512, 768]             1,536
│    │    └─Dropout: 3-5                                [1, 512, 768]             --
│    └─BertEncoder: 2-2                                 [1, 512, 768]             --
│    │    └─ModuleList: 3-6                             --                        85,054,464
│    └─BertPooler: 2-3           