In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

from transformers import BertPreTrainedModel

from transformers import BertModel
import numpy as np
from sklearn.utils import resample


In [None]:
# Read the raw issue export and keep only the label and text columns.
df = pd.read_csv('your_dataset.csv', sep=',')[['Classification', 'Summary']]
df.head()

Unnamed: 0,Classification,Summary
0,Add issue,Documentation comment for getVariable/getParam...
1,Add issue,svn status' needs better error reporting
2,Add issue,Multolingual support in converting xml - pdf
3,Add issue,Velocity should provide an interface for expli...
4,Add issue,Translate svn-ref.tex to French


In [None]:
# Shorter, task-oriented column names used by every later cell.
df = df.rename(columns={'Classification': 'Category', 'Summary': 'Message'})
df.head()

Unnamed: 0,Category,Message
0,Add issue,Documentation comment for getVariable/getParam...
1,Add issue,svn status' needs better error reporting
2,Add issue,Multolingual support in converting xml - pdf
3,Add issue,Velocity should provide an interface for expli...
4,Add issue,Translate svn-ref.tex to French


In [None]:
# Trim surrounding whitespace so labels that differ only by padding collapse
# into a single category, then list the distinct labels that remain.
df = df.assign(Category=df['Category'].str.strip())

print(df['Category'].unique())

['Add issue' 'Configuration issue' 'Database-related issue'
 'Functional issue' 'GUI-related issue' 'info release issue'
 'Network issue' 'Performance issue' 'Permission/Deprecation issue'
 'Security issue' 'Test Code-related issue']


In [None]:
# Drop the rows labelled 'info release issue'. The explicit copy makes the
# filtered frame independent of the original, so the .loc assignment below
# no longer raises pandas' SettingWithCopyWarning.
df = df[df['Category'] != 'info release issue'].copy()

# Labels that are folded into one combined class.
values_to_replace = ["Network issue", "Permission/Deprecation issue", "Security issue"]

# Rewrite every row carrying one of those labels to the combined label.
df.loc[df['Category'].isin(values_to_replace), 'Category'] = "Network/Security Issue"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['Category'].isin(values_to_replace), 'Category'] = "Network/Security Issue"


In [None]:

# Per-category summary statistics (count / unique / top / freq) of the remaining columns.
df.groupby('Category').describe()


Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Add issue,123,123,Documentation comment for getVariable/getParam...,1
Configuration issue,188,186,review: Importxml will fail if versioncache ne...,2
Database-related issue,35,34,review: Remembered query options need regroupi...,2
Functional issue,471,470,Dynamic discovery not working properly,2
GUI-related issue,197,194,"review: Turn ""all selected"" into ""none selecte...",2
Network/Security Issue,127,114,Security access to server and database,3
Performance issue,42,42,"""svn up"" fails if too much svndiff data received",1
Test Code-related issue,79,76,TEST,2


In [None]:
# Give every distinct category an integer id, numbered in the order the
# categories are returned by unique().
possible_labels = df["Category"].unique()
label_dict = {name: idx for idx, name in enumerate(possible_labels)}
label_dict

{'Add issue': 0,
 'Configuration issue': 1,
 'Database-related issue': 2,
 'Functional issue': 3,
 'GUI-related issue': 4,
 'Network/Security Issue': 5,
 'Performance issue': 6,
 'Test Code-related issue': 7}

In [None]:
# Encode each category as its integer id from label_dict.
# - `map` is the direct value -> id lookup (the previous `replace` call is a
#   general substitution API and is not meant for building an encoded column).
# - `assign` returns a new frame instead of writing a column into a frame
#   that may be a slice, which is what produced the SettingWithCopyWarning.
# A category missing from label_dict would become NaN here; label_dict is
# built from this same column, so every category is expected to be present.
df = df.assign(label=df['Category'].map(label_dict))
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df.Category.replace(label_dict)


Unnamed: 0,Category,Message,label
0,Add issue,The documentation comment for getVariable/getP...,0
1,Add issue,The documentation comment for the getVariable/...,0
2,Add issue,The documentation comment for getVariable/getP...,0
3,Add issue,"""Improper documentation comment for the functi...",0
4,Add issue,The error reporting for the command 'svn statu...,0


In [None]:
# Resampling strategy for the training split, read by the split cell below:
#   "-" : draw every class, with replacement, up to the size of the largest class
#   "u" : draw every class, without replacement, down to the size of the smallest class
#   "o" : oversample every class to the largest class size with sklearn's resample
# Any other value (such as "l") leaves the training split untouched.
sampling = "l"

In [None]:
from sklearn.model_selection import train_test_split


def _resample_indices_per_class(indices, labels, n_per_class, replace):
    """Draw ``n_per_class`` row indices from every class of a split.

    Args:
        indices: index labels of the rows in the split.
        labels: class id of each row, aligned with ``indices``.
        n_per_class: number of rows to draw for each class.
        replace: whether a row may be drawn more than once.

    Returns:
        Tuple ``(indices, labels)`` of numpy arrays describing the resampled
        split, grouped by class (largest original class first).
    """
    split_df = pd.DataFrame({'Index': indices, 'Label': labels})
    sampled_parts = [
        split_df[split_df['Label'] == label].sample(n_per_class, replace=replace, random_state=42)
        for label in split_df['Label'].value_counts().index
    ]
    resampled_df = pd.concat(sampled_parts)
    return resampled_df['Index'].values, resampled_df['Label'].values


# Stratified 85/15 split over the row index so class proportions are kept.
X_train, X_val, y_train, y_val = train_test_split(df.index.values,
                                                  df.label.values,
                                                  test_size=0.15,
                                                  random_state=42,
                                                  stratify=df.label.values)

if sampling == "-":
    # Oversample by index: every class is drawn (with replacement) up to the
    # size of the largest class.
    max_class_count = pd.Series(y_train).value_counts().max()
    print(max_class_count)
    X_train, y_train = _resample_indices_per_class(X_train, y_train, max_class_count, replace=True)
elif sampling == "u":
    # Undersample by index: every class is drawn (without replacement) down
    # to the size of the smallest class.
    min_class_count = pd.Series(y_train).value_counts().min()
    X_train, y_train = _resample_indices_per_class(X_train, y_train, min_class_count, replace=False)

df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

if sampling == "-":
    # Tagging rows through .loc marks each row once no matter how often its
    # index was drawn, so the duplicates must be materialised explicitly:
    # df.loc[X_train] repeats a row for every time it was sampled.
    df = pd.concat([df.loc[X_train], df[df['data_type'] != 'train']]).reset_index(drop=True)

if sampling == "o":
    train_df = df[df['data_type'] == 'train']
    val_df = df[df['data_type'] == 'val']

    # Count the instances of each class in the training set
    class_counts = train_df['label'].value_counts()

    # Size of the majority class: the target size for every class
    max_class_count = class_counts.max()
    print(class_counts, max_class_count)
    print("max", max_class_count)

    # Resample each class, with replacement, up to the majority class size
    oversampled_dfs = [
        resample(train_df[train_df['label'] == label],
                 replace=True,
                 n_samples=max_class_count,
                 random_state=42)
        for label in class_counts.index
    ]

    # Concatenate, then shuffle so the rows are not grouped by class
    oversampled_train_df = pd.concat(oversampled_dfs).sample(frac=1, random_state=42)

    # Recombine with the untouched validation rows under a fresh index
    df_oversampled = pd.concat([oversampled_train_df, val_df])
    df_oversampled.reset_index(drop=True, inplace=True)
    df = df_oversampled

df.groupby(['Message', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Category
Message,label,data_type,Unnamed: 3_level_1
\n\nProblems with class path after importing ejb jar to designated server,4,train,1
"""Accounts containing a period symbol.""",1,train,1
"""Adapt to the modifications in the OpenSSL 0.9.7g API.""",1,train,1
"""After producing numerous pdfs, an OutOfMemoryError has occurred.""",5,train,1
"""Allow typically vacant inline elements (such as named anchors) to be visible in the editor.""",3,train,1
...,...,...,...
xDebug over an SSH Tunnel is not functioning properly for debugging purposes.,4,train,1
xerces2b3: additional line ending character is included in the smoketest api tests.,4,train,1
xerces2b3: smoketest api tests add &#13,4,train,1
z/OS RAC ASCII/EBCDIC translation problem,7,train,1


In [None]:
# Spot-check the first rows of the frame after the split cell has run.
df.head()

Unnamed: 0,Category,Message,label,data_type
0,Add issue,The documentation comment for getVariable/getP...,0,train
1,Add issue,The documentation comment for the getVariable/...,0,val
2,Add issue,The documentation comment for getVariable/getP...,0,train
3,Add issue,"""Improper documentation comment for the functi...",0,train
4,Add issue,The error reporting for the command 'svn statu...,0,train


In [None]:
# Number of row indices held in X_train.
len(X_train)

4086

In [None]:
# Training rows only, summarised per class id.
td = df[df["data_type"].eq("train")]
td.groupby('label').describe()

Unnamed: 0_level_0,Category,Category,Category,Category,Message,Message,Message,Message,data_type,data_type,data_type,data_type
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,523,1,Add issue,523,523,502,review: Add support for Insiders,3,523,1,train,523
1,639,1,Configuration issue,639,639,619,Makefiles do not have support for shared libra...,3,639,1,train,639
2,416,1,Database-related issue,416,416,393,The SQL task is not compatible with Informix I...,5,416,1,train,416
3,670,1,GUI-related issue,670,670,656,The origin does not appear during the debuggin...,2,670,1,train,670
4,540,1,Network/Security Issue,540,540,521,Remove the outdated org.eclipse.aether.spi.con...,3,540,1,train,540
5,428,1,Performance issue,428,428,414,Evaluation of Necko's Performance,4,428,1,train,428
6,470,1,Test Code-related issue,470,470,448,[Regression,6,470,1,train,470
7,400,1,Functional issue,400,400,398,review: Remove hard-coded titles and things,2,400,1,train,400


In [None]:
# Per-class training counts, ordered by class id. These are consumed
# positionally as per-class loss weights, so position i must belong to
# class i. value_counts(sort=False) does not guarantee that order (for
# example after the training rows have been shuffled); sort_index() does.
w = td['label'].value_counts().sort_index().to_list()
print(w)

[523, 639, 416, 670, 540, 428, 470, 400]


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)


def _encode_messages(messages):
    """Tokenize ``messages`` into fixed-length tensors for BERT.

    Every text is padded to 256 tokens and explicitly truncated to that
    length. ``padding='max_length'`` replaces the deprecated
    ``pad_to_max_length=True``; ``truncation=True`` makes the truncation
    that the tokenizer previously only warned about explicit.

    Args:
        messages: iterable of strings to encode.

    Returns:
        The tokenizer's batch encoding, holding ``input_ids`` and
        ``attention_mask`` as PyTorch tensors.
    """
    return tokenizer.batch_encode_plus(
        list(messages),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='pt'
    )


train_rows = df[df.data_type == 'train']
val_rows = df[df.data_type == 'val']

encoded_data_train = _encode_messages(train_rows.Message.values)
encoded_data_val = _encode_messages(val_rows.Message.values)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

# Each dataset item is (input_ids, attention_mask, label).
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Pretrained BERT encoder with a classification head sized to the number of
# categories in label_dict.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

# Training batches are drawn in random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size,
)

# Validation batches keep the dataset order.
dataloader_validation = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=batch_size,
)

In [None]:
from transformers import get_linear_schedule_with_warmup

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 is passed explicitly because torch's default (0.01)
# differs from the 0.0 default of the transformers implementation.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)

epochs = 10

# Linear decay of the learning rate to zero over all optimizer steps, no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)




In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Weighted F1 score of the arg-max class predictions.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of true class ids.

    Returns:
        The value of sklearn's ``f1_score`` with ``average='weighted'``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

def accuracy_per_class(preds, labels, label_names=None):
    """Print, for each true class, how many of its examples were predicted correctly.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of true class ids.
        label_names: optional mapping ``class id -> display name``. When
            omitted, the inverse of the notebook-level ``label_dict`` is used.

    Returns:
        None. The per-class and overall counts are printed.
    """
    if label_names is None:
        # Fall back to the notebook-level category -> id mapping.
        label_names = {v: k for k, v in label_dict.items()}

    labels_flat = labels.flatten()
    if labels_flat.size == 0:
        # Nothing to score; report it instead of dividing by zero below.
        print("Total Accuracy: 0/0")
        return

    preds_flat = np.argmax(preds, axis=1).flatten()
    t_pred = 0
    t_true = 0

    for label in np.unique(labels_flat):
        of_class = labels_flat == label
        # Count each quantity once and reuse it for all three print lines.
        n_true = int(np.count_nonzero(of_class))
        n_correct = int(np.count_nonzero(preds_flat[of_class] == label))
        print(f'Class: {label_names[label]}')
        print(f'Accuracy: {n_correct}/{n_true}')
        print(f'Accuracy %: {(n_correct/n_true)*100}%\n')
        t_pred += n_correct
        t_true += n_true
    print(f"Total Accuracy: {t_pred}/{t_true}" )
    print(f'Total Accuracy: {(t_pred/t_true)*100}%\n')

In [None]:
import random

# Seed every random number generator used in the notebook with one value.
seed_val = 17
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

print(device)

cuda


In [None]:
from torch.nn import CrossEntropyLoss


def evaluate(dataloader_val):
    """Score the notebook-level ``model`` on every batch of ``dataloader_val``.

    The model is put in eval mode and the forward passes run without
    gradient tracking.

    Args:
        dataloader_val: iterable of batches whose first three items are
            input ids, attention mask and labels.

    Returns:
        Tuple ``(average loss per batch, logits, true label ids)``; the last
        two are numpy arrays concatenated over all batches.
    """
    model.eval()

    running_loss = 0
    logits_per_batch, labels_per_batch = [], []

    for batch in dataloader_val:
        on_device = tuple(t.to(device) for t in batch)
        label_tensor = on_device[2]

        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=label_tensor)

        # outputs[0] is the loss, outputs[1] the logits.
        running_loss += outputs[0].item()
        logits_per_batch.append(outputs[1].detach().cpu().numpy())
        labels_per_batch.append(label_tensor.cpu().numpy())

    loss_val_avg = running_loss/len(dataloader_val)

    return (loss_val_avg,
            np.concatenate(logits_per_batch, axis=0),
            np.concatenate(labels_per_batch, axis=0))

# Fine-tune for `epochs` passes over the training data. After each epoch the
# weights are checkpointed and the model is scored on the validation split.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as is. It was previously divided by len(batch),
        # which is the number of tensors in the batch tuple (always 3), not
        # the number of examples, and the loss is already a per-batch mean.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1362 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.3549128912783142
Validation loss: 0.815141707859717
F1 Score (Weighted): 0.7373639231229949


Epoch 2:   0%|          | 0/1362 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.5062736605274218
Validation loss: 0.6234994414677898
F1 Score (Weighted): 0.8359156637352388


Epoch 3:   0%|          | 0/1362 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.24506051273723836
Validation loss: 0.5932540651041284
F1 Score (Weighted): 0.8811642675758634


Epoch 4:   0%|          | 0/1362 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.0934046469483078
Validation loss: 0.5500360078652219
F1 Score (Weighted): 0.9107774881261362


Epoch 5:   0%|          | 0/1362 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.04100486786211068
Validation loss: 0.6013928794336343
F1 Score (Weighted): 0.916805298187738


Epoch 6:   0%|          | 0/1362 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.02796481777163195
Validation loss: 0.6720080721047207
F1 Score (Weighted): 0.9015131016780464


Epoch 7:   0%|          | 0/1362 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.01607153522735124
Validation loss: 0.659748777062067
F1 Score (Weighted): 0.9118108559424203


Epoch 8:   0%|          | 0/1362 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.011653860410134656
Validation loss: 0.7022122499570796
F1 Score (Weighted): 0.9067471749097237


Epoch 9:   0%|          | 0/1362 [00:00<?, ?it/s]


Epoch 9
Training loss: 0.011274277960351376
Validation loss: 0.6604709235867495
F1 Score (Weighted): 0.9162543292356561


Epoch 10:   0%|          | 0/1362 [00:00<?, ?it/s]


Epoch 10
Training loss: 0.00548381282004893
Validation loss: 0.6963825166818741
F1 Score (Weighted): 0.9134821262931541


In [None]:

# Rebuild the classifier, restore the checkpoint saved after epoch 5 and
# score it on the validation split.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# The checkpoint is read onto the CPU; load_state_dict copies the values
# into the parameters of the model.
model.load_state_dict(
    torch.load('finetuned_BERT_epoch_5.model', map_location=torch.device('cpu'))
)

_, predictions, true_vals = evaluate(dataloader_validation)
print(f"F1 score: {f1_score_func(predictions, true_vals)}")
accuracy_per_class(predictions, true_vals)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


F1 score: 0.916805298187738
Class: Add issue
Accuracy: 87/92
Accuracy %: 94.56521739130434%

Class: Configuration issue
Accuracy: 105/113
Accuracy %: 92.92035398230088%

Class: Database-related issue
Accuracy: 74/74
Accuracy %: 100.0%

Class: GUI-related issue
Accuracy: 105/118
Accuracy %: 88.98305084745762%

Class: Network/Security Issue
Accuracy: 92/95
Accuracy %: 96.84210526315789%

Class: Performance issue
Accuracy: 74/76
Accuracy %: 97.36842105263158%

Class: Test Code-related issue
Accuracy: 79/83
Accuracy %: 95.18072289156626%

Class: Functional issue
Accuracy: 47/71
Accuracy %: 66.19718309859155%

Total Accuracy: 663/722
Total Accuracy: 91.82825484764543%



In [None]:
!pip install torchinfo
import torch
from torchinfo import summary
from transformers import BertForSequenceClassification

# Load the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Create a dummy input tensor of the correct type (Long tensor)
# The size is [batch_size, sequence_length]
dummy_input = torch.LongTensor(1, 512).fill_(1)  # Filling with ones (or any token ID)

# Use the dummy input tensor in the summary
summary(model, input_data=dummy_input)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Layer (type:depth-idx)                                  Output Shape              Param #
BertForSequenceClassification                           [1, 2]                    --
├─BertModel: 1-1                                        [1, 768]                  --
│    └─BertEmbeddings: 2-1                              [1, 512, 768]             --
│    │    └─Embedding: 3-1                              [1, 512, 768]             23,440,896
│    │    └─Embedding: 3-2                              [1, 512, 768]             1,536
│    │    └─Embedding: 3-3                              [1, 512, 768]             393,216
│    │    └─LayerNorm: 3-4                              [1, 512, 768]             1,536
│    │    └─Dropout: 3-5                                [1, 512, 768]             --
│    └─BertEncoder: 2-2                                 [1, 512, 768]             --
│    │    └─ModuleList: 3-6                             --                        85,054,464
│    └─BertPooler: 2-3           

In [None]:
# Parameter count of the object currently bound to `model`.
model.num_parameters()

109483778

In [None]:
class MyBertForSequenceClassification(BertPreTrainedModel):
    """BERT encoder with dropout, a linear classifier and an optionally class-weighted loss.

    Args:
        config: model configuration; ``num_labels``, ``hidden_size`` and
            ``hidden_dropout_prob`` are read from it.
        weights: optional sequence of per-class example counts, ordered by
            class id. When given, the cross-entropy loss weights each class
            by its inverse count, normalised so the weights sum to one.
    """

    def __init__(self, config, weights=None):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        # torch.nn is referenced through the top-level `torch` import so the
        # class does not depend on `nn` being imported in a later cell.
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)
        self.weights = weights
        self.init_weights()

    def _class_weights(self, like):
        """Return the normalised inverse-count class weights, or ``None``.

        The tensor is created with the dtype and device of ``like`` so the
        loss works wherever the model runs. ``None`` is returned when no
        counts were supplied (or the sequence is empty).
        """
        if self.weights is None or len(self.weights) == 0:
            return None
        inverse_counts = [1.0 / float(count) for count in self.weights]
        total = sum(inverse_counts)  # computed once, not once per element
        return torch.tensor([value / total for value in inverse_counts],
                            dtype=like.dtype,
                            device=like.device)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        """Classify the input sequences and, when labels are given, compute the loss.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids,
            head_mask, inputs_embeds, output_attentions,
            output_hidden_states: forwarded unchanged to the BERT encoder.
            labels: optional target tensor. With ``num_labels == 1`` a
                mean-squared-error loss is computed; otherwise a
                cross-entropy loss, class-weighted when ``weights`` was set.

        Returns:
            Tuple ``(loss, logits, *extra)`` when ``labels`` is given, else
            ``(logits, *extra)``; ``extra`` is whatever the encoder returned
            after its first two outputs.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        # The second encoder output is the pooled representation.
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = torch.nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                # weight=None gives the plain, unweighted cross-entropy.
                loss_fct = torch.nn.CrossEntropyLoss(weight=self._class_weights(logits))
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits,


IndentationError: ignored

In [None]:
import torch.nn as nn

# Build the custom classifier from the pretrained checkpoint, handing it the
# per-class counts `w` through the `weights` keyword.
model = MyBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
    weights=w,
)

Some weights of MyBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

from torchinfo import summary

# One all-ones sequence of 512 token ids (long dtype) serves as the probe input.
dummy_input = torch.ones((1, 512), dtype=torch.long)

# Layer-by-layer summary of the current model for that input.
summary(model, input_data=dummy_input)

Layer (type:depth-idx)                                  Output Shape              Param #
MyBertForSequenceClassification                         [1, 8]                    --
├─BertModel: 1-1                                        [1, 768]                  --
│    └─BertEmbeddings: 2-1                              [1, 512, 768]             --
│    │    └─Embedding: 3-1                              [1, 512, 768]             23,440,896
│    │    └─Embedding: 3-2                              [1, 512, 768]             1,536
│    │    └─Embedding: 3-3                              [1, 512, 768]             393,216
│    │    └─LayerNorm: 3-4                              [1, 512, 768]             1,536
│    │    └─Dropout: 3-5                                [1, 512, 768]             --
│    └─BertEncoder: 2-2                                 [1, 512, 768]             --
│    │    └─ModuleList: 3-6                             --                        85,054,464
│    └─BertPooler: 2-3           