In [1]:
import multimodal_transformers

In [2]:
import transformers
transformers.__version__

'4.1.0'

In [3]:
from dataclasses import dataclass, field
import json
import logging
import os
from typing import Optional

import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoConfig,
    Trainer,
    EvalPrediction,
    set_seed
)
# from transformers.training_args import TrainingArguments

from multimodal_transformers.data import load_data_from_folder
from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import AutoModelWithTabular

logging.basicConfig(level=logging.INFO)
os.environ['COMET_MODE'] = 'DISABLED'

In [4]:
A_path = ''
B_path = 'data.csv'

In [5]:
data_df = pd.read_csv(B_path, index_col=0)
data_df = data_df[data_df['user_score'] != 1]
data_df

Unnamed: 0,practice_no,explanation_practice,user_score,applied,length,lemmatized,num_ORG,num_LOC,num_PER,num_MISC,...,executive directors,board effectiveness,director board,contribution performance,remuneration committee,result assessment,performance evaluation,composition board,evaluation process,company secretary
0,B,The Nomination Committee carried out annual as...,3,1,83,the Nomination Committee carry out annual asse...,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,B,The Audit and Risk Management Committee had a ...,2,1,52,the Audit and Risk Management Committee have a...,7,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,B,The Company conducts its Board Effectiveness E...,2,1,103,the Company conduct its Board Effectiveness Ev...,8,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,B,An assessment on the effectiveness of the Boar...,2,1,61,an assessment on the effectiveness of the Boar...,5,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,B,The Board with assistance from the Nominating ...,4,1,675,the Board with assistance from the Nominating ...,41,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2491,B,The Group recognises the importance of identif...,0,1,101,the Group recognise the importance of identify...,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2492,B,The appointment of any new member to the Board...,0,1,42,the appointment of any new member to the Board...,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2493,B,The Board together with the senior management ...,0,1,67,the Board together with the senior management ...,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2494,B,The banana management uses a potato.,0,1,6,the banana management use a potato .,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
data_df['applied'] = data_df['applied'].apply(lambda x: 'Applied' if 1 else 'Not Applied')

In [7]:
from sklearn.model_selection import train_test_split
B_train, B_test = train_test_split(data_df, stratify=data_df['user_score'], shuffle=True, random_state=10, test_size=0.2)
# B_train.reset_index(drop=True, inplace=True)
# B_test.reset_index(drop=True, inplace=True)
B_train.to_csv('train.csv')
B_test.to_csv('test.csv')

In [8]:
@dataclass
class ModelArguments:
  """
  Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
  """

  model_name_or_path: str = field(
      metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
  )
  config_name: Optional[str] = field(
      default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
  )
  tokenizer_name: Optional[str] = field(
      default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
  )
  cache_dir: Optional[str] = field(
      default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
  )


@dataclass
class MultimodalDataTrainingArguments:
  """
  Arguments pertaining to how we combine tabular features
  Using `HfArgumentParser` we can turn this class
  into argparse arguments to be able to specify them on
  the command line.
  """

  data_path: str = field(metadata={
                            'help': 'the path to the csv file containing the dataset'
                        })
  column_info_path: str = field(
      default=None,
      metadata={
          'help': 'the path to the json file detailing which columns are text, categorical, numerical, and the label'
  })

  column_info: dict = field(
      default=None,
      metadata={
          'help': 'a dict referencing the text, categorical, numerical, and label columns'
                  'its keys are text_cols, num_cols, cat_cols, and label_col'
  })

  categorical_encode_type: str = field(default='ohe',
                                        metadata={
                                            'help': 'sklearn encoder to use for categorical data',
                                            'choices': ['ohe', 'binary', 'label', 'none']
                                        })
  numerical_transformer_method: str = field(default='yeo_johnson',
                                            metadata={
                                                'help': 'sklearn numerical transformer to preprocess numerical data',
                                                'choices': ['yeo_johnson', 'box_cox', 'quantile_normal', 'none']
                                            })
  task: str = field(default="classification",
                    metadata={
                        "help": "The downstream training task",
                        "choices": ["classification", "regression"]
                    })

  mlp_division: int = field(default=4,
                            metadata={
                                'help': 'the ratio of the number of '
                                        'hidden dims in a current layer to the next MLP layer'
                            })
  combine_feat_method: str = field(default='individual_mlps_on_cat_and_numerical_feats_then_concat',
                                    metadata={
                                        'help': 'method to combine categorical and numerical features, '
                                                'see README for all the method'
                                    })
  mlp_dropout: float = field(default=0.1,
                              metadata={
                                'help': 'dropout ratio used for MLP layers'
                              })
  numerical_bn: bool = field(default=True,
                              metadata={
                                  'help': 'whether to use batchnorm on numerical features'
                              })
  use_simple_classifier: str = field(default=True,
                                      metadata={
                                          'help': 'whether to use single layer or MLP as final classifier'
                                      })
  mlp_act: str = field(default='relu',
                        metadata={
                            'help': 'the activation function to use for finetuning layers',
                            'choices': ['relu', 'prelu', 'sigmoid', 'tanh', 'linear']
                        })
  gating_beta: float = field(default=0.2,
                              metadata={
                                  'help': "the beta hyperparameters used for gating tabular data "
                                          "see https://www.aclweb.org/anthology/2020.acl-main.214.pdf"
                              })

  def __post_init__(self):
      assert self.column_info != self.column_info_path
      if self.column_info is None and self.column_info_path:
          with open(self.column_info_path, 'r') as f:
              self.column_info = json.load(f)

In [9]:
from transformers import TrainingArguments

In [10]:
num_cols = ['num_ORG', 'num_GPE', 'num_FAC', 'num_PRODUCT', 'num_EVENT', 'num_LAW', 'num_WORDS']
cat_cols = ['committee member', 'self peer', 'year end',
       'evaluation form', 'board meeting', 'risk management', 'board member',
       'board evaluation', 'independent non', 'time commitment',
       'corporate governance', 'executive directors', 'board effectiveness',
       'remuneration committee', 'performance evaluation', 'composition board',
       'evaluation process', 'company secretary']

In [11]:
text_cols = ['explanation_practice']

column_info_dict = {
    'text_cols': text_cols,
    'num_cols': num_cols,
    'cat_cols': cat_cols,
    'label_col': 'user_score',
    'label_list': [0, 1, 2, 3, 4]
}


model_args = ModelArguments(
    model_name_or_path='allenai/longformer-base-4096'
)

data_args = MultimodalDataTrainingArguments(
    data_path='.',
    combine_feat_method='attention_on_cat_and_numerical_feats',
    column_info=column_info_dict,
    task='classification'
)

In [12]:
tokenizer_path_or_name = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
print('Specified tokenizer: ', tokenizer_path_or_name)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_path_or_name,
    cache_dir=model_args.cache_dir,
    max_sequence_length=1024
)

Specified tokenizer:  allenai/longformer-base-4096


In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

glove_tokenizer = Tokenizer(num_words=10000)
glove_tokenizer.fit_on_texts(data_df['explanation_practice'].to_list())

In [14]:
# Get Datasets
train_dataset, val_dataset, test_dataset = load_data_from_folder(
    data_args.data_path,
    data_args.column_info['text_cols'],
    tokenizer,
    label_col=data_args.column_info['label_col'],
    label_list=data_args.column_info['label_list'],
    categorical_cols=data_args.column_info['cat_cols'],
    numerical_cols=data_args.column_info['num_cols'],
    sep_text_token_str=tokenizer.sep_token,
    max_token_length=1024,
    glove_tokenizer=glove_tokenizer,
    keywords=['independent sources were used', 'an evaluation was carried out'],
    max_keyword_length=10
)
train_dataset

INFO:multimodal_transformers.data.data_utils:7 numerical columns
INFO:multimodal_transformers.data.data_utils:36 categorical columns
INFO:multimodal_transformers.data.data_utils:7 numerical columns
INFO:multimodal_transformers.data.load_data:Text columns: ['explanation_practice']


Original df shape
(2495, 41)
Int64Index([ 682, 1861,  566, 2190, 2000, 1398, 2379, 1470, 1956,  829,
            ...
             853, 1205, 2090,  834,  727, 2423, 1488, 1060, 1299, 1360],
           dtype='int64', length=2495)
682     The NC adopts and conducts on an annual basis ...
1861    During the financial year under review, the NC...
566     The Nomination Committee is responsible for th...
2190    The Nomination Committee would conduct annual ...
2000    The INEDs are assessed annually by the NC on b...
                              ...                        
2423    1 The Nominating and Remuneration Committee NR...
1488    In making recommendations and performing its a...
1060    In 2018, the NC undertook the evaluation to as...
1299    A formal evaluation process is in place to ass...
1360    The Nomination Committee assesses the effectiv...
Name: explanation_practice, Length: 2495, dtype: object
Cat df shape
(2495, 36)
RangeIndex(start=0, stop=2495, step=1)
      committe

INFO:multimodal_transformers.data.load_data:Raw text example: The NC adopts and conducts on an annual basis the following evaluation:- Board and Board Committee Evaluation; Internal Audit Function Evaluation; External Audit or Performance and Independence Evaluation; Independent Directors' Self-Assessment; Directors and Key officers' Evaluation; The Audit Committee Evaluation; and The Internal Audit Function Evaluation. All assessments and evaluations carried out by the NC are properly documented. The Company Secretary summarizes and compiles the assessments with comments by the Directors. The summaries are tabled at the NC meeting where the NC will go through and discuss the assessment and evaluation. The NC Chairman will then report to the Board on the results of the Directors' assessment and evaluation, and any recommendations of improvement.


<keras_preprocessing.text.Tokenizer object at 0x7fcdd53105e0>


INFO:multimodal_transformers.data.data_utils:36 categorical columns
INFO:multimodal_transformers.data.data_utils:7 numerical columns
INFO:multimodal_transformers.data.load_data:Text columns: ['explanation_practice']
INFO:multimodal_transformers.data.load_data:Raw text example: The Nomination Committee undertakes annual assessment to evaluate the performance of each individual Directors, the effectiveness of the Board and the Board Committees by way of self-assessment. Directors are required to fill out the self-assessment forms and provide their feedback, views and suggestions for improvement. The results of these self-assessment forms are compiled and tabled to the Nominating Committee for review and deliberation. Based on the assessment, the Board is satisfied with the overall performance effectiveness of the Board, Board Committees and individual directors and the independence of Independent Directors.


tensor([[1.0000e+00, 3.1000e+01, 1.3670e+03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [7.1000e+01, 1.0000e+00, 5.3000e+01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.0000e+00, 3.2000e+01, 1.0000e+01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [1.0000e+00, 8.3000e+01, 1.0000e+01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.0000e+00, 1.1000e+01, 3.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.0000e+00, 4.0000e+00, 5.4000e+01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]])
Data df last time
(499, 77)
RangeIndex(start=1996, stop=2495, step=1)
     practice_no                               explanation_practice  \
1996           B  The Nomination Committee undertakes annual ass...   
1997           B  In FY 2018, the Nominating Committee had carri...   
1998           B  The Board through the NC to perform a formal a...   
1999           B  The Terms of Reference of Nomina

<multimodal_transformers.data.tabular_torch_dataset.TorchTabularTextDataset at 0x7fcdd531f880>

In [15]:
import torch
test_tensor = torch.tensor([  1,   4, 254,   0,   0,   0])
mask = torch.zeros(test_tensor.shape, dtype=torch.long)
mask.masked_fill_(test_tensor != 0, 1)

tensor([1, 1, 1, 0, 0, 0])

In [16]:
mask

tensor([1, 1, 1, 0, 0, 0])

In [17]:
test_dataset.labels

array([2, 2, 2, 3, 3, 3, 2, 3, 2, 3, 2, 2, 3, 3, 0, 2, 2, 3, 4, 2, 3, 4,
       2, 2, 3, 2, 4, 2, 3, 3, 3, 3, 2, 3, 2, 4, 2, 3, 3, 2, 3, 4, 3, 3,
       3, 3, 2, 3, 4, 2, 2, 2, 3, 2, 4, 2, 2, 2, 2, 2, 4, 3, 2, 3, 4, 2,
       2, 4, 3, 3, 2, 4, 3, 4, 3, 3, 4, 3, 2, 4, 3, 2, 4, 3, 3, 2, 2, 2,
       2, 3, 3, 3, 3, 2, 2, 3, 2, 3, 4, 2, 2, 2, 3, 2, 2, 2, 4, 2, 3, 4,
       3, 3, 4, 2, 2, 2, 3, 3, 4, 2, 2, 3, 3, 2, 2, 2, 2, 4, 3, 3, 2, 4,
       3, 3, 2, 2, 3, 3, 2, 3, 2, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 2, 2, 3,
       3, 2, 2, 2, 3, 2, 3, 3, 2, 4, 3, 3, 4, 3, 2, 2, 3, 3, 3, 2, 4, 3,
       2, 3, 3, 3, 2, 3, 4, 3, 3, 3, 2, 3, 3, 3, 2, 4, 3, 0, 2, 2, 3, 3,
       3, 4, 3, 2, 2, 3, 4, 3, 2, 4, 2, 3, 4, 2, 2, 2, 2, 4, 3, 0, 4, 2,
       2, 2, 3, 3, 2, 3, 2, 3, 4, 3, 3, 2, 3, 3, 2, 4, 2, 3, 3, 4, 4, 4,
       3, 2, 2, 3, 2, 3, 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 2, 4, 2, 3, 3, 3,
       3, 2, 3, 2, 3, 3, 4, 2, 3, 4, 3, 3, 4, 3, 3, 2, 2, 2, 4, 2, 4, 2,
       2, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2,

In [18]:
import torch
labels = []
for i in range(len(test_dataset)):
  if(torch.isnan(test_dataset[i]['labels'])):
    print(tokenizer.convert_tokens_to_string(test_dataset[i]['input_ids']))

  item['answer_mask'] = torch.tensor(self.answer_mask[idx])
  item['keyword_tokens'] = torch.tensor(self.keyword_tokens)
  item['keyword_mask'] = torch.tensor(self.keyword_mask)


In [19]:
glove_path = '/Users/SidharrthNagappan/Documents/University/Second Year/FYP/large_files/glove.6B.300d.txt'
glove_embeddings = {}

for line in open(glove_path):
    temp = line.split()
    glove_embeddings[temp[0]] = np.asarray([float(i) for i in temp[1:]])

print(glove_embeddings['the'])

[ 4.6560e-02  2.1318e-01 -7.4364e-03 -4.5854e-01 -3.5639e-02  2.3643e-01
 -2.8836e-01  2.1521e-01 -1.3486e-01 -1.6413e+00 -2.6091e-01  3.2434e-02
  5.6621e-02 -4.3296e-02 -2.1672e-02  2.2476e-01 -7.5129e-02 -6.7018e-02
 -1.4247e-01  3.8825e-02 -1.8951e-01  2.9977e-01  3.9305e-01  1.7887e-01
 -1.7343e-01 -2.1178e-01  2.3617e-01 -6.3681e-02 -4.2318e-01 -1.1661e-01
  9.3754e-02  1.7296e-01 -3.3073e-01  4.9112e-01 -6.8995e-01 -9.2462e-02
  2.4742e-01 -1.7991e-01  9.7908e-02  8.3118e-02  1.5299e-01 -2.7276e-01
 -3.8934e-02  5.4453e-01  5.3737e-01  2.9105e-01 -7.3514e-03  4.7880e-02
 -4.0760e-01 -2.6759e-02  1.7919e-01  1.0977e-02 -1.0963e-01 -2.6395e-01
  7.3990e-02  2.6236e-01 -1.5080e-01  3.4623e-01  2.5758e-01  1.1971e-01
 -3.7135e-02 -7.1593e-02  4.3898e-01 -4.0764e-02  1.6425e-02 -4.4640e-01
  1.7197e-01  4.6246e-02  5.8639e-02  4.1499e-02  5.3948e-01  5.2495e-01
  1.1361e-01 -4.8315e-02 -3.6385e-01  1.8704e-01  9.2761e-02 -1.1129e-01
 -4.2085e-01  1.3992e-01 -3.9338e-01 -6.7945e-02  1

In [20]:
EMBEDDING_DIM = 300
MAX_WORDS = 6000
word_index = glove_tokenizer.word_index
embedding_matrix = np.zeros((len(word_index), EMBEDDING_DIM))

for word,i in word_index.items():
    if(i>=len(word_index)):
        continue
    if word in glove_embeddings:
        embedding_matrix[i]=glove_embeddings[word]
    
vocab_size = len(word_index)
print(vocab_size)

5161


In [21]:
np.save('embedding_matrix.npy', embedding_matrix)

In [22]:
train_dataset.cat_feats.shape

(1996, 36)

In [23]:
config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
tabular_config = TabularConfig(num_labels=5,
                               cat_feat_dim=train_dataset.cat_feats.shape[1],
                               numerical_feat_dim=train_dataset.numerical_feats.shape[1],
                            #    keyword_attention_dim=train_dataset.keyword_tokens.shape[1],
                               vocab_size=vocab_size,
                               **vars(data_args))
config.tabular_config = tabular_config

In [24]:
config.model_type

'longformer'

In [25]:
model = AutoModelWithTabular.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
)

Cat feat dim
36
Numerical feat dim
7
Output dim outside
768
Output dim num
7
Output dim
768


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerWithTabular: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerWithTabular were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias',

In [26]:
model

LongformerWithTabular(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0): LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Li

In [27]:
model.embedding_layer.weight = torch.nn.Parameter(torch.from_numpy(embedding_matrix))

In [28]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import (
    auc,
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    confusion_matrix,
    matthews_corrcoef,
    cohen_kappa_score,
    precision_recall_fscore_support,
    accuracy_score
)

GLOBAL_PREDICTIONS = None

def calc_classification_metrics(p: EvalPrediction):
#   pred_labels = np.argmax(p.predictions[0], axis=1)
#   pred_scores = softmax(p.predictions[0], axis=1)
#   labels = p.label_ids
#   print(pred_labels)
#   print(labels)
#   acc = (pred_labels == labels).mean()
#   f1 = f1_score(y_true=labels, y_pred=pred_labels, average='weighted')
#   result = {
#       "acc": acc,
#       "f1": f1,
#       "acc_and_f1": (acc + f1) / 2,
#       "mcc": matthews_corrcoef(labels, pred_labels),
#       "QWK": cohen_kappa_score(labels, pred_labels, weights='quadratic')
#   }
  result = {
      "acc": 0,
      "f1": 0,
      "acc_and_f1": 0,
      "mcc": 0,
      "QWK": 0
  }
  return result

In [29]:
training_args = TrainingArguments(
    output_dir = './trained-models/B+features+keyword-attention',
    num_train_epochs = 1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps = 16,    
    per_device_eval_batch_size=2,
    evaluation_strategy = "epoch",
    # save_strategy="epoch",
    disable_tqdm = False,
    load_best_model_at_end=True,
    # warmup_steps=160,
    # weight_decay=0.01,
    # learning_rate = 1e-5,
    logging_steps=1,
    # fp16 = True,
    # logging_dir='/media/data_files/github/website_tutorials/logs',
    # dataloader_num_workers = 0,
    run_name = 'longformer-B'
)

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=calc_classification_metrics,
)

In [31]:
train_dataset[15]

{'input_ids': tensor([    0,   133, 11276,  ...,     1,     1,     1]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'labels': tensor(2),
 'cat_feats': tensor([1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0.,
         1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.]),
 'numerical_feats': tensor([-0.1081, -5.1993, -5.1993, -5.1993, -5.1993, -5.1993, -0.7104]),
 'answer_tokens': tensor([ 1, 32, 10,  ...,  0,  0,  0], dtype=torch.int32),
 'answer_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'keyword_tokens': tensor([[   0,    0,    0,    0,    0,    0,   20, 1745,   39,  167],
         [   0,    0,    0,    0,    0,   34,   11,   30,   63,   35]],
        dtype=torch.int32),
 'keyword_mask': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]])}

### 2 epochs

In [32]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/62 [00:00<?, ?it/s]

Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
mask_ans_inf.shape torch.Size([2, 1024])
mask_key_inf.shape torch.Size([2, 10])
mask_ans_inf tensor([[     0,      0,      0,  ..., -10000, -10000, -10000],
        [     0,      0,      0,  ..., -10000, -10000, -10000]])
mask_key_inf tensor([[   -10000,    -10000,    -10000,    -10000,    -10000,    -10000,
           -190000, -17440000,   -380000,  -1660000],
        [   -10000,    -10000,    -10000,    -10000,    -10000,    -10000,
           -190000, -17440000,   -380000,  -1660000]], dtype=torch.int32)
After unsqueeze
mask_ans_inf_1.shape torch.Size([2, 1024])
mask_key_inf_1.shape torch.Size([2, 10])
mask_ans_inf_1 tensor([[     0,      0,      0,  ..., -10000, -10000, -10000],
        [     0,      0,      0,  ..., -10000, -10000, -10000]])
mask_key_inf_1 tensor([[   -10000,    -10000,    -10000,    -10000,    -1

KeyboardInterrupt: 

In [34]:
cat1 = torch.rand((2, 768))
cat2 = torch.rand((4, 600))

torch.cat((cat1, cat2), dim=1)

RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 2 but got size 4 for tensor number 1 in the list.

In [None]:
import tensorflow.keras.backend as K
import tensorflow as tf
    
rand_tensor = tf.random.uniform((2, 10, 1024))
K.max(rand_tensor, axis=2)

<tf.Tensor: shape=(2, 10), dtype=float32, numpy=
array([[0.9987265 , 0.9998193 , 0.99476373, 0.9995369 , 0.99994373,
        0.99986553, 0.99895644, 0.99841166, 0.9981859 , 0.9984652 ],
       [0.9994103 , 0.9989102 , 0.99976754, 0.9995568 , 0.9993112 ,
        0.9985832 , 0.99959457, 0.9992548 , 0.9996153 , 0.9984505 ]],
      dtype=float32)>

In [None]:
import torch

rand_tensor2 = torch.rand((2, 10, 1024))
torch.max(rand_tensor2, dim=2)[0]

tensor([[0.9994, 0.9982, 0.9977, 0.9993, 0.9994, 0.9998, 0.9994, 0.9972, 0.9993,
         0.9997],
        [0.9977, 0.9980, 0.9967, 0.9992, 0.9960, 0.9998, 0.9992, 0.9999, 0.9999,
         0.9999]])

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K

rand_tensor_1 = np.random.rand(2, 1024)
rand_tensor_2 = np.random.rand(2, 10)
tensor1 = tf.convert_to_tensor(rand_tensor_1, dtype=tf.float32)
tensor2 = tf.convert_to_tensor(rand_tensor_2, dtype=tf.float32)

K.repeat(tensor1, 10) * K.repeat(tensor2, 1024)
# K.repeat(tensor2, 1024).shape

InvalidArgumentError: Incompatible shapes: [2,10,1024] vs. [2,1024,10] [Op:Mul]

In [85]:
tensor1 = torch.from_numpy(rand_tensor_1)
# tensor2 = torch.rand((2, 10))

tensor1.repeat_interleave(10, dim=0).reshape(2, 10, 1024)
# torch.repeat(tensor1, 10, dim=0).shape

tensor([[[0.1647, 0.3302, 0.7568,  ..., 0.2531, 0.4503, 0.9576],
         [0.1647, 0.3302, 0.7568,  ..., 0.2531, 0.4503, 0.9576],
         [0.1647, 0.3302, 0.7568,  ..., 0.2531, 0.4503, 0.9576],
         ...,
         [0.1647, 0.3302, 0.7568,  ..., 0.2531, 0.4503, 0.9576],
         [0.1647, 0.3302, 0.7568,  ..., 0.2531, 0.4503, 0.9576],
         [0.1647, 0.3302, 0.7568,  ..., 0.2531, 0.4503, 0.9576]],

        [[0.3990, 0.8398, 0.1885,  ..., 0.0899, 0.6009, 0.3410],
         [0.3990, 0.8398, 0.1885,  ..., 0.0899, 0.6009, 0.3410],
         [0.3990, 0.8398, 0.1885,  ..., 0.0899, 0.6009, 0.3410],
         ...,
         [0.3990, 0.8398, 0.1885,  ..., 0.0899, 0.6009, 0.3410],
         [0.3990, 0.8398, 0.1885,  ..., 0.0899, 0.6009, 0.3410],
         [0.3990, 0.8398, 0.1885,  ..., 0.0899, 0.6009, 0.3410]]],
       dtype=torch.float64)

In [36]:
tensor1.shape

TensorShape([2, 1024])

In [33]:
import tensorflow.keras.backend as K
import torch
tensor1 = torch.rand((2, 10, 300))
tensor2 = torch.rand((2, 1024, 300))
K.batch_dot(tensor1, K.permute_dimensions(tensor2, (0, 2, 1)))

2022-01-05 12:39:50.819384: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<tf.Tensor: shape=(2, 10, 1024), dtype=float32, numpy=
array([[[76.78874 , 69.66727 , 71.11017 , ..., 72.77596 , 70.896645,
         69.03128 ],
        [83.89556 , 78.070724, 75.54849 , ..., 81.36639 , 75.01842 ,
         75.843735],
        [81.94365 , 75.91641 , 76.62925 , ..., 76.86206 , 76.316055,
         74.34951 ],
        ...,
        [73.00943 , 70.148315, 69.54454 , ..., 71.44026 , 67.4173  ,
         68.718414],
        [80.74652 , 74.4663  , 76.92529 , ..., 80.52219 , 75.38301 ,
         75.25845 ],
        [83.40994 , 78.4779  , 81.6537  , ..., 83.143524, 78.69214 ,
         79.012245]],

       [[83.921616, 75.04848 , 78.34143 , ..., 80.3151  , 74.495964,
         79.75101 ],
        [86.26285 , 73.605484, 79.67517 , ..., 79.20491 , 73.89311 ,
         77.3199  ],
        [78.401794, 74.110115, 74.183655, ..., 76.27271 , 71.349396,
         74.19035 ],
        ...,
        [81.52823 , 75.03612 , 77.82643 , ..., 75.73718 , 75.43957 ,
         79.42439 ],
        [73.29507

In [15]:
torch.bmm(tensor1, torch.permute(tensor2, (0, 2, 1))).shape

torch.Size([2, 10, 1024])

In [7]:
tensor2.shape

torch.Size([2, 1024, 300])

In [3]:
import torch

def bdot(a, b):
    B = a.shape[0]
    S = a.shape[1]
    print('a.shape:', a.shape)
    print('b.shape:', b.shape)
    # I removed view from here
    # TODO: check if this is correct
    return torch.bmm(a, b).reshape(-1)

In [42]:
random_tensor = torch.rand((2, 10244))
random_tensor
random_tensor.repeat_interleave(1024, dim=0).shape

torch.Size([2048, 10244])

In [58]:
random_tensor2 = torch.rand((2, 10))
random_tensor2 = torch.repeat_interleave(random_tensor2, 1024, 0)
# random_tensor2 = torch.permute(random_tensor2, (0, 2, 1))
random_tensor2.shape

torch.Size([2048, 10])

In [None]:
model.save_pretrained('/content/drive/MyDrive/FYP (Sidharrth)/Experiments/Transformers/B-features-2epochs')

### 3 epochs

In [None]:
trainer.train()

Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape


Epoch,Training Loss,Validation Loss,Acc,F1,Acc And F1,Mcc,Qwk
0,0.454938,0.475034,0.785571,0.771114,0.778343,0.645901,0.756691


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768

TrainOutput(global_step=62, training_loss=0.6014503658779206)

In [None]:
model.save_pretrained('/content/drive/MyDrive/FYP (Sidharrth)/Experiments/Transformers/B-features-3epochs')

### 4 epochs

In [None]:
trainer.train()

Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape


Epoch,Training Loss,Validation Loss,Acc,F1,Acc And F1,Mcc,Qwk
0,0.507008,0.508063,0.801603,0.790811,0.796207,0.673094,0.778689


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768

TrainOutput(global_step=62, training_loss=0.4651854850592152)

In [None]:
model.save_pretrained('/content/drive/MyDrive/FYP (Sidharrth)/Experiments/Transformers/B-features-4epochs')

### 5 epochs

In [None]:
trainer.train()

Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape


Epoch,Training Loss,Validation Loss,Acc,F1,Acc And F1,Mcc,Qwk
0,0.630662,0.558722,0.807615,0.807264,0.80744,0.687449,0.8054


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768])
Cat Feats Shape
torch.Size([2, 36])
Initializing global attention on CLS token...
Sequence Outputs Shape
torch.Size([2, 1024, 768])
Text Feats Shape
torch.Size([2, 768

TrainOutput(global_step=62, training_loss=0.36257930154040935)

In [None]:
model.save_pretrained('/content/drive/MyDrive/FYP (Sidharrth)/Experiments/Transformers/B-features-5epochs')

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir ./logs/runs --port=6006