# Memorability from improved tokenizers

We shall add tokens to (or remove tokens from) a pretrained BERT tokenizer,
and learn the memorability prediction task from the resulting embeddings.

In [2]:
from torch.nn import MSELoss
from dataset_definition.videomem import VideoMem
from data_preprocess.datasets import SentencesDataset
from data_preprocess.tokenizers import CustomBertTokenizer
from metrics.regression_metrics import RegressionMetrics
from neural_models.transformers import CustomBert
from workflow.kfolds import KFoldExperiment

# --- Experiment configuration ------------------------------------------------
DATASET_PATH = "/Users/ricardokleinlein/Desktop/Thesis/kikaiGakushu/DATA/videomem.csv"
LABELS_FIELD = "short-term_memorability"  # column of the dataframe used as target
CONTROL_METRIC = "loss"                   # forwarded as `monitor_metric`
BATCH_SIZE = 8
PATIENCE = 5

# --- Data ----------------------------------------------------------------------
videomem = VideoMem(DATASET_PATH)

# --- Tokenizer -----------------------------------------------------------------
# Start from the pretrained 'bert-base-uncased' tokenizer and let
# `add_tokens_tf` extend it from the video descriptions (top_n=50).
# NOTE(review): presumably "tf" stands for term frequency -- confirm in
# CustomBertTokenizer.
tokenizer_wrap = CustomBertTokenizer("bert-base-uncased")
description_corpus = videomem.df["description"]
added_vocab = tokenizer_wrap.add_tokens_tf(description_corpus, top_n=50)

# --- Model ---------------------------------------------------------------------
# Single-output model; its embeddings layer is resized to the length of the
# tokenizer wrapper once the new tokens have been added.
learning_model = CustomBert(num_classes=1)
vocabulary_size = len(tokenizer_wrap)
learning_model.resize_embeddings_layer(vocabulary_size)

# --- K-fold experiment ---------------------------------------------------------
regression_metrics = RegressionMetrics()
experiment_settings = {
    "data_reader": SentencesDataset,
    "metrics": regression_metrics,
    "monitor_metric": CONTROL_METRIC,
    "patience": PATIENCE,
    "tokenizer": tokenizer_wrap.tokenizer,
}
experiment = KFoldExperiment(**experiment_settings)

# Descriptions are the inputs, the memorability column is the regression
# target, and the mean squared error is the training loss.
run_arguments = {
    "X": videomem.df["description"],
    "target": videomem.df[LABELS_FIELD],
    "model": learning_model,
    "loss_fn": MSELoss(),
    "batch_size": BATCH_SIZE,
}
results_logging = experiment.run(**run_arguments)

Loading dataset VideoMem from /Users/ricardokleinlein/Desktop/Thesis/kikaiGakushu/DATA/videomem.csv
['addo', 'aerialof', 'bikers', 'businesspeople', 'businesswoman', 'campfire', 'cellphone', 'climber', 'closeup', 'commuting', 'dci', 'decorating', 'dollyof', 'gangsters', 'headphones', 'headset', 'jellyfish', 'jog', 'kayak', 'khr', 'lapse', 'lettuce', 'motionof', 'mov', 'panning', 'panningof', 'peddling', 'piggyback', 'pov', 'selfie', 'slowmotion', 'steadicam', 'steadicamof', 'steadycam', 'summertime', 'sunbed', 'sunlit', 'tasty', 'texting', 'timelapse', 'touchpad', 'trackingof', 'trendy', 'umbrellas', 'upof', 'videoblocks', 'welness', 'wideof', 'womans', 'zooming']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[NEW FOLD: 1/5]
Computing initial model performance...


  7%|▋         | 11/160 [02:53<39:05, 15.74s/it]


KeyboardInterrupt: 