In [None]:
!pip install sentence_transformers

from google.colab import drive
drive.mount('/content/drive')

from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util
from sentence_transformers.datasets import SentenceLabelDataset
from torch.utils.data import DataLoader
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime

import torch
import numpy as np
import pandas as pd

import logging
import os
import random
from collections import defaultdict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 30.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 80.2 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 91.2 MB/s 
Collecting tokenizers

In [None]:
# Report whether PyTorch can see a CUDA device in this runtime.
print(torch.cuda.is_available())

True


In [None]:

# Configure root logging: INFO level, timestamped "<time> - <message>" lines,
# emitted through the LoggingHandler imported from sentence_transformers.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

# Inspired by torchnlp
def trec_dataset(
    train_path='/content/drive/MyDrive/TCC/df_train.txt',
    test_path='/content/drive/MyDrive/TCC/df_test.txt',
    validation_dataset_nb=100,
    seed=42,
):
    """Load the train/test CSV files and build training examples and eval triplets.

    The function name is kept so existing callers keep working; the files read
    here are CSVs that must provide a ``Summary`` column (the text) and a
    ``Genre_Class`` column (the label, stored as a string whose surrounding
    square brackets are stripped before conversion to ``int``).

    Any other column in the files (e.g. ``Embedding``) is ignored: nothing
    returned by this function depends on it.

    Args:
        train_path: CSV file with the training rows.
        test_path: CSV file with the test rows.
        validation_dataset_nb: Number of examples taken from the *end* of the
            training set to form the dev set. ``0`` (or less) disables the
            dev split and yields an empty list of dev triplets.
            NOTE(review): the split is positional, not shuffled — if the
            training file is sorted by label the dev set may be unbalanced.
        seed: Seed applied to the global ``random`` module right before the
            triplets are sampled, so dev/test triplets are reproducible.

    Returns:
        A tuple ``(train_set, dev_triplets, test_triplets)`` where
        ``train_set`` is a list of single-text ``InputExample`` objects with a
        label, and the two triplet lists come from
        ``triplets_from_labeled_dataset``.
    """

    def parse_label(raw):
        # Drop the surrounding square brackets, then convert to int.
        return int(raw.strip('[]'))

    def load_examples(path):
        frame = pd.read_csv(path)
        labels = frame['Genre_Class'].apply(parse_label)
        # The row index doubles as the example guid, as in the original loop.
        return [
            InputExample(guid=index, texts=[summary], label=label)
            for index, summary, label in zip(frame.index, frame['Summary'], labels)
        ]

    train_set = load_examples(train_path)
    test_set = load_examples(test_path)

    # Carve the dev set out of the tail of the training set.
    if validation_dataset_nb > 0:
        dev_set = train_set[-validation_dataset_nb:]
        train_set = train_set[:-validation_dataset_nb]
    else:
        dev_set = []

    # For dev & test set, we return triplets (anchor, positive, negative).
    # Fix the seed so that we always get the same triplets.
    random.seed(seed)
    dev_triplets = triplets_from_labeled_dataset(dev_set)
    test_triplets = triplets_from_labeled_dataset(test_set)

    return train_set, dev_triplets, test_triplets


def triplets_from_labeled_dataset(input_examples):
    """Create (anchor, positive, negative) triplets from labeled examples.

    Each example is used once as an anchor. The positive is drawn at random
    from the examples sharing the anchor's label (and having a different
    ``guid``); the negative is drawn at random from the whole input until an
    example with a different label is found.

    Anchors for which no valid positive exists are skipped. If the input is
    empty/``None`` or contains fewer than two distinct labels, no negative can
    ever be found, so an empty list is returned instead of sampling forever.

    Args:
        input_examples: Sequence of objects exposing ``guid``, ``label`` and
            ``texts`` (only ``texts[0]`` is used).

    Returns:
        A list of ``InputExample`` objects whose ``texts`` are
        ``[anchor_text, positive_text, negative_text]``.
    """
    if input_examples is None or len(input_examples) == 0:
        return []

    triplets = []
    label2sentence = defaultdict(list)
    label2guids = defaultdict(set)

    for inp_example in input_examples:
        label2sentence[inp_example.label].append(inp_example)
        label2guids[inp_example.label].add(inp_example.guid)

    # With a single label the negative rejection loop below could never end.
    if len(label2sentence) < 2:
        return []

    for inp_example in input_examples:
        anchor = inp_example

        # We need at least 2 examples with distinct guids per label to create
        # a triplet; otherwise the positive rejection loop could never end.
        if len(label2guids[anchor.label]) < 2:
            continue

        positive = None
        while positive is None or positive.guid == anchor.guid:
            positive = random.choice(label2sentence[anchor.label])

        negative = None
        while negative is None or negative.label == anchor.label:
            negative = random.choice(input_examples)

        triplets.append(InputExample(texts=[anchor.texts[0], positive.texts[0], negative.texts[0]]))

    return triplets





### Create a torch.DataLoader that passes training batch instances to our model

# Run configuration: where checkpoints go (timestamped per run) and the
# basic training hyper-parameters.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_path = "/content/drive/MyDrive/TCC/finetune-batch-hard-trec-" + run_stamp
num_epochs = 3
train_batch_size = 16

logging.info("Loading TREC dataset")
train_set, dev_set, test_set = trec_dataset()

# Wrap the training examples in a SentenceLabelDataset before batching them;
# incomplete trailing batches are dropped by the DataLoader.
train_data_sampler = SentenceLabelDataset(train_set)
train_dataloader = DataLoader(
    dataset=train_data_sampler,
    batch_size=train_batch_size,
    drop_last=True,
)
# Number of batches per epoch.
print(len(train_dataloader))

2022-07-23 00:26:12 - Loading TREC dataset
(10, [' The few thousand humans survive in a nebula of relatively breathable air, existing in divided communities. The society is highly stratified, with the elite living on the  Raft   the remains of the starship that contains almost all the high technology , workers miners living on various  Belt  worlds  where they mine burned out star kernels , and the  Boneys , a nomadic band of  unmentionables  who live on worlds created out of corpses. It is unknown exactly how the humans came to the universe, but hints within the story indicate that the Raft ship came through a rift in our universe into this alternate reality  though whether intentionally or unintentionally is never specified . A glimpse of the high gravity universe is seen in the book Ring, implying that the humans in Raft came from the main universe of the Xeelee Sequence, although during which time period they escaped is not clear. The alternate universe the humans live in follows s

In [None]:

# Name of the pre-trained checkpoint handed to SentenceTransformer below.
# Other huggingface/transformers pre-trained models can be specified here,
# for example bert-base-uncased, roberta-base, xlm-roberta-base.
model_name = 'all-distilroberta-v1'

# Load pretrained model
logging.info("Load model")
model = SentenceTransformer(model_name)
# Override the model's max_seq_length attribute.
# NOTE(review): presumably chosen so long summaries are not cut short —
# confirm the checkpoint supports 512 positions.
model.max_seq_length = 512
print("Max Sequence Length:", model.max_seq_length)

### Triplet losses ####################
### There are 4 triplet loss variants:
### - BatchHardTripletLoss
### - BatchHardSoftMarginTripletLoss
### - BatchSemiHardTripletLoss
### - BatchAllTripletLoss
#######################################

# Exactly one of the variants is active; the others are kept for quick switching.
#train_loss = losses.BatchAllTripletLoss(model=model)
#train_loss = losses.BatchHardTripletLoss(model=model)
#train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)
train_loss = losses.BatchSemiHardTripletLoss(model=model)


# Build the dev evaluator from the dev triplets returned by trec_dataset().
logging.info("Read TREC val dataset")
dev_evaluator = TripletEvaluator.from_input_examples(dev_set, name='trec-dev')

# Baseline: score the untouched pre-trained model on the dev triplets.
logging.info("Performance before fine-tuning:")
dev_evaluator(model)

# Warm-up length: 5% of the total number of training batches
# (batches per epoch * number of epochs), truncated to an int.
warmup_steps = int(len(train_dataloader) * num_epochs  * 0.05)  # 5% of total training steps
print(warmup_steps)

2022-07-23 00:26:18 - Load model
2022-07-23 00:26:18 - Load pretrained SentenceTransformer: all-distilroberta-v1
2022-07-23 00:26:18 - Use pytorch device: cuda
Max Sequence Length: 512
2022-07-23 00:26:18 - Read TREC val dataset
2022-07-23 00:26:18 - Performance before fine-tuning:
2022-07-23 00:26:18 - TripletEvaluator: Evaluating the model on trec-dev dataset:
2022-07-23 00:26:20 - Accuracy Cosine Distance:   	60.00
2022-07-23 00:26:20 - Accuracy Manhattan Distance:	62.00
2022-07-23 00:26:20 - Accuracy Euclidean Distance:	60.00

24


In [None]:
# Train the model with the triplet loss, evaluating on the dev triplets and
# writing checkpoints to output_path.
# NOTE(review): evaluation_steps=1000 is larger than the 161 iterations per
# epoch shown in the recorded output, so presumably the evaluator only runs
# at the end of each epoch — confirm this is intended.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=output_path,
)


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/161 [00:00<?, ?it/s]

2022-07-23 00:27:25 - TripletEvaluator: Evaluating the model on trec-dev dataset after epoch 0:
2022-07-23 00:27:27 - Accuracy Cosine Distance:   	85.00
2022-07-23 00:27:27 - Accuracy Manhattan Distance:	85.00
2022-07-23 00:27:27 - Accuracy Euclidean Distance:	85.00

2022-07-23 00:27:27 - Save model to /content/drive/MyDrive/TCC/finetune-batch-hard-trec-2022-07-23_00-26-12


Iteration:   0%|          | 0/161 [00:00<?, ?it/s]

2022-07-23 00:28:29 - TripletEvaluator: Evaluating the model on trec-dev dataset after epoch 1:
2022-07-23 00:28:31 - Accuracy Cosine Distance:   	91.00
2022-07-23 00:28:31 - Accuracy Manhattan Distance:	88.00
2022-07-23 00:28:31 - Accuracy Euclidean Distance:	91.00

2022-07-23 00:28:31 - Save model to /content/drive/MyDrive/TCC/finetune-batch-hard-trec-2022-07-23_00-26-12


Iteration:   0%|          | 0/161 [00:00<?, ?it/s]

2022-07-23 00:29:34 - TripletEvaluator: Evaluating the model on trec-dev dataset after epoch 2:
2022-07-23 00:29:35 - Accuracy Cosine Distance:   	90.00
2022-07-23 00:29:35 - Accuracy Manhattan Distance:	90.00
2022-07-23 00:29:35 - Accuracy Euclidean Distance:	90.00



In [None]:

##############################################################################
#
# Evaluate the in-memory fine-tuned model on the test triplets, then save it.
# (Reloading a stored checkpoint is left commented out below.)
#
##############################################################################

logging.info("Evaluating model on test set")
#model_saved = SentenceTransformer('/content/drive/MyDrive/TCC/finetune-batch-hard-trec-2022-07-23_00-26-12')
# Alias only: model_saved and model are the same object here.
# NOTE(review): the recorded training log shows no "Save model" line after the
# last epoch, so the in-memory model may differ from the checkpoint written to
# output_path — confirm which one should be reported.
model_saved = model
test_evaluator = TripletEvaluator.from_input_examples(test_set, name='trec-test')
model_saved.evaluate(test_evaluator)

# Persist the in-memory model to a fixed location on Drive.
model.save('/content/drive/MyDrive/TCC/finetune-test')

2022-07-23 00:43:51 - Evaluating model on test set
2022-07-23 00:43:51 - TripletEvaluator: Evaluating the model on trec-test dataset:
2022-07-23 00:44:07 - Accuracy Cosine Distance:   	84.49
2022-07-23 00:44:07 - Accuracy Manhattan Distance:	84.23
2022-07-23 00:44:07 - Accuracy Euclidean Distance:	84.49

2022-07-23 00:44:07 - Save model to /content/drive/MyDrive/TCC/finetune-test


In [None]:
# Reload the model saved to Drive and repeat the test evaluation, so the
# score of the on-disk copy can be compared with the in-memory one.
logging.info("Evaluating model on test set")
model_saved = SentenceTransformer('/content/drive/MyDrive/TCC/finetune-test')
test_evaluator = TripletEvaluator.from_input_examples(test_set, name='trec-test')
# Last expression of the cell: its return value is shown as the cell output.
model_saved.evaluate(test_evaluator)

2022-07-23 00:44:25 - Evaluating model on test set
2022-07-23 00:44:25 - Load pretrained SentenceTransformer: /content/drive/MyDrive/TCC/finetune-test
2022-07-23 00:44:26 - Use pytorch device: cuda
2022-07-23 00:44:26 - TripletEvaluator: Evaluating the model on trec-test dataset:
2022-07-23 00:44:42 - Accuracy Cosine Distance:   	84.49
2022-07-23 00:44:42 - Accuracy Manhattan Distance:	84.23
2022-07-23 00:44:42 - Accuracy Euclidean Distance:	84.49



0.8449477351916377