In [1]:
!pip install -q sentence-transformers

[0m

In [2]:
import transformers
import sentence_transformers

print("Transformers:", transformers.__version__)
print("Sentence Transformers Version", sentence_transformers.__version__)


Transformers: 4.27.4
Sentence Transformers Version 2.2.2


In [3]:
import warnings
from transformers import logging

warnings.filterwarnings('ignore')
logging.set_verbosity_error()


In [4]:
import random, os
import numpy as np
import torch

def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
seed_everything(42)


In [5]:
import datasets
import pandas as pd
from ast import literal_eval

datasets.disable_caching()

data_dir = '/kaggle/input/vispamdataset-v2/preprocessed/'

def load_data(data_dir):
    train_df = pd.read_csv(data_dir + 'train.csv', converters={'categories': literal_eval})
    dev_df = pd.read_csv(data_dir + 'dev.csv', converters={'categories': literal_eval})
    test_df = pd.read_csv(data_dir + 'test.csv', converters={'categories': literal_eval})
    
    str_columns = ['comment', 'clean_comment', 'category', 'product_name', 'description', 'clean_description']
    train_df[str_columns] = train_df[str_columns].astype(str)
    dev_df[str_columns] = dev_df[str_columns].astype(str)
    test_df[str_columns] = test_df[str_columns].astype(str)
    
    train_dataset = datasets.Dataset.from_dict(train_df)
    dev_dataset = datasets.Dataset.from_dict(dev_df)
    test_dataset = datasets.Dataset.from_dict(test_df)
    dataset_dict = datasets.DatasetDict({'train': train_dataset, 'dev': dev_dataset, 'test': test_dataset})
    
    return dataset_dict

vispam_datasets = load_data(data_dir)


# Task 1

In [6]:
import os
import sys
import math
from torch.utils.data import DataLoader
from sentence_transformers import models, losses
from sentence_transformers import SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model_name = "vinai/phobert-base"
model_save_path = f'output/training_nli_{model_name.replace("/", "-")}-task-1'
word_embedding_model = models.Transformer(model_name)

pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False
)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_samples = []
for comment, description, label in zip(vispam_datasets['train']['clean_tokenized_comment'],
                                       vispam_datasets['train']['clean_tokenized_description'],
                                       vispam_datasets['train']['label']):
    train_samples.append(InputExample(texts=[comment, description], label=label))

train_batch_size = 16
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=2)

dev_samples = []
for comment, description, label in zip(vispam_datasets['dev']['clean_tokenized_comment'],
                                       vispam_datasets['dev']['clean_tokenized_description'],
                                       vispam_datasets['dev']['label']):
    dev_samples.append(InputExample(texts=[comment, description], label=label))

dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='vispam-dev-task-1')

num_epochs = 10
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

print(f"[Task 1] - Start training with {num_epochs} epochs...")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=5000,
    warmup_steps=warmup_steps,
    output_path=model_save_path
)


Downloading (…)lve/main/config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

[Task 1] - Start training with 10 epochs...


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

In [7]:
test_samples = []
for comment, description, label in zip(vispam_datasets['test']['clean_tokenized_comment'],
                                       vispam_datasets['test']['clean_tokenized_description'],
                                       vispam_datasets['test']['label']):
    test_samples.append(InputExample(texts=[comment, description], label=label))

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='vispam-test-task-1')
test_evaluator(model, output_path=model_save_path)


0.42649114560032014

# Task 2

In [8]:
import os
import sys
import math
from torch.utils.data import DataLoader
from sentence_transformers import models, losses
from sentence_transformers import SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model_name = "vinai/phobert-base"
model_save_path = f'output/training_nli_{model_name.replace("/", "-")}-task-2'
word_embedding_model = models.Transformer(model_name)

pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False
)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_samples = []
for comment, description, label in zip(vispam_datasets['train']['clean_tokenized_comment'],
                                       vispam_datasets['train']['clean_tokenized_description'],
                                       vispam_datasets['train']['spam_label']):
    train_samples.append(InputExample(texts=[comment, description], label=label))

train_batch_size = 16
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=4)

dev_samples = []
for comment, description, label in zip(vispam_datasets['dev']['clean_tokenized_comment'],
                                       vispam_datasets['dev']['clean_tokenized_description'],
                                       vispam_datasets['dev']['spam_label']):
    dev_samples.append(InputExample(texts=[comment, description], label=label))

dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='vispam-dev-task-2')

num_epochs = 10
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

print(f"[Task 2] - Start training with {num_epochs} epochs...")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=5000,
    warmup_steps=warmup_steps,
    output_path=model_save_path
)


[Task 2] - Start training with 10 epochs...


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

Iteration:   0%|          | 0/893 [00:00<?, ?it/s]

In [9]:
test_samples = []
for comment, description, label in zip(vispam_datasets['test']['clean_tokenized_comment'],
                                       vispam_datasets['test']['clean_tokenized_description'],
                                       vispam_datasets['test']['spam_label']):
    test_samples.append(InputExample(texts=[comment, description], label=label))

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='vispam-test-task-2')
test_evaluator(model, output_path=model_save_path)


0.4242604130920679