In [1]:
# Install the third-party packages this notebook imports.
# `%pip` (instead of `!pip`) installs into the environment of the running
# kernel, and `-q` keeps the download log out of the saved notebook.
# TODO: pin exact versions so a later re-run resolves the same packages.
%pip install -q datasets transformers sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 30.9 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 70.6 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 72.4 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 62.2 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 67.7 MB/s 
Collecting mul

In [6]:
# List the GPUs visible to this runtime (one line per device, with its UUID).
!nvidia-smi --list-gpus

GPU 0: Tesla T4 (UUID: GPU-377ead66-67b4-e641-b979-66f495b9afb7)


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import datasets
from datasets import load_dataset, Dataset, DatasetDict
# Load the sentence-pair table. The first CSV column is read as the index and
# then discarded by reset_index(drop=True), leaving a fresh 0..n-1 index.
# Columns are renamed to the premise / hypothesis / label vocabulary used by
# the rest of the notebook, and the label is cast to float.
df = (
    pd.read_csv("train.csv", index_col=0)
    .reset_index(drop=True)
    .astype({'is_duplicate': 'float'})
    .rename(columns={'name_1': 'premise', 'name_2': 'hypothesis', "is_duplicate": "label"})
)

# Stratified 60 / 20 / 20 split: hold out 40% of the rows first, then cut that
# hold-out in half to obtain the test and validation sets. The fixed
# random_state makes both splits reproducible.
train, holdout = train_test_split(
    df, test_size=0.4, random_state=0, stratify=df['label']
)
test, val = train_test_split(
    holdout, test_size=0.5, random_state=0, stratify=holdout['label']
)

In [6]:
# Wrap the three pandas splits in a DatasetDict keyed by split name.
# preserve_index=False keeps the pandas index out of the resulting datasets.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (("train", train), ("test", test), ("validation", val))
})

In [7]:
from sentence_transformers import InputExample, SentenceTransformer, losses, evaluation

def build_input_examples(split):
    """Convert one dataset split into a list of ``InputExample`` objects.

    Args:
        split: Iterable of row mappings, each providing the keys
            ``'premise'``, ``'hypothesis'`` and ``'label'``.

    Returns:
        A list with one ``InputExample`` per row, in the split's own order.
        The list is empty when the split has no rows.
    """
    return [
        InputExample(texts=[row['premise'], row['hypothesis']], label=row['label'])
        for row in split
    ]


# Every row of every split is converted; no sub-sampling is applied here.
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

# Row counts per split, kept under their original names.
train_n_examples = train_data.num_rows
val_n_examples = val_data.num_rows
test_n_examples = test_data.num_rows

train_examples = build_input_examples(train_data)
val_examples = build_input_examples(val_data)
test_examples = build_input_examples(test_data)

In [8]:
# Start from the pretrained 'distilbert-base-nli-mean-tokens' checkpoint.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Evaluator passed to model.fit: built from the validation split's sentence
# pairs and their labels.
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=val['premise'].to_numpy(),
    sentences2=val['hypothesis'].to_numpy(),
    scores=val['label'].to_numpy(),
)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [14]:
from torch.utils.data import DataLoader
# Batches of 64 training examples, reshuffled on every pass over the data.
train_dataloader = DataLoader(dataset=train_examples, batch_size=64, shuffle=True)
# Training objective: cosine-similarity loss bound to the model being tuned.
train_loss = losses.CosineSimilarityLoss(model=model)

In [None]:
# Fine-tune the model on the training pairs for 50 epochs with 100 warm-up
# steps, scoring it with `evaluator` and requesting that the best model be
# saved under `output_path`.
# NOTE(review): the captured progress bars show 4668 iterations per epoch, so
# evaluation_steps=18668 is larger than a single epoch -- confirm this is the
# intended evaluation interval.
# NOTE(review): assumes Google Drive is already mounted at /content/drive; no
# mount cell is visible in this notebook -- TODO confirm.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=50,
    warmup_steps=100,
    evaluation_steps=18668,
    output_path='/content/drive/MyDrive/bert-model',
    save_best_model=True,
)

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4668 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4668 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4668 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4668 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4668 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4668 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4668 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4668 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4668 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4668 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4668 [00:00<?, ?it/s]

In [9]:
# Reload the fine-tuned checkpoint written by the training cell.
# `model.load(path)` returns a newly constructed SentenceTransformer and does
# not modify the instance it is called on, so the result must be rebound to
# `model`; otherwise the evaluation cells would score whatever `model`
# previously held instead of the saved checkpoint.
model = SentenceTransformer("/content/drive/MyDrive/bert-model")
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [34]:
# Binary-classification evaluator over the held-out test pairs. It replaces
# the validation evaluator previously bound to `evaluator`.
evaluator = evaluation.BinaryClassificationEvaluator(
    sentences1=list(test['premise'].to_numpy()),
    sentences2=list(test['hypothesis'].to_numpy()),
    labels=list(test['label'].to_numpy()),
    show_progress_bar=True,
    write_csv=True,
)

In [36]:
# Run the test-set evaluator against the current `model`; 'class_eval' is
# passed as the evaluator's output location.
scores = model.evaluate(evaluator, output_path='class_eval')

Batches:   0%|          | 0/531 [00:00<?, ?it/s]