In [3]:
import urllib.request
import zipfile
import os
# Directory that receives the downloaded archives and their extracted contents.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your environment.
folder_path = '/notebooks/owen/QS-Notebooks'
print('Beginning download of datasets')

datasets = ['stsbenchmark.zip']
server = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/"

# urlretrieve writes straight to the target file and does not create missing
# parent directories, so make sure the destination exists first.
os.makedirs(folder_path, exist_ok=True)

for dataset in datasets:
    print("Download", dataset)
    url = server + dataset
    dataset_path = os.path.join(folder_path, dataset)
    try:
        urllib.request.urlretrieve(url, dataset_path)

        print("Extract", dataset)
        with zipfile.ZipFile(dataset_path, "r") as zip_ref:
            zip_ref.extractall(folder_path)
    finally:
        # Remove the archive even when the download or extraction fails part-way,
        # so a partial or corrupt zip is never left behind for the next run.
        if os.path.exists(dataset_path):
            os.remove(dataset_path)


print("All datasets downloaded and extracted")

Beginning download of datasets
Download stsbenchmark.zip
Extract stsbenchmark.zip
All datasets downloaded and extracted


In [4]:
import math
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer,  SentencesDataset, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSDataReader

In [77]:
# Training configuration, plus the reader used to parse STS-style TSV files.
model_name = 'distilbert-base-nli-stsb-mean-tokens'
train_batch_size = 16
num_epochs = 4
model_save_path = 'test1'
# Reuse the directory the datasets were downloaded into (`folder_path`, set in
# the download cell) instead of repeating the absolute path a second time.
sts_reader = STSDataReader(folder_path, normalize_scores=True)

In [6]:
# Load the pre-trained sentence-transformer model identified by `model_name`.
# NOTE(review): the progress bar recorded below suggests the weights are
# downloaded on first use — confirm caching behaviour for offline runs.
model = SentenceTransformer(model_name)

100%|██████████| 245M/245M [00:29<00:00, 8.43MB/s] 


In [7]:
import pandas as pd
# The two text columns that form each (title, reply) pair.
text_columns = ['c_title', 'c_reply']
# Empty placeholder columns put in front of the text columns so that the score
# lands at position 4 and the two texts at positions 5 and 6.
# NOTE(review): presumably this mirrors the STS benchmark TSV layout that
# STSDataReader parses — confirm against the reader's column-index defaults.
placeholder_columns = ['a', 'b', 'c', 'd', 'e']

df = (
    pd.read_csv('SO_clean_50k.csv')[text_columns]
    .reindex(columns=placeholder_columns + text_columns)
    # Every pair gets the same score of 1.
    # NOTE(review): sts_reader is built with normalize_scores=True; if it rescales
    # from a 0-5 range, a score of 1 becomes 0.2 rather than 1.0 — confirm intent.
    .assign(e=1)
)
df.head()

In [30]:
# Draw a random subset of at most 10000 rows for training. The seed is pinned so
# that a Restart & Run All selects the same rows every time, and the size is
# capped at len(df) so a smaller input frame does not raise.
sample = df.sample(n=min(10000, len(df)), random_state=42).reset_index(drop=True)
sample.head()

Unnamed: 0,a,b,c,d,e,c_title,c_reply
0,,,,,1,"""pip install unroll"": ""python setup.py egg_inf...",This was the easier way for me: pip2 install ...
1,,,,,1,A const std::function wraps a non-const operat...,Is it normal that a const std::function may...
2,,,,,1,.NET method to convert a string to sentence case,A solution in F#: open System let proper (x ...
3,,,,,1,ABAP if statement,The problem is that you haven't left a space b...
4,,,,,1,AIX - awk display first occurence of each match,"awk approach: awk '$1==""SUCCESS"" ;; !a[$2,$3]..."


In [31]:
# Show the full reply text of the row labelled 4 (the table view truncates it).
print(sample.loc[4, "c_reply"])

awk approach:  awk '$1=="SUCCESS" ;; !a[$2,$3]++{print $2,$3}' RESULTS.txt   The output:  JON DOE JANE DOE     Alternative approach using sed + uniq pipeline:  sed -n 's/SUCCESS \(.*\)/\1/p' RESULTS.txt | uniq  


In [40]:
# Drop rows where either text field is missing: the newline clean-up that follows
# calls str.replace on every title and reply and would fail on a NaN float.
sample = sample.dropna(subset=["c_title", "c_reply"])
# Report the shape of the filtered frame (the original referenced `df1`, which is
# not defined anywhere in this notebook).
sample.shape

(9999, 7)

In [41]:
%%time
def no_nl(text):
  """Return `text` with line breaks and tabs each replaced by a single space.

  The cleaned text is later written as one record per line to a tab-delimited,
  unquoted file, so a field must not contain "\\n", "\\r" or "\\t". Substituting a
  space (rather than deleting the character) keeps the words on either side of a
  line break from being fused together.

  Args:
    text: the string to clean.

  Returns:
    A new string with no carriage returns, newlines or tabs.
  """
  # "\r\n" goes first so a Windows line ending collapses to one space, not two.
  for control_char in ("\r\n", "\n", "\r", "\t"):
    text = text.replace(control_char, " ")
  return text

# Clean both text columns with no_nl. `assign` builds a new frame, which avoids
# attribute-style column assignment on a frame that was produced by filtering
# (a pattern that can trigger pandas' SettingWithCopyWarning).
sample = sample.assign(
  c_title=sample.c_title.apply(no_nl),
  c_reply=sample.c_reply.apply(no_nl),
)

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 11.7 ms


In [42]:
import csv

# Write the sample as a headerless, tab-delimited, unquoted file: one record per
# row, column values only (no index column). The earlier `sample.to_csv(...)`
# call was dropped because the `open(..., "w")` below truncated its output
# immediately, so it never had any effect.
skipped_rows = 0
# newline="" is what the csv module asks for on files it writes to; the encoding
# is made explicit so the result does not depend on the platform locale.
with open("sample.csv", "w", newline="", encoding="utf-8") as f:
  csv_writer = csv.writer(f, delimiter="\t", quoting=csv.QUOTE_NONE)
  for row in sample.itertuples(index=False, name=None):
    try:
      csv_writer.writerow(row)
    except csv.Error:
      # With QUOTE_NONE and no escapechar the writer cannot represent a field
      # that contains the delimiter or a line break. Skip such rows as before,
      # but count them instead of swallowing every possible exception.
      skipped_rows += 1

if skipped_rows:
  print("Skipped", skipped_rows, "rows that could not be written without quoting")

In [48]:
%%time
# Convert the datasets to DataLoaders ready for training
print("Read training dataset (sample.csv)")
train_data = SentencesDataset(sts_reader.get_examples('sample.csv'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


print("Read STSbenchmark dev dataset")
# Evaluate on the held-out STS benchmark dev split instead of re-reading
# sample.csv: that file is the training data, and every row in it carries the
# same score, so a similarity evaluation against it is not informative.
# NOTE(review): assumes the extracted archive contains stsbenchmark/sts-dev.csv
# alongside the sts-test.csv used in the final evaluation cell — confirm.
dev_data = SentencesDataset(examples=sts_reader.get_examples('stsbenchmark/sts-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

Read STSbenchmark train dataset
Read STSbenchmark dev dataset
CPU times: user 13.6 s, sys: 0 ns, total: 13.6 s
Wall time: 13.6 s


In [79]:
import os

# Create the model output directory. `exist_ok=True` keeps the cell re-runnable
# (os.mkdir raises FileExistsError the second time), and using `model_save_path`
# keeps the directory in sync with the path given to fit() and save().
os.makedirs(model_save_path, exist_ok=True)

In [None]:
%%time
%%capture

# Configure the training. The evaluator is run every `evaluation_steps` steps.
print("warming up...")
# len(train_dataloader) is already the number of batches per epoch, so the total
# number of optimisation steps is batches * epochs; warm up over 10% of them.
# (Dividing by the batch size again made the warm-up far shorter than 10%.)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

print("training...")
# Train the model for `num_epochs` epochs, the same value used for warmup_steps.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

In [None]:
# Explicitly write the current in-memory model to `model_save_path`, the same
# location that was passed to fit() as `output_path`.
model.save(model_save_path)

In [None]:
%%time
# Reload the saved model from disk and score it on the STS benchmark test split.
model = SentenceTransformer(model_save_path)
test_examples = sts_reader.get_examples("stsbenchmark/sts-test.csv")
test_data = SentencesDataset(examples=test_examples, model=model)
# Order is kept fixed for evaluation; the training batch size is reused.
test_dataloader = DataLoader(test_data, batch_size=train_batch_size, shuffle=False)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
model.evaluate(evaluator)

In [None]:
# !tar -zcvf model_stuff.tar.gz /content/output/test1