In [3]:
import urllib.request
import zipfile
import os
# Download each dataset archive into folder_path, unpack it there, and delete
# the archive once it has been extracted.
folder_path = '/notebooks/owen/QS-Notebooks'
# urlretrieve does not create missing parent directories, so make sure the
# target folder exists before the first download.
os.makedirs(folder_path, exist_ok=True)
print('Beginning download of datasets')

datasets = ['stsbenchmark.zip']
server = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/"

for dataset in datasets:
    print("Download", dataset)
    url = server + dataset
    dataset_path = os.path.join(folder_path, dataset)
    urllib.request.urlretrieve(url, dataset_path)

    print("Extract", dataset)
    with zipfile.ZipFile(dataset_path, "r") as zip_ref:
        zip_ref.extractall(folder_path)
    # Only the extracted files are needed afterwards; remove the archive.
    os.remove(dataset_path)


print("All datasets downloaded and extracted")

Beginning download of datasets
Download stsbenchmark.zip
Extract stsbenchmark.zip
All datasets downloaded and extracted


In [4]:
import math
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer,  SentencesDataset, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSDataReader

In [5]:
# Training configuration and the reader for the STS benchmark files
model_name = 'distilbert-base-nli-stsb-mean-tokens'
train_batch_size = 16
num_epochs = 2
# Save the fine-tuned model into its own sub-folder instead of the folder that
# already holds the notebooks and the extracted dataset.
# NOTE(review): sentence-transformers releases that ship STSDataReader appear to
# reject a non-empty output_path in fit() -- confirm against the installed version.
model_save_path = '/notebooks/owen/QS-Notebooks/output/finetuned-model'
# Absolute path to the folder the download cell extracts into, so the reader
# does not depend on the kernel's current working directory.
sts_reader = STSDataReader('/notebooks/owen/QS-Notebooks/stsbenchmark', normalize_scores=True)

In [6]:
# Load the pre-trained sentence transformer identified by model_name.
# (The progress bar in the cell output is presumably the checkpoint download.)
model = SentenceTransformer(model_name)

100%|██████████| 245M/245M [00:29<00:00, 8.43MB/s] 


In [7]:
import pandas as pd
# Load SO_clean_50k.csv into a DataFrame; the path is relative to the kernel's
# current working directory.
df = pd.read_csv('SO_clean_50k.csv')

In [8]:
# Preview the first two rows to check which columns were read.
df.head(2)

Unnamed: 0.1,Unnamed: 0,score,parent_id,c_title,c_reply
0,0,8,12692067,(and other unicode characters) in identifiers...,This is a known bug in GCC: Bug 67224 - UTF-8 ...
1,1,2,2996139,500 Internal Server Error in ASP.NET MVC,I got more details of the error from windows e...


In [None]:
# Drop the leftover index column (the first column in the preview above) by
# name. Dropping df.columns[0] positionally would remove one more column every
# time this cell is re-run; errors='ignore' makes a re-run a no-op instead.
df = df.drop(columns=['Unnamed: 0.1'], errors='ignore')

In [None]:
# Draw a fixed-size random subset and renumber its rows from 0.
# random_state pins the draw so a restart-and-run-all yields the same sample.
sample = df.sample(n=10000, random_state=42).reset_index(drop=True)
sample.head()

In [None]:
# The frame has named columns (see the df.head output), so the integer label 5
# raises KeyError. Cast the numeric score column instead.
# NOTE(review): assumes column 5 was meant to be 'score' -- confirm.
sample['score'] = sample['score'].astype(int)

In [None]:
%%time
def no_nl(text):
  return text.replace("\n", "")

sample[6] = sample[6].apply(no_nl)
sample[7] = sample[7].apply(no_nl)

In [None]:
import csv
# sample.to_csv('sample.csv', sep='\t', header=False)
# Write the sample as an unquoted, tab-separated file, one row per line.
# newline="" is what the csv module expects for files it writes, and the
# encoding is pinned so the output does not depend on the platform default.
skipped_rows = 0
with open("sample.csv", "w", newline="", encoding="utf-8") as f:
  csv_writer = csv.writer(f, delimiter="\t", quoting=csv.QUOTE_NONE)
  for idx, row in sample.iterrows():
    try:
      csv_writer.writerow(row)
    except csv.Error:
      # With QUOTE_NONE and no escapechar the writer raises csv.Error for any
      # field that would need escaping (e.g. one containing a tab). Such rows
      # are still skipped on purpose, but counted instead of silently dropped;
      # any other exception now surfaces.
      skipped_rows += 1
print("Skipped rows that could not be written:", skipped_rows)

In [None]:
%%time
# Convert the dataset to a DataLoader ready for training
print("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('/content/sample.csv'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
# train_dataloader = DataLoader(train, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


print("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('/content/sample.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
# dev_dataloader = DataLoader(test, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

In [None]:
%%time
%%capture
# Configure the training. We skip evaluation in this example
print("warming up...")
warmup_steps = math.ceil(len(train_dataloader)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up

print("training...")
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

In [None]:
# Reload the model from model_save_path and score it on the STS benchmark
# test file.
model = SentenceTransformer(model_save_path)

# Examples come from "sts-test.csv" via the reader.
test_examples = sts_reader.get_examples("sts-test.csv")
test_data = SentencesDataset(model=model, examples=test_examples)
test_dataloader = DataLoader(
    test_data,
    batch_size=train_batch_size,
    shuffle=False,
)

evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
model.evaluate(evaluator)

In [None]:
# !tar -zcvf model_stuff.tar.gz /content/output/test1