In [3]:
import urllib.request
import zipfile
import os
folder_path = '/notebooks/owen/QS-Notebooks'
print('Beginning download of datasets')

datasets = ['stsbenchmark.zip']
server = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/"

for dataset in datasets:
    print("Download", dataset)
    url = server + dataset
    dataset_path = os.path.join(folder_path, dataset)
    urllib.request.urlretrieve(url, dataset_path)

    print("Extract", dataset)
    with zipfile.ZipFile(dataset_path, "r") as zip_ref:
        zip_ref.extractall(folder_path)
    os.remove(dataset_path)


print("All datasets downloaded and extracted")

Beginning download of datasets
Download stsbenchmark.zip
Extract stsbenchmark.zip
All datasets downloaded and extracted


In [155]:
import math
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer,  SentencesDataset, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSDataReader

In [156]:
# Read the dataset
model_name = 'distilbert-base-nli-stsb-mean-tokens'
train_batch_size = 16
num_epochs = 2
model_save_path = 'test1'
sts_reader = STSDataReader('/notebooks/owen/QS-Notebooks', normalize_scores=True)

In [157]:
# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name)

In [158]:
import pandas as pd
df = pd.read_csv('SO_clean_5M.csv', sep='\t', lineterminator='\n')

# Format columns.
df = df[['title', 'reply']]
df = df.reindex(columns=(list('abcde') + list(df)))
df.e = 1
df.head()

# Generate some bad replies.
df_bad_reply = df.copy()
df_bad_reply = df_bad_reply.reply.shift(5)
df_bad_reply = df_bad_reply.dropna()
df_bad_reply.e = 0

# Combine good and bad replies.
df = df.append(df_bad_reply)
df = df.reset_index(drop=True)

Unnamed: 0,a,b,c,d,e,title,reply
0,,,,,1,(and other unicode characters) in identifiers...,This is a known bug in GCC: Bug 67224 - UTF-8 ...
1,,,,,1,500 Internal Server Error in ASP.NET MVC,I got more details of the error from windows e...
2,,,,,1,Cannot find yasm even though I have installed...,"Just in case of someone got here from Google, ..."
3,,,,,1,Embed Youtube videos :- with contains content...,You can use while initializing youtube sdk:En...
4,,,,,1,Embed Youtube videos :- with contains content...,"If you're embedding on a mobile app, you need ..."


In [123]:
# sample = df.sample(10000)
# sample = sample.reset_index(drop=True, inplace=False)
# sample.head()
# print(sample.c_reply[4])

In [159]:
# sample = sample.dropna(subset=['c_title', 'c_reply'])
df = df.dropna(subset=['title', 'reply'])
df.isna().sum()

a        4928574
b        4928574
c        4928574
d        4928574
e              0
title          0
reply          0
dtype: int64

In [160]:
%%time
def no_nl(text):
  return text.replace("\n", "")

# sample.c_title = sample.c_title.apply(no_nl)
# sample.c_reply = sample.c_reply.apply(no_nl)
df.title = df.title.apply(no_nl)
df.reply = df.reply.apply(no_nl)

CPU times: user 3.35 s, sys: 38.7 ms, total: 3.39 s
Wall time: 3.39 s


In [161]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, random_state=42)

In [162]:
import csv

# sample.to_csv('sample.csv', sep='\t', header=False)
train.to_csv('train.csv', sep='\t', header=False)
test.to_csv('test.csv', sep='\t', header=False)

for name in ['train', 'test']:
    with open(f"{name}.csv", "w") as f:
      csv_writer = csv.writer(f, delimiter="\t", quoting=csv.QUOTE_NONE)
      for idx, row in sample.iterrows():
        try:
          csv_writer.writerow(row)
        except:
          pass

In [163]:
%%time
# Convert the dataset to a DataLoader ready for training
print("Read train dataset")
train_data = SentencesDataset(sts_reader.get_examples('train.csv'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

print("Read dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('test.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

Read train dataset
Read dev dataset
CPU times: user 15.2 s, sys: 29.2 ms, total: 15.2 s
Wall time: 15.2 s


In [164]:
import os

os.mkdir('test1')

In [165]:
%%time
%%capture

# Configure the training. We skip evaluation in this example
print("warming up...")
warmup_steps = math.ceil(len(train_dataloader)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up

print("training...")
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=2,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

CPU times: user 1min 21s, sys: 31.1 s, total: 1min 52s
Wall time: 1min 52s


In [166]:
model.save(model_save_path)

In [167]:
%%time
# Load the stored model and evaluate its performance on STS benchmark dataset
model = SentenceTransformer(model_save_path)
test_data = SentencesDataset(examples=sts_reader.get_examples("stsbenchmark/sts-test.csv"), model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
model.evaluate(evaluator)

CPU times: user 2.72 s, sys: 167 ms, total: 2.89 s
Wall time: 2.48 s


0.4786749937338934

In [None]:
# !tar -zcvf model_stuff.tar.gz /content/output/test1

In [None]:
# tried 10k w/ no_code1 epochs: 2,4 acc: ~0.22
# tried 10k w/ no_code2 epochs: 2,4 acc: ~0.28
# tried 10k w/ no_code2 w/ no small reply epochs: 2,4 acc: ~0.29
# tried 65k w/ no_code2 & no small reply epochs: 2,4 acc: ~0.21
# tried 65k w/ a train/test set lol, no_code2, no small reply, & epochs=2: acc: ~0.18

# tried w/ 5M, epochs=2



# try with half bad examples too