# 1. Preparation

## 1.1 Mount drive

In [1]:
# Mount Google Drive at /content/drive; the later cells read and write
# project files below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Copy the .ssh directory stored on Drive into the home directory of this runtime.
# NOTE(review): .ssh presumably holds private SSH keys (e.g. for git access) —
# confirm this is intended and never share a runtime/notebook that exposes them.
!cp -r /content/drive/MyDrive/Uni\ Leipzig/SS2022/.ssh ~/

## 1.2 Get correct CuDNN version
See https://github.com/googlecolab/colabtools/issues/2600



## 1.3 Install necessary packages

In [None]:
# Work from the project's dataset folder on Drive: the cells below use paths
# relative to it (raw/, indices/, neg/).
%cd '/content/drive/MyDrive/Uni Leipzig/SS2022/bdlt_contrastive/dataset'

/content/drive/MyDrive/Uni Leipzig/SS2022/bdlt_contrastive/dataset


In [None]:
!pip install python-terrier

In [None]:
import re
import datasets
import os
import pyterrier as pt
import pandas as pd

In [None]:

def para(row, anchor_col="sentence1", para_col="sentence2", label_col="label", path="glue", name="mrpc", split="train"):
  """Flatten one sentence-pair record into the export layout.

  Args:
    row: Mapping-like record (dict, pandas Series, ...) indexable by column name.
    anchor_col: Key of ``row`` holding the first sentence.
    para_col: Key of ``row`` holding the second sentence.
    label_col: Key of ``row`` holding the dataset's label.
    path: Dataset path recorded alongside the pair.
    name: Dataset configuration name recorded alongside the pair.
    split: Split name recorded alongside the pair.

  Returns:
    A dict with the keys ``anchor``, ``para``, ``original_label``, ``path``,
    ``name`` and ``split`` (in that order).
  """
  return dict(
      anchor=row[anchor_col],
      para=row[para_col],
      original_label=row[label_col],
      path=path,
      name=name,
      split=split,
  )

In [None]:

# Corpora to export; each entry supplies the `path`/`name` arguments of
# datasets.load_dataset().
dataset_definitions = [
    #{"path": "glue", "name": "mrpc"},
    {"path": "paws", "name": "labeled_final"},
    {"path": "paws", "name": "labeled_swap"},
    {"path": "paws", "name": "unlabeled_final"}
]

# DataFrame.to_csv() does not create missing parent directories, so make sure
# the output folder exists before the first export.
os.makedirs("raw", exist_ok=True)

for definition in dataset_definitions:
  dataset = datasets.load_dataset(path=definition["path"], name=definition["name"])
  for split in ["train", "validation", "test"]:
    # Skip splits this configuration does not provide.
    if split not in dataset:
      continue
    dataset_part = pd.DataFrame(dataset[split])
    # Convert every record to the flat anchor/para layout produced by para().
    out = [
        para(row, split=split, path=definition["path"], name=definition["name"])
        for _, row in dataset_part.iterrows()
    ]
    df = pd.DataFrame(out)
    # Keep only pairs labelled 1 (presumably the paraphrase pairs — confirm
    # against the dataset card).
    df = df[df["original_label"] == 1]
    # reset_index() without drop=True keeps the pre-filter row label as an
    # extra "index" column in the CSV; this is preserved on purpose so the
    # file layout stays unchanged.
    df = df.reset_index()
    df.to_csv(f'raw/out_{definition["path"]}_{definition["name"]}_{split}.csv', index=False)

In [None]:

# Initialise PyTerrier only if it has not been started yet, so this cell can
# be re-run safely.
if not pt.started():
  pt.init()

In [None]:
# Re-enter the dataset folder (same path as the earlier %cd) so the relative
# raw/, indices/ and neg/ paths used below resolve.
%cd '/content/drive/MyDrive/Uni Leipzig/SS2022/bdlt_contrastive/dataset'

In [None]:
# Corpora whose exported pairs (raw/out_<path>_<name>_<split>.csv) are
# processed in this cell.
dataset_definitions = [
    #{"path": "glue", "name": "mrpc"},
    {"path": "paws", "name": "labeled_final"},
    {"path": "paws", "name": "labeled_swap"},
    {"path": "paws", "name": "unlabeled_final"}
]
# Number of retrieval results requested per anchor; also the number of
# neg<i> columns in the output.
max_res = 50
# A hit stops being flagged as 'ignored' once its score drops below this
# fraction of the previous hit's score.
similarity_ratio = 0.7

# DataFrame.to_csv() does not create missing parent directories.
os.makedirs("neg", exist_ok=True)

for definition in dataset_definitions:
    dname = definition["name"]
    dpath = definition["path"]
    for split in ["train", "validation", "test"]:
        raw_csv = f"raw/out_{dpath}_{dname}_{split}.csv"
        if not os.path.isfile(raw_csv):
            continue
        # Missing cells become empty strings so the string handling below
        # never sees NaN.
        anchors_with_para = pd.read_csv(raw_csv).fillna("")

        index_dir = f"./indices/pd_index_{dpath}_{dname}_{split}"
        if not os.path.isdir(index_dir):
            # Build the index: one document per anchor and per paraphrase,
            # with docno "<row label>_<column name>".
            docs = {"docno": [], "text": []}
            for index, row in anchors_with_para.iterrows():
                for sent in ["anchor", "para"]:
                    text = str(row[sent])
                    # Test the sentence text itself (not the column name) so
                    # blank or "None" entries are kept out of the index.
                    if text.strip() != "" and text != "None":
                        docs["text"].append(text)
                        docs["docno"].append(str(index) + "_" + sent)
            df = pd.DataFrame.from_dict(docs)
            print(df)
            # index the text, record the docnos as metadata
            pd_indexer = pt.DFIndexer(index_dir)
            indexref = pd_indexer.index(df["text"], df["docno"])
            batch_ret = pt.BatchRetrieve(indexref, num_results=max_res)
        else:
            # An index directory from an earlier run exists: reuse it.
            batch_ret = pt.BatchRetrieve(index_dir, num_results=max_res)

        # Column-wise accumulator: anchor, para, neg0 .. neg<max_res - 1>.
        neg_columns = {"anchor": [], "para": []}
        for i in range(max_res):
            neg_columns[f"neg{i}"] = []

        for idx_row, row in anchors_with_para.iterrows():
            neg_columns["anchor"].append(row["anchor"])
            neg_columns["para"].append(row["para"])

            # Replace everything except letters, digits and whitespace before
            # handing the anchor to the retriever as a query.
            query = re.sub(r'[^a-zA-Z0-9\s]', ' ', row["anchor"])

            n_hits = 0
            # A blank query cannot retrieve anything; such rows only get the
            # padding entries below.
            if query.strip() != "":
                res = batch_ret(query)
                n_hits = res.shape[0]
                score_prev = -1
                is_too_similar = True
                # Use the positional rank for the column number so it always
                # lines up with the padding range below.
                for res_num, (_, r) in enumerate(res.iterrows()):
                    idx_num, sent = r["docno"].split("_", 1)
                    hit_text = anchors_with_para.loc[int(idx_num)][sent]
                    if is_too_similar:
                        if score_prev != -1 and r['score']/score_prev < similarity_ratio:
                            # First clear score drop: this and all later hits
                            # are stored as regular entries.
                            is_too_similar = False
                        else:
                            # Leading hits before the score drop are stored
                            # with an 'ignored' marker:
                            # ('ignored', score, text, docno).
                            neg_columns[f"neg{res_num}"].append(('ignored', r['score'], hit_text, r['docno']))
                            score_prev = r['score']
                            continue
                    # Regular entry: (score, text, docno).
                    neg_columns[f"neg{res_num}"].append((r['score'], hit_text, r['docno']))

            # Pad the unused slots so every column has one entry per anchor.
            for n in range(n_hits, max_res):
                neg_columns[f"neg{n}"].append((0, "", ""))

        out_df = pd.DataFrame(neg_columns)
        out_df.to_csv(f"neg/out_{dpath}_{dname}_{split}_processed.csv")

  return self.transform(*args, **kwargs)


JavaException: ignored