# 1. Preparation


Run the following notebooks before continuing here:
- prepareParaBank.ipynb
- prepareQuora.ipynb
- preparePAWS.ipynb

## 1.1 Mount drive

In [None]:
from google.colab import drive

# Make Google Drive available inside the Colab VM; the cells below read from
# and write to paths underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Copy the .ssh directory stored on the mounted Drive into the home directory
# of the Colab VM.
# NOTE(review): a .ssh directory usually holds private keys — confirm that
# placing them on the VM (and keeping them on Drive) is intended.
!cp -r /content/drive/MyDrive/Uni\ Leipzig/SS2022/.ssh ~/

## 1.2 Install necessary packages

In [None]:
# Switch the working directory to the dataset folder: the relative paths used
# by the later cells (raw/, ./indices/, neg/) are resolved from here.
%cd '/content/drive/MyDrive/Uni Leipzig/SS2022/bdlt_contrastive/dataset'

In [None]:
# Use the %pip magic instead of `!pip`: it installs into the environment of
# the running kernel, whereas `!pip` runs whichever pip is first on the shell PATH.
%pip install python-terrier

In [None]:
# Use the %pip magic instead of `!pip`: it installs into the environment of
# the running kernel, whereas `!pip` runs whichever pip is first on the shell PATH.
%pip install datasets

In [None]:
import re
import datasets
import os
import pyterrier as pt
import pandas as pd
import numpy as np


## 1.3 Create Indices using PyTerrier and retrieve negatives

In [None]:
# Initialise PyTerrier once per kernel: pt.init() is skipped when pt.started()
# already reports true, so this cell can be re-run safely.
if not pt.started():
  pt.init()

In [None]:
# Dataset configurations to process. For each one the raw pair files
# raw/out_{path}_{name}_{split}.csv are read (if present) and turned into
# neg/out_{path}_{name}_{split}_processed.csv.
dataset_definitions = [
    {"path": "parabank", "name": "small_diverse"},
    {"path": "paws", "name": "quora"},
    {"path": "paws", "name": "labeled_final"},
    {"path": "paws", "name": "labeled_swap"},
    {"path": "paws", "name": "unlabeled_final"}
]

max_res = 50  # number of retrieval results requested per anchor query
total_num_sentences = 6  # columns per output row: anchor, paraphrase and the negatives
SENTENCE_COLUMNS = ["sentence1", "sentence2"]


def build_index_frame(pairs):
    """Flatten the sentence columns of ``pairs`` into one document per sentence.

    Args:
        pairs: DataFrame with the string columns ``sentence1`` and ``sentence2``.

    Returns:
        DataFrame with the columns ``docno`` and ``text``. ``docno`` has the
        form ``"<row index>_<column name>"`` so a retrieval hit can be mapped
        back to its cell in ``pairs``. Empty, whitespace-only and literal
        ``"None"`` sentences are left out.
    """
    docs = {"docno": [], "text": []}
    for index, row in pairs.iterrows():
        for column in SENTENCE_COLUMNS:
            text = row[column]
            # Test the cell value, not the column name: the name is always a
            # non-empty string, so checking it would never filter anything.
            if text and text.strip() != "" and text != "None":
                docs["text"].append(text)
                docs["docno"].append(str(index) + "_" + column)
    return pd.DataFrame.from_dict(docs)


def select_negatives(results, pairs, anchor_idx, num_negatives, max_score_ratio=0.7):
    """Pick hard negatives for one anchor from its ranked retrieval results.

    The top of the ranking is skipped until the score of a hit falls below
    ``max_score_ratio`` times the score of the hit before it; everything ahead
    of that drop is treated as too similar to the anchor. From the drop
    onwards at most one sentence per source row is taken.

    Args:
        results: ranked retrieval results with the columns ``docno`` and ``score``.
        pairs: DataFrame the index was built from (looked up via ``docno``).
        anchor_idx: index label of the anchor row in ``pairs``.
        num_negatives: number of negatives to collect.
        max_score_ratio: consecutive-score ratio below which the drop is detected.

    Returns:
        List of negative sentences; shorter than ``num_negatives`` when the
        results do not contain enough usable hits.
    """
    negatives = []
    # Seed with the anchor's own row so that neither the anchor itself nor its
    # true paraphrase can be returned as a "negative".
    used_rows = {str(anchor_idx)}
    score_prev = -1
    is_too_similar = True
    for _, hit in results.iterrows():
        if is_too_similar:
            if score_prev != -1 and hit["score"] / score_prev < max_score_ratio:
                is_too_similar = False
            else:
                score_prev = hit["score"]
                continue
        row_id, column = hit["docno"].split("_", 1)

        # use every source row at most once, to make the negatives more diverse
        if row_id in used_rows:
            continue
        used_rows.add(row_id)
        negatives.append(pairs.loc[int(row_id), column])

        # stop as soon as enough negatives are collected
        if len(negatives) == num_negatives:
            break
    return negatives


for definition in dataset_definitions:
    dname = definition["name"]
    dpath = definition["path"]
    for split in ["train", "validation", "test"]:
        raw_csv = f"raw/out_{dpath}_{dname}_{split}.csv"
        index_dir = f"./indices/pd_index_{dpath}_{dname}_{split}"
        processed_csv = f"neg/out_{dpath}_{dname}_{split}_processed.csv"

        if not os.path.isfile(raw_csv):
            continue
        anchors_with_para = pd.read_csv(raw_csv).fillna("")

        if not os.path.isdir(index_dir):
            df = build_index_frame(anchors_with_para)
            # index the text, record the docnos as metadata
            pd_indexer = pt.DFIndexer(index_dir)
            indexref = pd_indexer.index(df["text"], df["docno"])
            batch_ret = pt.BatchRetrieve(indexref, num_results=max_res)
        else:
            # reuse the index written by an earlier run
            batch_ret = pt.BatchRetrieve(index_dir, num_results=max_res)

        # the output of an earlier run is kept as is
        if os.path.isfile(processed_csv):
            continue

        sentence_keys = [f"sentence{i}" for i in range(1, total_num_sentences + 1)]
        out_list = []
        paraphrases_only = anchors_with_para[anchors_with_para["label"] == 1]
        for idx_row, row in paraphrases_only.iterrows():
            out_dict = {key: "" for key in sentence_keys}
            out_dict["sentence1"] = row["sentence1"]
            out_dict["sentence2"] = row["sentence2"]

            # replace everything except letters, digits and whitespace before
            # using the anchor as a query
            s = re.sub(r'[^a-zA-Z0-9\s]', ' ', row["sentence1"])
            if not s.strip():
                # nothing left to query with, so no negatives can be retrieved
                continue

            # search() is the single-query entry point of a PyTerrier transformer
            res = batch_ret.search(s)
            negatives = select_negatives(
                res, anchors_with_para, idx_row, total_num_sentences - 2
            )
            for sen_num, negative in enumerate(negatives, start=3):
                out_dict[f"sentence{sen_num}"] = negative

            # keep the row only if every sentence slot is filled
            if all(value != "" for value in out_dict.values()):
                out_list.append(out_dict)

        # Explicit columns keep the CSV header stable even if no row survived.
        out_df = pd.DataFrame(out_list, columns=sentence_keys)

        os.makedirs(os.path.dirname(processed_csv), exist_ok=True)
        out_df.to_csv(processed_csv, index_label="idx")

## 1.4 Combine all created CSVs into one dataset and add it to the Hugging Face Hub

In [None]:
from glob import glob

# Folder holding the processed CSV files with the mined negatives.
data_dir = "../dataset/neg/"

# glob returns its matches in arbitrary order; sorting makes the row order of
# every split reproducible across runs and machines.
train_csvs = sorted(glob(data_dir + "*train*.csv"))
test_csvs = sorted(glob(data_dir + "*test*.csv"))
validation_csvs = sorted(glob(data_dir + "*validation*.csv"))

# Only hand over splits that actually have files.
data_files = {
    split: paths
    for split, paths in {
        "train": train_csvs,
        "test": test_csvs,
        "validation": validation_csvs,
    }.items()
    if paths
}
if not data_files:
    raise FileNotFoundError(f"No processed CSV files found in {data_dir}")

# Load all CSVs of a split as one dataset split.
dataset = datasets.load_dataset("csv", name="contrastive_paws_parabank", data_files=data_files)


In [None]:
from huggingface_hub import create_repo, notebook_login

# Authenticate against the Hugging Face Hub from within the notebook.
notebook_login()

# Create the private dataset repository in the namespace of the logged-in user.
# `repo_id` is the current keyword for the repository name (the former `name`
# keyword is no longer accepted by recent huggingface_hub releases), and
# exist_ok=True lets this cell be re-run once the repository exists.
create_repo(repo_id="contrastive_paraphrases", private=True, repo_type="dataset", exist_ok=True)
