# Preparing data

In [1]:
# from debias.datasets.mnli import load_hans, load_mnli, tokenize_examples
import py_utils
from load_word_vectors import load_word_vectors
from tokenizer import NltkAndPunctTokenizer
# import tensorflow as tf
# Tokenizer instance; later cells call tok.tokenize on the sentence1/sentence2 columns.
tok = NltkAndPunctTokenizer()
# Lower-case English stop words plus a few extras ("many", "how", "de").
# NOTE(review): 'how' is listed twice (harmless inside a frozenset), and STOP_WORDS is
# not referenced by any cell visible here -- confirm whether it is still needed.
STOP_WORDS = frozenset([
  'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
  'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her',
  'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
  'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was',
  'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
  'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
  'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
  'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
  'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
  'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
  'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're',
  've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma',
  'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn',
   "many", "how", "de"
])

In [2]:
import pandas as pd
from datasets import load_dataset, load_metric
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
# label_maps = {"entailment": 0, "contradiction": 1, "neutral": 2}
# reverse_label_maps = {0:"entailment", 1:"contradiction", 2:"neutral"}
# One: load data
# Each split lives in its own JSON-lines file and is read into a separate DataFrame.
# NOTE(review): the 'test' split is read from qqp.dev.jsonl -- confirm this is intended.
dataset = {
    'train': pd.read_json('../../data/paraphrase_identification/qqp.train.jsonl', lines=True),
    'validation': pd.read_json('../../data/paraphrase_identification/qqp.val.jsonl', lines=True),
    'test': pd.read_json('../../data/paraphrase_identification/qqp.dev.jsonl', lines=True),
}


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Overwrite both sentence columns of every split with the tokenizer's output.
# The columns are replaced in place, so this cell is meant to run once per kernel:
# a second run would feed the already-tokenized values back into tok.tokenize.
for split in ('train', 'validation', 'test'):
    for column in ('sentence1', 'sentence2'):
        dataset[split][column] = dataset[split][column].apply(tok.tokenize)

In [5]:
# Display the validation split to eyeball the tokenized sentence1/sentence2 columns.
dataset['validation']

Unnamed: 0,id,qid1,qid2,sentence1,sentence2,is_duplicate
0,221391,169800,76964,"[How, can, I, be, a, political, leader, ?]","[How, do, I, become, a, leader, ?]",0
1,64746,112460,112461,"[How, do, I, check, if, I, have, knock, knees,...","[How, can, I, check, knock, knees, ?]",1
2,132701,1993,212438,"[What, is, a, good, song, for, lyric, prank, ?]","[What, are, some, good, lyric, pranks, songs, ?]",1
3,10227,19843,19844,"[What, 's, the, longest, amount, of, time, tha...","[How, long, do, tobacco, /, cannabis, /, alcoh...",0
4,392146,156556,224101,"[How, long, does, it, take, before, marijuana,...","[How, long, does, it, take, weed, to, get, out...",1
...,...,...,...,...,...,...
4995,117628,191369,191370,"[How, could, the, universe, appear, out, of, n...","[What, was, there, before, the, Big, Bang, ?, ...",1
4996,293894,237883,415692,"[Is, it, better, to, study, at, NITs, or, the,...","[I, have, not, found, reliable, info, online, ...",0
4997,391201,523770,523771,"[What, are, some, good, reads, on, Buddhism, ?]","[What, are, the, best, books, on, Buddhism, ?]",1
4998,338986,466577,20519,"[I, do, n't, like, the, taste, of, water, ., W...","[Does, drinking, lemon, juice, in, the, mornin...",0


In [6]:
# feature preprocessing
# Gather every distinct token that occurs in either sentence column of the training
# split, then hand that vocabulary to load_word_vectors (presumably so only vectors
# for these words are kept -- verify against load_word_vectors).
voc = set()
for column in ('sentence1', 'sentence2'):
    for tokens in dataset['train'][column]:
        voc.update(tokens)
words, vecs = load_word_vectors("crawl-300d-2M", voc)

word-vec: 1999996it [00:09, 220473.92it/s]


In [7]:
import numpy as np
import pickle
# Map each word to its vector scaled to unit L2 norm, so that a dot product between
# two entries is their cosine similarity.
w2v = {}
for word, vec in zip(words, vecs):
    w2v[word] = vec / np.linalg.norm(vec)

# Persist the normalized lookup table next to the notebook.
with open("w2v_cache", "wb") as cache_file:
    pickle.dump(w2v, cache_file)


   

In [11]:
def is_subseq(needle, haystack):
  """Return True if `needle` occurs as a contiguous slice of `haystack`.

  Each window of len(needle) items in `haystack` is compared to `needle` with `==`.
  A needle longer than the haystack never matches; an empty needle always matches.
  """
  width = len(needle)
  if width > len(haystack):
    return False
  # Slide a window of the needle's length across every possible start offset.
  for start in range(len(haystack) - width + 1):
    if haystack[start:start + width] == needle:
      return True
  return False

# Build the hand-crafted "bias" features for every split, one feature row per sentence pair.
# Naming follows the hypothesis/premise convention of the feature keys:
#   h = lower-cased tokens of sentence1, p = lower-cased tokens of sentence2.
dataset_to_features = {}
for name in dataset.keys():
    frame = dataset[name]
    features = []
    # Walk the three needed columns in lock-step instead of iterrows(), which builds
    # a full Series object for every row.
    for sent1, sent2, label in zip(frame["sentence1"], frame["sentence2"], frame["is_duplicate"]):
        h = [x.lower() for x in sent1]
        p = [x.lower() for x in sent2]
        p_words = set(p)
        n_words_in_p = sum(x in p_words for x in h)
        fe = {
            "h-is-subseq": is_subseq(h, p),
            "all-in-p": n_words_in_p == len(h),
            # max(..., 1) keeps an empty sentence1 from raising ZeroDivisionError;
            # the ratio is 0.0 in that case, matching the 0.0 fill used for missing features.
            "percent-in-p": n_words_in_p / max(len(h), 1),
            "log-len-diff": np.log(max(len(p) - len(h), 1)),
            "label": label,
        }
        # Embedding lookups use the original-case tokens, i.e. the same form the
        # vocabulary was collected in.
        h_vecs = [w2v[w] for w in sent1 if w in w2v]
        p_vecs = [w2v[w] for w in sent2 if w in w2v]
        if len(h_vecs) > 0 and len(p_vecs) > 0:
            h_vecs = np.stack(h_vecs, 0)
            p_vecs = np.stack(p_vecs, 0)
            # [h_size, p_size] pairwise dot products
            similarities = np.matmul(h_vecs, p_vecs.T)
            # [h_size] best match in p for every h word that has a vector
            similarities = np.max(similarities, 1)
            similarities.sort()
            # len(h) > 0 here because at least one sentence1 token had a vector.
            fe["average-sim"] = similarities.sum() / len(h)
            fe["min-similarity"] = similarities[0]
            if len(similarities) > 1:
                fe["min2-similarity"] = similarities[1]

        features.append(fe)
    # Pairs without usable vectors lack the similarity keys; those cells become 0.0.
    dataset_to_features[name] = pd.DataFrame(features).fillna(0.0)


In [13]:
from sklearn.linear_model import LogisticRegression
train_df = dataset_to_features["train"]
# Every column except the target is used as a model input.
feature_cols = [x for x in train_df.columns if x != "label"]

# class_weight='balanced' reweights examples inversely to class frequency, so the
# duplicate and non-duplicate classes contribute equally to the loss.
# C is the inverse regularization strength: C=100 gives only weak regularization, not none.
# multi_class is left at its default ("auto"): passing it explicitly is deprecated in
# scikit-learn >= 1.5 and "auto" is what the default already selects.
lr = LogisticRegression(solver="liblinear", class_weight='balanced', C=100)
lr.fit(train_df[feature_cols].values, train_df.label.values)

In [14]:
# Score every split with the fitted bias model, print its accuracy, and store the
# predicted class probabilities next to the examples as a "bias_probs" column.
for name, examples in dataset_to_features.items():
  pred = lr.predict_proba(examples[feature_cols].values).astype(np.float32)
  y = examples.label.values

  # The argmax column index is compared directly with the label, which assumes
  # column i of predict_proba belongs to label i -- TODO confirm lr.classes_ is [0, 1].
  acc = (pred.argmax(axis=1) == y).mean()
  print("%s accuracy: %.4f (size=%d)" % (name, acc, len(examples)))

  dataset[name]["bias_probs"]= pred.tolist()


train two-class accuracy: 0.6694 (size=394287)
validation two-class accuracy: 0.6688 (size=5000)
test two-class accuracy: 0.6754 (size=5000)


In [None]:
# One: load data
# Re-read the splits from disk so the exported files keep the original sentence
# columns (the `dataset` frames had theirs overwritten with token lists).
ori_dataset = {
    'train': pd.read_json('../../data/paraphrase_identification/qqp.train.jsonl', lines=True),
    'validation': pd.read_json('../../data/paraphrase_identification/qqp.val.jsonl', lines=True),
    'test': pd.read_json('../../data/paraphrase_identification/qqp.dev.jsonl', lines=True),
}

# Attach the bias model's probabilities to each split and write it out as JSON lines.
for key in dataset.keys():
    frame = ori_dataset[key]
    frame['bias_probs'] = dataset[key]["bias_probs"]
    serialized = frame.to_json(orient='records', lines=True)
    with open('qqp_clark.'+key+'.jsonl', 'w') as json_file:
        json_file.write(serialized)