In [None]:
#!pip install datasets

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tensorflow as tf
import pickle
#from datasets import load_dataset
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
from transformers import AutoTokenizer

In [None]:
!pip install nlp
import nlp

In [None]:
max_length = 150  # token length every premise/hypothesis pair is padded or truncated to (see `args`)
batch_size = 256  # number of training pairs tokenized together and stored in a single TFRecord entry
label_size = 3  # width of one one-hot label row
entries_per_file = 512  # maximum number of records (i.e. whole batches) written to one training file

In [None]:
# Pretrained "roberta-base" tokenizer; encode_all uses it to tokenize sentence pairs.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
#train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")

In [None]:
#prems = list(train.premise.values)
#hyps = list(train.hypothesis.values)
#labels = list(train.label.values)

# Module-level accumulators for the combined training set.
# The three lists are index-aligned: prems[i], hyps[i] and labels[i] describe one example.
prems = list()
hyps = list()
labels = list()

# (premise, hypothesis) pairs already stored, used to skip duplicates across all calls.
dup_set = set()

def add_to_training(prem_list, hyps_list, lab_list):
    """Append examples to the global ``prems`` / ``hyps`` / ``labels`` lists, skipping duplicates.

    An example is a duplicate when the same (premise, hypothesis) pair has already
    been added by this or an earlier call; its label is ignored for that decision.

    Args:
        prem_list: sequence of premise strings.
        hyps_list: sequence of hypothesis strings, aligned with ``prem_list``.
        lab_list: sequence of labels, aligned with ``prem_list``.

    Returns:
        The number of examples that were skipped as duplicates.
    """
    # Size reports are only printed for bulk additions (more than one example).
    verbose = len(prem_list) > 1
    if verbose:
        print(f'sizes before: {len(prems)} {len(hyps)} {len(labels)}')
    duplicate = 0
    for p, h, l in zip(prem_list, hyps_list, lab_list):
        # Key on the pair itself: concatenating the strings would make distinct pairs
        # such as ("ab", "c") and ("a", "bc") look identical.
        k = (p, h)
        if k in dup_set:
            duplicate += 1
            continue
        dup_set.add(k)
        prems.append(p)
        hyps.append(h)
        labels.append(l)
    if verbose:
        print(f'sizes after: {len(prems)} {len(hyps)} {len(labels)}')
    return duplicate

In [None]:
# Load the 'mnli' configuration of the 'glue' dataset through the nlp library.
mnli = nlp.load_dataset(path='glue', name='mnli')

In [None]:
# Add the MNLI training split to the combined training lists; the cell output is the number of skipped duplicates.
add_to_training(mnli['train']['premise'], mnli['train']['hypothesis'], mnli['train']['label'])

In [None]:
# def add_data_from_xnli(filename):
#     df = pd.read_csv('../input/xnlni-zip-file-from-nyu/XNLI-1.0/xnli.dev.tsv', sep='\t')
#     ls = [{'entailment': 0, 'neutral': 1, 'contradiction': 2}[x] for x in df.gold_label.values]
#     add_to_training(df.sentence1.values, df.sentence2.values, ls)

In [None]:
# add_data_from_xnli('../input/xnlni-zip-file-from-nyu/XNLI-1.0/xnli.dev.tsv')
# add_data_from_xnli('../input/xnlni-zip-file-from-nyu/XNLI-1.0/xnli.test.tsv')

In [None]:
snli = nlp.load_dataset(path='snli')
# SNLI pairs without a gold label carry the label -1. Left in, they would later be
# one-hot encoded through negative indexing as the last class, so keep only rows whose
# label is a valid class index.
snli_rows = [
    row
    for row in zip(snli['train']['premise'], snli['train']['hypothesis'], snli['train']['label'])
    if 0 <= row[2] < label_size
]
print(f"kept {len(snli_rows)} of {len(snli['train']['label'])} SNLI training pairs with a valid label")
# The cell output is the number of skipped duplicates.
add_to_training(
    [row[0] for row in snli_rows],
    [row[1] for row in snli_rows],
    [row[2] for row in snli_rows],
)

In [None]:
# Keyword arguments passed to tokenizer.batch_encode_plus by encode_all:
# every pair is padded / truncated to max_length and an attention mask is returned.
args = dict(
    max_length=max_length,
    pad_to_max_length=True,
    truncation='longest_first',
    return_attention_mask=True,
)
#args = {'return_tensors': "tf", 'max_length': max_length, 'pad_to_max_length': True, 'truncation': True, 'return_attention_mask': True, 'return_token_type_ids': True}

In [None]:
# Names of the tokenizer outputs that are stored in the TFRecord files.
keys = ['input_ids', 'attention_mask']


def get_feature_description(feat_size, label_size=None):
    """Build the feature spec describing one serialized record.

    Args:
        feat_size: number of int64 values stored under each of the first two ``keys``.
        label_size: number of float32 values stored under 'labels'; when None the
            spec contains no 'labels' entry.

    Returns:
        dict mapping feature name to a ``tf.io.FixedLenFeature``.
    """
    description = {name: tf.io.FixedLenFeature([feat_size], tf.int64) for name in keys[:2]}
    if label_size is None:
        return description

    description['labels'] = tf.io.FixedLenFeature([label_size], tf.float32)
    return description


def _float_feature(value):
    """Flatten ``value`` to 1-D and wrap it in a tf.train.Feature holding a FloatList."""
    flat_values = np.ravel(value)
    float_list = tf.train.FloatList(value=flat_values)
    return tf.train.Feature(float_list=float_list)


def _int64_feature(value):
    """Flatten ``value`` to 1-D and wrap it in a tf.train.Feature holding an Int64List."""
    flat_values = np.ravel(value)
    int64_list = tf.train.Int64List(value=flat_values)
    return tf.train.Feature(int64_list=int64_list)


def serialize_sentence_and_label(x0, x1, y=None):
    """Serialize one record as a tf.train.Example byte string.

    ``x0`` and ``x1`` are stored as int64 features under the first two entries of
    ``keys``; ``y``, when given, is stored as a float feature under 'labels'.
    """
    feature = dict(zip(keys, map(_int64_feature, (x0, x1))))
    if y is not None:
        feature['labels'] = _float_feature(y)
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example.SerializeToString()


def write_tf_records_with_labels(filename, x0_list, x1_list, y_list=None, max_lines=None):
    """Write aligned entries to one or more TFRecord files.

    Entry ``i`` is serialized with ``serialize_sentence_and_label(x0_list[i], x1_list[i], label)``
    and written as one record. Files are named ``{filename}_{index}``, with the index
    starting at 0, and each holds at most ``max_lines`` records.

    Args:
        filename: path prefix of the output files.
        x0_list: sequence of first features (stored under ``keys[0]``).
        x1_list: sequence of second features (stored under ``keys[1]``), same length as ``x0_list``.
        y_list: optional sequence of labels, same length as ``x0_list``.
        max_lines: maximum number of records per file; None writes everything to one file.

    Returns:
        List of the file names written, in order. Empty input writes nothing and returns [].

    Raises:
        ValueError: if ``max_lines`` is not positive.
    """
    assert len(x0_list) == len(x1_list)
    if y_list is not None:
        assert len(x0_list) == len(y_list)
    total = len(x0_list)
    if total == 0:
        # Nothing to write; avoid creating an empty file.
        return []
    if max_lines is None:
        max_lines = total
    if max_lines <= 0:
        raise ValueError(f'max_lines must be a positive integer, got {max_lines}')

    list_of_files = []
    for file_index, start in enumerate(range(0, total, max_lines)):
        stop = min(start + max_lines, total)
        fn = f'{filename}_{file_index}'
        list_of_files.append(fn)
        with tf.io.TFRecordWriter(fn) as writer:
            for i in range(start, stop):
                label = None if y_list is None else y_list[i]
                writer.write(serialize_sentence_and_label(x0_list[i], x1_list[i], label))
        if stop < total:
            # Reported for every file that is followed by another one.
            print(f'max line for file {fn}')
    return list_of_files

### Turn sparse labels into one-hot triplets

In [None]:
# Draw a single permutation and apply it to all three lists so they stay index-aligned.
# A seeded local generator makes the order (and therefore the written files) reproducible
# after a kernel restart, without touching NumPy's global random state.
SHUFFLE_SEED = 42
arr = np.random.default_rng(SHUFFLE_SEED).permutation(len(labels))
def shuffle(l):
    """Return a new list with the elements of ``l`` reordered by the shared permutation ``arr``."""
    assert len(l) == len(arr)
    return [l[i] for i in arr]
labels = shuffle(labels)
prems = shuffle(prems)
hyps = shuffle(hyps)

In [None]:
def make_triplet(labels, num_classes=3):
    """Convert integer class labels into one-hot rows.

    Args:
        labels: sequence of integer class indices in ``[0, num_classes)``.
        num_classes: width of each one-hot row (3 by default).

    Returns:
        Integer array of shape ``(len(labels), num_classes)``.

    Raises:
        ValueError: if any label lies outside ``[0, num_classes)``. Negative labels
            (for example -1 for "no gold label") would otherwise be encoded silently
            through negative indexing.
    """
    label_ids = np.asarray(labels, dtype=int)
    invalid = (label_ids < 0) | (label_ids >= num_classes)
    if invalid.any():
        bad_values = sorted(set(label_ids[invalid].tolist()))
        raise ValueError(f'labels must be in [0, {num_classes}); found {bad_values}')
    # Row n of the identity matrix is the one-hot encoding of class n.
    one_hot_labels = np.eye(num_classes, dtype=int)[label_ids]
    return one_hot_labels

# One-hot encode the combined training labels (one row per example).
one_hot_labels = make_triplet(labels)

In [None]:
def round_to_batch_size(li, batch_size):
    """Return ``li`` without its trailing elements that do not fill a whole batch.

    The result is the leading slice of ``li`` whose length is the largest multiple
    of ``batch_size`` not exceeding ``len(li)``.
    """
    leftover = len(li) % batch_size
    usable = len(li) - leftover
    return li[:usable]

def encode_all(s1, s2, batch_size=None):
    """Tokenize aligned sentence pairs batch by batch.

    Args:
        s1: sequence of first sentences.
        s2: sequence of second sentences, same length as ``s1``.
        batch_size: number of pairs per tokenizer call. When given, trailing pairs that
            do not fill a whole batch are dropped; when None, pairs are encoded one at a time.

    Returns:
        dict mapping each name in ``keys`` to a list with one entry per batch, each
        entry being the tokenizer output for that batch.
    """
    assert len(s1) == len(s2)
    if batch_size is None:
        batch_size = 1
    else:
        s1 = round_to_batch_size(s1, batch_size)
        s2 = round_to_batch_size(s2, batch_size)
    results = {k: [] for k in keys}
    for batch_index in range(len(s1) // batch_size):
        # Progress report every 1000 batches.
        if batch_index % 1000 == 0:
            print(batch_index)
        start = batch_index * batch_size
        stop = start + batch_size
        pairs = list(zip(s1[start:stop], s2[start:stop]))
        tok = tokenizer.batch_encode_plus(pairs, **args)
        for k in keys:
            results[k].append(tok[k])
    return results

In [None]:
train_data = encode_all(prems, hyps, batch_size=batch_size)
# Rebuild the one-hot labels from `labels` instead of truncating and reshaping
# `one_hot_labels` from itself: that keeps this cell safe to run more than once
# (a second in-place pass would truncate and reshape the already batched array).
one_hot_labels = round_to_batch_size(make_triplet(labels), batch_size)
# Group the rows the same way encode_all groups the sentence pairs: one entry per batch.
one_hot_labels = one_hot_labels.reshape((-1, batch_size, label_size))

In [None]:
# `labels` counts every collected example; only whole batches were encoded above,
# so up to batch_size - 1 trailing examples are not written.
print(f'number of training exemples: {len(labels)}')
# Each record holds one whole batch; the cell output is the list of files written.
write_tf_records_with_labels('./train_data.proto', train_data[keys[0]], train_data[keys[1]], one_hot_labels, max_lines=entries_per_file)

In [None]:
!pip install googletrans
from googletrans import Translator

In [None]:
# Competition test set; translate_row reads its 'premise', 'hypothesis' and 'lang_abv' columns.
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
# googletrans client used by translate_row for rows that are not marked 'en'.
translator = Translator()
def translate_row(row):
    """Return the premise and hypothesis of ``row``, translated unless the row is English.

    Rows whose 'lang_abv' is 'en' are passed through unchanged; every other row is
    sent through ``translator.translate`` (called without an explicit target language).

    The result is always a ``pd.Series`` indexed by 'translated_premise' and
    'translated_hypothesis', so ``DataFrame.apply(translate_row, axis=1)`` yields a
    DataFrame with those two columns whatever the mix of languages.
    """
    if row['lang_abv'] == 'en':
        premise, hypothesis = row['premise'], row['hypothesis']
    else:
        premise = translator.translate(row['premise']).text
        hypothesis = translator.translate(row['hypothesis']).text
    return pd.Series([premise, hypothesis], index=['translated_premise', 'translated_hypothesis'])

In [None]:
# Read the test set, run translate_row over every row, then tokenize and serialize the result.
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
translated_df = test.apply(translate_row, axis=1)
prems_test = translated_df['translated_premise'].tolist()
hyps_test = translated_df['translated_hypothesis'].tolist()
# No batch_size: pairs are encoded one at a time, so none are dropped.
test_data = encode_all(prems_test, hyps_test)
# No labels and no max_lines: a single file without a 'labels' feature.
write_tf_records_with_labels('./test_data.proto', test_data[keys[0]], test_data[keys[1]])

### Initial training data alone

In [None]:
# train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
# prems_train = list(train.premise.values)
# hyps_train = list(train.hypothesis.values)
# one_hot_labels = make_triplet(train.label.values)
# one_hot_labels = round_to_batch_size(one_hot_labels, batch_size)
# one_hot_labels = one_hot_labels.reshape((-1, batch_size, label_size))
# train_data = encode_all(prems_train, hyps_train, batch_size=batch_size)
# write_tf_records_with_labels('./initial_train_data.proto', train_data[keys[0]], train_data[keys[1]], one_hot_labels)