# Generating Experiment Datasets for Translator Classification

Having aligned our book source and translated text and implemented paragraph sorting/filtering based on semantic similarity, we can now generate the train/val/test datasets for our experiments.

In our experiments we want to compare various input/data settings:
1. Filtered vs unfiltered paragraph alignments - we hypothesize that removing poor alignments will improve translator classification performance.
2. Multilingual-BERT Tokenization - we explore if passing the Russian source with its English translation improves performances compared to only passing just the English translated paragraph.

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import torch

if torch.cuda.is_available():
  dev = "cuda"
else:
  dev = "cpu"
device = torch.device(dev)

print(device)

cuda


## SIM Score Metric

Having performed alignment evaluation based on a number of different semantic similarity metrics, we chose to use the SIM similarity metric, as did the par3 paper which created the paragraph alignment method we used.

Paper: SIM score metric from [Beyond BLEU: Training Neural Machine Translation with Semantic Similarity](https://arxiv.org/pdf/1909.06694)

Implementation from [https://github.com/katherinethai/par3](https://github.com/katherinethai/par3)

To use the SIM score metric, load in the sim models files from the par3 directory you cloned: \
`%load par3/par3_align/similarity/sim_models.py`\
`%load par3/par3_align/similarity/sim_utils.py`\
`%load par3/par3_align/similarity/test_sim.py`

In [3]:
# %load par3/par3_align/similarity/sim_models.py
import torch
import torch.nn as nn
from torch.nn.modules.distance import CosineSimilarity
import numpy as np

class ParaModel(nn.Module):

    def __init__(self, args, vocab):
        super(ParaModel, self).__init__()

        self.args = args
        self.vocab = vocab
        self.gpu = args.gpu

        self.cosine = CosineSimilarity()

    def compute_mask(self, lengths):

        lengths = lengths.cpu()
        max_len = torch.max(lengths)
        range_row = torch.arange(0, max_len).long()[None, :].expand(lengths.size()[0], max_len)
        mask = lengths[:, None].expand_as(range_row)
        mask = range_row < mask
        mask = mask.float()
        if self.gpu >= 0:
            mask = mask.cuda()
        return mask

    def torchify_batch(self, batch):

        max_len = 0
        for i in batch:
            if len(i.embeddings) > max_len:
                max_len = len(i.embeddings)

        batch_len = len(batch)

        np_sents = np.zeros((batch_len, max_len), dtype='int32')
        np_lens = np.zeros((batch_len,), dtype='int32')

        for i, ex in enumerate(batch):
            np_sents[i, :len(ex.embeddings)] = ex.embeddings
            np_lens[i] = len(ex.embeddings)

        idxs, lengths, masks = torch.from_numpy(np_sents).long(), \
                               torch.from_numpy(np_lens).float().long(), \
                               self.compute_mask(torch.from_numpy(np_lens).long())

        if self.gpu >= 0:
            idxs = idxs.cuda()
            lengths = lengths.cuda()
            masks = masks.cuda()
    
        return idxs, lengths, masks

    def scoring_function(self, g_idxs1, g_mask1, g_lengths1, g_idxs2, g_mask2, g_lengths2):

        g1 = self.encode(g_idxs1, g_mask1, g_lengths1)
        g2 = self.encode(g_idxs2, g_mask2, g_lengths2)
        return self.cosine(g1, g2)

class WordAveraging(ParaModel):

    def __init__(self, args, vocab):
        super(WordAveraging, self).__init__(args, vocab)

        self.vocab = vocab
        self.embedding = nn.Embedding(len(self.vocab), self.args.dim)

        if args.gpu >= 0:
           self.cuda()

    def encode(self, idxs, mask, lengths):
        word_embs = self.embedding(idxs)
        word_embs = word_embs * mask[:, :, None]
        g = word_embs.sum(dim=1) / lengths[:, None].float()
        return g

In [4]:
# %load par3/par3_align/similarity/sim_utils.py
import io
import numpy as np
import torch

def get_wordmap(textfile):
    words={}
    We = []
    f = io.open(textfile, 'r', encoding='utf-8')
    lines = f.readlines()
    if len(lines[0].split()) == 2:
        lines.pop(0)
    ct = 0
    for (n,i) in enumerate(lines):
        word = i.split(' ', 1)[0]
        vec = i.split(' ', 1)[1].split(' ')
        j = 0
        v = []
        while j < len(vec):
            v.append(float(vec[j]))
            j += 1
        words[word] = ct
        ct += 1
        We.append(v)
    return words, np.array(We)

def get_minibatches_idx(n, minibatch_size, shuffle=False):
    idx_list = np.arange(n, dtype="int32")

    if shuffle:
        np.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for i in range(n // minibatch_size):
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + minibatch_size])
        minibatch_start += minibatch_size

    if (minibatch_start != n):
        # Make a minibatch out of what is left
        minibatches.append(idx_list[minibatch_start:])

    return zip(range(len(minibatches)), minibatches)

def max_pool(x, lengths, gpu):
    out = torch.FloatTensor(x.size(0), x.size(2)).zero_()
    if gpu >= 0:
        out = out.cuda()
    for i in range(len(lengths)):
        out[i] = torch.max(x[i][0:lengths[i]], 0)[0]
    return out

def mean_pool(x, lengths, gpu):
    out = torch.FloatTensor(x.size(0), x.size(2)).zero_()
    if gpu >= 0:
        out = out.cuda()
    for i in range(len(lengths)):
        out[i] = torch.mean(x[i][0:lengths[i]], 0)
    return out

def lookup(words, w):
    w = w.lower()
    if w in words:
        return words[w]

class Example(object):

    def __init__(self, sentence):
        self.sentence = sentence.strip().lower()
        self.embeddings = []
        self.representation = None

    def populate_embeddings(self, words):
        sentence = self.sentence.lower()
        arr = sentence.split()
        for i in arr:
            emb = lookup(words, i)
            if emb:
                self.embeddings.append(emb)
        if len(self.embeddings) == 0:
            self.embeddings.append(words['UUUNKKK'])

In [5]:
# %load par3/par3_align/similarity/test_sim.py
import torch
from nltk.tokenize import TreebankWordTokenizer
import sentencepiece as spm

tok = TreebankWordTokenizer()

model = torch.load('/home/kkatsy/par3/par3_align/similarity/sim/sim.pt')
state_dict = model['state_dict']
vocab_words = model['vocab_words']
args = model['args']
# turn off gpu
model = WordAveraging(args, vocab_words)
model.load_state_dict(state_dict, strict=True)
sp = spm.SentencePieceProcessor()
sp.Load('/home/kkatsy/par3/par3_align/similarity/sim/sim.sp.30k.model')
model.eval()

def make_example(sentence, model):
    sentence = sentence.lower()
    sentence = " ".join(tok.tokenize(sentence))
    sentence = sp.EncodeAsPieces(sentence)
    wp1 = Example(" ".join(sentence))
    wp1.populate_embeddings(model.vocab)
    return wp1

def find_similarity(s1, s2):
    with torch.no_grad():
        s1 = [make_example(x, model) for x in s1]
        s2 = [make_example(x, model) for x in s2]
        wx1, wl1, wm1 = model.torchify_batch(s1)
        wx2, wl2, wm2 = model.torchify_batch(s2)
        BATCH_SIZE = 512
        all_scores = []
        for i in range(0, len(wx1), BATCH_SIZE):
            scores = model.scoring_function(wx1[i:i + BATCH_SIZE], wm1[i:i + BATCH_SIZE], wl1[i:i + BATCH_SIZE],
                                            wx2[i:i + BATCH_SIZE], wm2[i:i + BATCH_SIZE], wl2[i:i + BATCH_SIZE])
            all_scores.extend([x.item() for x in scores])
        return all_scores

def find_similarity_matrix(s1, s2):
    with torch.no_grad():
        s1 = [make_example(x, model) for x in s1]
        s2 = [make_example(x, model) for x in s2]
        wx1, wl1, wm1 = model.torchify_batch(s1)
        wx2, wl2, wm2 = model.torchify_batch(s2)

        BATCH_SIZE = 2000
        vecs1 = []
        vecs2 = []
        for i in range(0, len(wx1), BATCH_SIZE):
            curr_vecs1 = model.encode(idxs=wx1[i:i + BATCH_SIZE],
                                      mask=wm1[i:i + BATCH_SIZE],
                                      lengths=wl1[i:i + BATCH_SIZE])
            vecs1.append(curr_vecs1)
        for i in range(0, len(wx2), BATCH_SIZE):
            curr_vecs2 = model.encode(idxs=wx2[i:i + BATCH_SIZE],
                                      mask=wm2[i:i + BATCH_SIZE],
                                      lengths=wl2[i:i + BATCH_SIZE])
            vecs2.append(curr_vecs2)
        vecs1 = torch.cat(vecs1)
        vecs2 = torch.cat(vecs2)
        dot_product = torch.matmul(vecs1, vecs2.t())

        vecs1_norm = torch.norm(vecs1, dim=1, keepdim=True)
        vecs2_norm = torch.norm(vecs2, dim=1, keepdim=True)
        norm_product = torch.matmul(vecs1_norm, vecs2_norm.t())
    return torch.div(dot_product, norm_product)

def encode_text(s1):
    with torch.no_grad():
        s1 = [make_example(x, model) for x in s1]
        wx1, wl1, wm1 = model.torchify_batch(s1)
        vecs1 = model.encode(idxs=wx1, mask=wm1, lengths=wl1)
        return vecs1


In [6]:
tok = TreebankWordTokenizer()

model = torch.load('/home/kkatsy/par3/par3_align/similarity/sim/sim.pt')
state_dict = model['state_dict']
vocab_words = model['vocab_words']
args = model['args']
# turn off gpu
model = WordAveraging(args, vocab_words)
model.load_state_dict(state_dict, strict=True)
sp = spm.SentencePieceProcessor()
sp.Load('/home/kkatsy/par3/par3_align/similarity/sim/sim.sp.30k.model')
model.eval()

WordAveraging(
  (cosine): CosineSimilarity()
  (embedding): Embedding(65733, 300)
)

In [7]:
def get_score(refs, cands, metric='sim'):
    return find_similarity(refs,cands)

### Load in aligned paragraph dataset

In [8]:
import pickle

with open('aligned_paragraph_dataset.pickle', 'rb') as fp:
  aligned_paragraph_dataset = pickle.load(fp)

with open('source_paragraph_dataset.pickle', 'rb') as fp:
  source_paragraph_dataset = pickle.load(fp)

In [9]:
aligned_paragraph_dataset.keys()

dict_keys(['DeadSouls', 'FathersAndSons', 'PoorFolk', 'Demons', 'AnnaKarenina', 'NotesFromUnderground', 'TheBrothersKaramazov', 'TheIdiot', 'CrimeAndPunishment'])

### Sorting + Filtering Aligned Paragraph Data

We want to be able to filter out certain paragraph alignments, depending on what experiments we are running. 

We can filter by:
- choosing how many of the aligned paragraphs to keep with the highest semantic similarity scores
- choosing what percent of the top scoring aligned paragraphs to drop to remove perfect or near-perfect matches in the different translations since it gives no info on translator style
- choosing min and max len of the aligned paragraphs: some aligned "paragraphs" are shorter than a sentence or only one or a couple words; some paragraphs are too long to be handled by multilingual-BERT tokenization
- choosing scale of diff in length between the source and translated paragraphs; translations should approximately correspond to the length of the original paragraph

In [10]:
import itertools
from statistics import mean
from operator import itemgetter

"""
Sort + filter out aligned paragraphs based on the SIM metric.

Args:
        par_list: list of aligned par lists, [[par1_tr1, par1_tr2, par1_tr3], [par2_tr1, par2_tr2, par2_tr3], ...]
        top_k: num of aligned pars to return with highest similarity scores post-filtering for length
        drop_top: percent of aligned pars
        min_len: drop aligned pars where pars are shorter than min length
        max_len: drop aligned pars where pars are longer than max length
        align_scale: max permitted diff in len between source par and translated par

    Returns:
        i2score: list of tuples, (aligned paragraph index, similarity score of paragraph translations) for all aligned paragraphs in list
        top_k_scores: list of tuples, (aligned paragraph index, similarity score of paragraph translations) for top k aligned paragraphs in list post-filtering
"""

def get_best_alignments(par_list, source_par_list, top_k_percent, num_k, drop_top, metric, min_len, max_len, align_scale):

    # dict -> score:par_set
    # iter thru par_list, prune by length, get metric for set
    keep_index_list = []
    i2score = {}
    for i in range(len(par_list)):
        keep_index_list.append(i)
        par_set = par_list[i]

        max_par_len = len(max(par_set, key = len))
        min_par_len = len(min(par_set, key = len))
        source_len = len(source_par_list[i])

        if (min_par_len >= min_len) and (max_par_len) <= max_len and not all(x==par_set[0] for x in par_set) and (max_par_len <= align_scale*source_len) and (min_par_len*align_scale >= source_len):

            pairs = list(itertools.combinations(par_set, 2))
            refs, cands = [], []
            for s1, s2 in pairs:
                refs.append(s1)
                cands.append(s2)
                
            pair_scores = get_score(refs, cands, metric)

            average_score = mean(pair_scores)
            i2score[i] = average_score

    # get top k par sets
    num_pars = len(list(i2score))
    top_k = int(top_k_percent * num_pars)
    if top_k >= num_k:
        top_k_scores = sorted(i2score.items(), key=itemgetter(1), reverse=True)[int(num_pars*drop_top):int(num_pars*drop_top) + num_k]
    else:
        top_k_scores = sorted(i2score.items(), key=itemgetter(1), reverse=True)[int(num_pars*drop_top):int(num_pars*drop_top) + top_k]
    
    i2score = sorted(i2score.items(), key=itemgetter(1), reverse=True)
    return i2score, top_k_scores, keep_index_list

### Set Filtering Parameters:

In [11]:
min_paragraph_len = 20
max_paragraph_len = 1000000000000
top_k_percent = 0.9
num_k = 50000
drop_top = 0.02
align_scale = 3

### Create Experiment Dataset

Indicate which books will be assigned to the holdout set and what specific books or translators you want to not be included in the train/val/test sets.

In [12]:
# Holdout
# NotesFromUnderground - Katz, PV, Garnett, Hogarth
# PoorFolk - McDuff, Hogarth, Garnett
# TheIdiot - Garnett, McDuff, PV
# CrimeAndPunishment - Katz, McDuff, PV, Garnett

holdout_books = ['TheIdiot', 'NotesFromUnderground']
ignore_books = []
ignore_translator = ['Hogarth']
translator_to_pars = {}
translator_to_pars_holdout = {}

# for each book in train:
for book in sorted(list(aligned_paragraph_dataset.keys())):
    # get par list of aligned sentences, best k alignments
    book_par_list = [list(aligned_paragraph_dataset[book][p].values()) for p in range(len(aligned_paragraph_dataset[book]))]
    source_par_list = source_paragraph_dataset[book]

    if book in holdout_books:
        i2score, top_k, keep_idx = get_best_alignments(book_par_list, source_par_list, 1.0, 5000, 0, 'sim', min_paragraph_len, max_paragraph_len, 100)
    elif book not in ignore_books:
        i2score, top_k, keep_idx = get_best_alignments(book_par_list, source_par_list, top_k_percent, num_k, drop_top, 'sim', min_paragraph_len, max_paragraph_len, align_scale)
    else:
        top_k = []

    for i, sim in top_k:
        par_trans_dict = aligned_paragraph_dataset[book][i]
        par_source = source_paragraph_dataset[book][i]

        for translator, t in par_trans_dict.items():
            if translator not in ignore_translator:
                t = t.replace('\\\'', '\'')
                datum_dict = {'source':par_source, 'translation': t, 'idx': i, 'book': book, 'sim': sim, 'translator': translator}

                if translator not in translator_to_pars.keys():
                    translator_to_pars[translator] = []
                    translator_to_pars_holdout[translator] = []
                    
                if book in holdout_books:
                    translator_to_pars_holdout[translator].append(datum_dict)
                else:
                    translator_to_pars[translator].append(datum_dict)

### Balancing Holdout and Non-Holdout Data by Translator

Post-filtering, we have a disbalanced distribution of paragraphs per translator. We need to make sure we include the same amount of data per translator.

We know the translator Katz has the least data, so we randomly sample and reduce the number of paragraphs we keep for the rest of the translators.

In [13]:
min_len = len(translator_to_pars['Katz'])
print(min_len)
for t in translator_to_pars.keys():
    keep = sorted(translator_to_pars[t], key=lambda d: d['sim'], reverse=True)[:min_len]
    translator_to_pars[t] = keep

6931


In [14]:
from random import sample

min_len_h = len(translator_to_pars_holdout['Katz'])
print(min_len_h)
for t in translator_to_pars_holdout.keys():
    keep = sample(translator_to_pars_holdout[t], min_len_h) 
    translator_to_pars_holdout[t] = keep

470


Lets take a look at the paragraphs we've kept:

In [15]:
newlist = sorted(translator_to_pars['PV'], key=lambda d: d['sim'], reverse=True) 
newlist[0:5]

[{'source': 'Из лицея молодой человек в первые два года приезжал на вакацию. Во время поездки в Петербург Варвары Петровны и Степана Трофимовича он присутствовал иногда на литературных вечерах, бывавших у мамаши, слушал и наблюдал. Говорил мало и всё по-прежнему был тих и застенчив. К Степану Трофимовичу относился с прежним нежным вниманием, но уже как-то сдержаннее: о высоких предметах и о воспоминаниях прошлого видимо удалялся с ним заговаривать. Кончив курс, он, по желанию мамаши, поступил в военную службу и вскоре был зачислен в один из самых видных гвардейских кавалерийских полков. Показаться мамаше в мундире он не приехал и редко стал писать из Петербурга. Денег Варвара Петровна посылала ему не жалея, несмотря на то что после реформы доход с ее имений упал до того, что в первое время она и половины прежнего дохода не получала. У ней, впрочем, накоплен был долгою экономией некоторый, не совсем маленький капитал. Ее очень интересовали успехи сына в высшем петербургском обществе. Чт

### Checking the lengths and translator distributions that we've kept and comparing the size of the train and holdout datasets. 

We had to play around with the books we kept in the holdout set to maintain a reasonable percentage of train/holdout data.

In [16]:
abs_total = 0
print('\nAll')
for k in translator_to_pars_holdout.keys():
    both = len(translator_to_pars_holdout[k]) + len(translator_to_pars[k])
    print(k, both)
    abs_total += both
print('Total', abs_total)

train_total = 0
min_class = 100000000000
print('\nTrain')
for k in translator_to_pars.keys():
    print(k, len(translator_to_pars[k]))
    if len(translator_to_pars[k]) < min_class:
        min_class = len(translator_to_pars[k])
    
train_total = len(translator_to_pars.keys()) * min_class

holdout_total = 0
min_class_h = 100000000000
print('\nHoldout')
for k in translator_to_pars_holdout.keys():
    print(k, len(translator_to_pars_holdout[k]))
    if len(translator_to_pars_holdout[k]) < min_class_h:
        min_class_h = len(translator_to_pars_holdout[k])

holdout_total = len(translator_to_pars.keys()) * min_class_h

print('Train total: ', min_class*5)
print('Val/Test total: ', min_class_h*5)
print()
print('train % = ', train_total/(holdout_total+train_total))
print('holdout % = ', holdout_total/(holdout_total+train_total))



All
PV 7401
Garnett 7401
Katz 7401
McDuff 7401
Total 29604

Train
PV 6931
Garnett 6931
Katz 6931
McDuff 6931

Holdout
PV 470
Garnett 470
Katz 470
McDuff 470
Train total:  34655
Val/Test total:  2350

train % =  0.9364950682340224
holdout % =  0.06350493176597757


### Prepping Data for Pre-processing for Classification

In [17]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(list(translator_to_pars.keys()))
print(le.transform(["Garnett", "McDuff", "PV", "Katz"]))

[0 2 3 1]


In [18]:
data_list = []
i = 0
for tr in translator_to_pars.keys():
    label = le.transform([tr])[0]
    for d in translator_to_pars[tr]:
        src, tgt = d['source'], d['translation']
        concat = src + ' <SEP> ' + tgt
        sent_dict = {'idx': d['idx'], 'book':d['book'], 'labels': label, 'concat': concat,  'translator': d['translator'], 'sim': d['sim'], 'src': src, 'tgt': tgt}
        data_list.append(sent_dict)
        i += 1


data_list_holdout = []
i = 0
for tr in translator_to_pars_holdout.keys():
    label = le.transform([tr])[0]
    for d in translator_to_pars_holdout[tr]:
        src, tgt = d['source'], d['translation']
        concat = src + ' <SEP> ' + tgt
        sent_dict = {'idx': d['idx'], 'book':d['book'], 'labels': label, 'concat': concat, 'translator': d['translator'], 'sim': d['sim'], 'src': src, 'tgt': tgt}
        data_list_holdout.append(sent_dict)
        i += 1
        

In [19]:
import pandas as pd

df = pd.DataFrame(data_list)
df_holdout = pd.DataFrame(data_list_holdout)
df_holdout_X = df_holdout[['idx','book', 'concat', 'translator', 'sim', 'src', 'tgt']]

df.head()

Unnamed: 0,idx,book,labels,concat,translator,sim,src,tgt
0,91,Demons,3,Из лицея молодой человек в первые два года при...,PV,0.976574,Из лицея молодой человек в первые два года при...,For the first two years the young man came hom...
1,1263,Demons,3,"– Вещь короткая; даже, если хотите, по-настоящ...",PV,0.976535,"– Вещь короткая; даже, если хотите, по-настоящ...","""It's a short matter; in fact, if you like, it..."
2,1228,Demons,3,– Да кто? Кто велел вам сюда приходить? – допр...,PV,0.976279,– Да кто? Кто велел вам сюда приходить? – допр...,"""But, who? Who told you to come here?"" Varvara..."
3,289,Demons,3,Так называемое у нас имение Степана Трофимович...,PV,0.976099,Так называемое у нас имение Степана Трофимович...,"Stepan Trofimovich's estate, as we used to cal..."
4,528,Demons,3,"– Шатов? Это брат Дарьи Павловны… <SEP> ""Shato...",PV,0.97594,– Шатов? Это брат Дарьи Павловны…,"""Shatov? He is Darya Pavlovna's brother..."""


In [20]:
print(df.shape)
print(df_holdout.shape)

(27724, 8)
(1880, 8)


In [21]:
from sklearn.model_selection import train_test_split

test_texts, val_texts, test_labels, val_labels = train_test_split(
    df_holdout_X, df_holdout['labels'],
    stratify = df_holdout['labels'], shuffle=True, test_size=0.5
)

aligned_train_df = df
test_df = pd.concat([test_texts, test_labels], axis=1)
val_df = pd.concat([val_texts, val_labels], axis=1)
print('train size: ', aligned_train_df.shape)
print('val size: ', val_df.shape)
print('test size: ', test_df.shape)

train size:  (27724, 8)
val size:  (940, 8)
test size:  (940, 8)


In [22]:
# SAVE ALIGNED TRAIN
aligned_train_df.to_pickle("/home/kkatsy/litMT/experiment_dataset/4class_same_holdout_aligned_train_df.pickle")  

# SAVE HOLDOUT VAL
val_df.to_pickle("/home/kkatsy/litMT/experiment_dataset/4class_same_holdout_experiment_val_df.pickle")  

# SAVE HOLDOUT TEST
test_df.to_pickle("/home/kkatsy/litMT/experiment_dataset/4class_same_holdout_experiment_test_df.pickle") 

In [23]:
test_df.sample(20)

Unnamed: 0,idx,book,concat,translator,sim,src,tgt,labels
1855,4484,TheIdiot,"– Так я и порешил, чтоб ни за что, парень, и н...",McDuff,0.87093,"– Так я и порешил, чтоб ни за что, парень, и н...","'That's what I've decided, not on any account,...",2
1760,59,TheIdiot,"– В Сибирь, в Сибирь! Тотчас в Сибирь! <SEP> ""...",McDuff,0.949025,"– В Сибирь, в Сибирь! Тотчас в Сибирь!","""Yes indeed, to Siberia, Siberia! Straight to ...",2
882,2155,TheIdiot,"– Вчера утром, – отрапортовал Келлер, – мы име...",Garnett,0.892784,"– Вчера утром, – отрапортовал Келлер, – мы име...","""Yesterday morning,"" answered Keller, ""we met ...",0
718,2349,TheIdiot,Князь молчал. <SEP> Myshkin did not speak.,Garnett,0.514383,Князь молчал.,Myshkin did not speak.,0
1675,3546,TheIdiot,"– И наверно дошла, – заметил Ганя. <SEP> 'It p...",McDuff,0.723232,"– И наверно дошла, – заметил Ганя.","'It probably has reached her ears,' observed G...",2
711,3683,TheIdiot,"– Философия нужна-с, очень бы нужна была-с в н...",Garnett,0.834038,"– Философия нужна-с, очень бы нужна была-с в н...","""Philosophy would be useful, very useful in ou...",0
1645,501,TheIdiot,"– Но только так, чтобы никто не заметил, – умо...",McDuff,0.829551,"– Но только так, чтобы никто не заметил, – умо...","'Only it must be done so no one notices,' Gany...",2
1547,325,TheIdiot,"– Игумен Пафнутий, – отвечал князь внимательно...",McDuff,0.74025,"– Игумен Пафнутий, – отвечал князь внимательно...","'The Abbot Pafnuty,' the prince replied, with ...",2
253,1655,TheIdiot,"– Ты. Она тебя тогда, с тех самых пор, с имени...",PV,0.927536,"– Ты. Она тебя тогда, с тех самых пор, с имени...","""You. She fell in love with you then, ever sin...",3
81,3625,TheIdiot,– Но вы бог знает что из самого обыкновенного ...,PV,0.857332,– Но вы бог знает что из самого обыкновенного ...,"""But you make God knows what out of a most ord...",3


## Generate Experiment Dataset without Sorting/Filtering

To compare the efficacy of sorting/filtering alignments, we want to compare performance with randomly sampled data with the same dataset size and holdout parameters.

In [24]:
min_paragraph_len = 20
max_paragraph_len = 1000000000000
top_k_percent = 1
num_k = 5000
drop_top = 0.00
align_scale = 100

In [25]:
holdout_books = ['TheIdiot', 'NotesFromUnderground']
ignore_books = []
ignore_translator = ['Hogarth']
translator_to_pars = {}
translator_to_pars_holdout = {}

# for each book in train:
for book in sorted(list(aligned_paragraph_dataset.keys())):
    # get par list of aligned sentences, best k alignments
    book_par_list = [list(aligned_paragraph_dataset[book][p].values()) for p in range(len(aligned_paragraph_dataset[book]))]
    source_par_list = source_paragraph_dataset[book]

    if book in holdout_books:
        i2score, top_k, keep_idx = get_best_alignments(book_par_list, source_par_list, 1.0, 5000, 0, 'sim', min_paragraph_len, max_paragraph_len, align_scale)
    elif book not in ignore_books:
        i2score, top_k, keep_idx = get_best_alignments(book_par_list, source_par_list, top_k_percent, num_k, drop_top, 'sim', min_paragraph_len, max_paragraph_len, align_scale)
    else:
        top_k = []

    for i, sim in top_k:
        par_trans_dict = aligned_paragraph_dataset[book][i]
        par_source = source_paragraph_dataset[book][i]

        for translator, t in par_trans_dict.items():
            if translator not in ignore_translator:
                t = t.replace('\\\'', '\'')
                datum_dict = {'source':par_source, 'translation': t, 'idx': i, 'book': book, 'sim': sim, 'translator': translator}

                if translator not in translator_to_pars.keys():
                    translator_to_pars[translator] = []
                    translator_to_pars_holdout[translator] = []
                    
                if book in holdout_books:
                    translator_to_pars_holdout[translator].append(datum_dict)
                    # print('len par_list: ', len(book_par_list))
                    # print('len top_k: ', len(top_k))
                else:
                    translator_to_pars[translator].append(datum_dict)

In [26]:
train_entire_dataset = 0
holdout_entire_dataset = 0
for t in translator_to_pars.keys():
    train_entire_dataset += len(translator_to_pars[t])
    holdout_entire_dataset += len(translator_to_pars_holdout[t])

In [27]:
aligned_train_df.shape[0]

27724

In [28]:
min_len = int(aligned_train_df.shape[0]/4)
print(min_len)
for t in translator_to_pars.keys():
    keep = sample(translator_to_pars[t], min_len) 
    translator_to_pars[t] = keep

min_len_h = len(translator_to_pars_holdout['Katz'])
print(min_len_h)
for t in translator_to_pars_holdout.keys():
    keep = sample(translator_to_pars_holdout[t], min_len_h) 
    translator_to_pars_holdout[t] = keep

6931
470


In [29]:
abs_total = 0
print('\nTrain + Holdout')
for k in translator_to_pars_holdout.keys():
    both = len(translator_to_pars_holdout[k]) + len(translator_to_pars[k])
    print(k, both)
    abs_total += both
print('Total', abs_total)

train_total = 0
min_class = 100000000000
print('\nTrain')
for k in translator_to_pars.keys():
    print(k, len(translator_to_pars[k]))
    if len(translator_to_pars[k]) < min_class:
        min_class = len(translator_to_pars[k])
    
train_total = len(translator_to_pars.keys()) * min_class

holdout_total = 0
min_class_h = 100000000000
print('\nHoldout')
for k in translator_to_pars_holdout.keys():
    print(k, len(translator_to_pars_holdout[k]))
    if len(translator_to_pars_holdout[k]) < min_class_h:
        min_class_h = len(translator_to_pars_holdout[k])

holdout_total = len(translator_to_pars.keys()) * min_class_h

print('Train total: ', min_class*5)
print('Val/Test total: ', min_class_h*5)
print()
print('train % = ', train_total/(holdout_total+train_total))
print('holdout % = ', holdout_total/(holdout_total+train_total))
print()
print('entire dataset % = ', (holdout_total+train_total)/(train_entire_dataset + holdout_entire_dataset))
print('entire train % = ', (train_total)/(train_entire_dataset))
print('holdout train % = ', (holdout_total)/(holdout_entire_dataset))


Train + Holdout
PV 7401
Garnett 7401
Katz 7401
McDuff 7401
Total 29604

Train
PV 6931
Garnett 6931
Katz 6931
McDuff 6931

Holdout
PV 470
Garnett 470
Katz 470
McDuff 470
Train total:  34655
Val/Test total:  2350

train % =  0.9364950682340224
holdout % =  0.06350493176597757

entire dataset % =  0.41511021369678613
entire train % =  0.4854661343419486
holdout train % =  0.13231981981981983


In [30]:
data_list = []
i = 0
for tr in translator_to_pars.keys():
    label = le.transform([tr])[0]
    for d in translator_to_pars[tr]:
        src, tgt = d['source'], d['translation']
        concat = src + ' <SEP> ' + tgt
        sent_dict = {'idx': d['idx'], 'book':d['book'], 'labels': label, 'concat': concat,  'translator': d['translator'], 'sim': d['sim'], 'src': src, 'tgt': tgt}
        data_list.append(sent_dict)
        i += 1

In [31]:
df = pd.DataFrame(data_list)
random_train_df = df
random_train_df.shape

(27724, 8)

In [32]:
# SAVE ALIGNED TRAIN
random_train_df.to_pickle("/home/kkatsy/litMT/experiment_dataset/4class_same_holdout_random_train_df.pickle")  

In [33]:
random_train_df.sample(n=20)

Unnamed: 0,idx,book,labels,concat,translator,sim,src,tgt
20239,2980,CrimeAndPunishment,1,"– Нет, нет; никогда и нигде! – вскрикнула Соня...",Katz,0.860224,"– Нет, нет; никогда и нигде! – вскрикнула Соня...","""No, no; never and nowhere!"" Sonya cried. ""I'l..."
1157,3619,AnnaKarenina,3,"Степан Аркадьич широко открыл свои блестящие, ...",PV,0.918005,"Степан Аркадьич широко открыл свои блестящие, ...","Stepan Arkadyich opened his shining, clear eye..."
16479,4422,TheBrothersKaramazov,1,– Про кого ты говоришь… про Митю? – в недоумен...,Katz,0.847434,– Про кого ты говоришь… про Митю? – в недоумен...,"""Who are you talking about . .. about Mitya?"" ..."
3512,762,AnnaKarenina,3,"– Что делать, придумай, Анна, помоги. Я все пе...",PV,0.853283,"– Что делать, придумай, Анна, помоги. Я все пе...","'What's to be done, think, Anna, help me. I've..."
9608,1135,CrimeAndPunishment,0,– Фатеру по ночам не нанимают; а к тому же вы ...,Garnett,0.463574,– Фатеру по ночам не нанимают; а к тому же вы ...,"and you ought to come up with the porter."""
3153,3557,Demons,3,– Вам-то что за дело? – спросил он вдруг с стр...,PV,0.98546,– Вам-то что за дело? – спросил он вдруг с стр...,"""What business is it of yours?"" he asked sudde..."
27553,5324,TheBrothersKaramazov,2,– А простить не хочешь! – прокричал Митя Груше...,McDuff,0.778375,– А простить не хочешь! – прокричал Митя Груше...,'But you do not want to forgive!' Mitya shoute...
7942,2588,CrimeAndPunishment,0,"– Да не беспокойтесь же о форме, – перебил Пор...",Garnett,0.800496,"– Да не беспокойтесь же о форме, – перебил Пор...","""Don't worry about the form,"" Porfiry interrup..."
5266,1931,CrimeAndPunishment,3,"– Ну, полноте, кто ж у нас на Руси себя Наполе...",PV,0.745844,"– Ну, полноте, кто ж у нас на Руси себя Наполе...","""But, my goodness, who in our Russia nowadays ..."
10236,3188,CrimeAndPunishment,0,"– Да что вы, Родион Романыч, такой сам не свой...",Garnett,0.823364,"– Да что вы, Родион Романыч, такой сам не свой...",Svidrigailov looked intently at Raskolnikov an...
