Reads the BUCC 2018 training files and converts them into a single CSV file (`data_<lang>-en.csv`) containing positive and negative sentence-pair samples.

In [None]:
import csv
import os
import pickle

import pandas as pd

In [None]:
# Source language code; the target language is always English.
lang = 'ru'

lang_root = 'bucc2018/{}-en/'.format(lang)
map_file_prefix = 'dict_bucc2018_{}-en'.format(lang)


def load_sentence_map(file_name):
    """Read a tab-separated ``id<TAB>text`` file from ``lang_root`` into a dict.

    Args:
        file_name: name of the file inside ``lang_root``.

    Returns:
        dict mapping sentence id -> sentence text. If an id occurs more than
        once, the last occurrence wins.
    """
    sentences = pd.read_csv(
        os.path.join(lang_root, file_name),
        sep='\t',
        names=['id', 'text'],
        # Treat quote characters as literal text: with the default quoting a
        # sentence that starts with a double quote is parsed as a quoted field
        # and can swallow the lines that follow it.
        quoting=csv.QUOTE_NONE,
    )
    return sentences.set_index('id')['text'].to_dict()


# key = sentence id, value = sentence text
# The source file extension follows `lang` (it used to be hard-coded to 'ru').
data_source_map = load_sentence_map('{0}-en.training.{0}'.format(lang))
data_target_map = load_sentence_map('{}-en.training.en'.format(lang))

def save_map(data_map, name):
    """Pickle ``data_map`` to ``<map_file_prefix>_<name>.pkl``.

    Args:
        data_map: object to serialize (here: a sentence id -> text dict).
        name: suffix that distinguishes the output file, e.g. 'source'.

    The file name is built from the module-level ``map_file_prefix`` and is a
    relative path; the highest available pickle protocol is used.
    """
    pickle_path = '{}_{}.pkl'.format(map_file_prefix, name)
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(data_map, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
        
# Persist both id -> text dictionaries as pickle files.
save_map(data_map=data_source_map, name='source')
save_map(data_map=data_target_map, name='target')

# Gold standard: each row pairs a source sentence id with a target sentence id.
gold = pd.read_csv(
    os.path.join(lang_root, '{}-en.training.gold'.format(lang)),
    sep='\t',
    names=['source', 'target'],
)

In [None]:
# Positive samples: one (source text, target text, label) tuple per gold pair
# whose two sentence ids can both be resolved to text.

POSITIVE_TAG = 1
NEGATIVE_TAG = 0

positive_data = []
error_count = 0
# Iterate the two id columns directly instead of building a Series per row.
for source_id, target_id in zip(gold['source'], gold['target']):
    try:
        positive_data.append((data_source_map[source_id], data_target_map[target_id], POSITIVE_TAG))
    except KeyError:
        # At least one id is missing from the sentence maps: skip the pair but count it.
        error_count += 1

# The error rate is the share of gold pairs that could not be resolved, so it is
# measured against the number of gold pairs (not the number of source sentences).
# max(..., 1) avoids a ZeroDivisionError when the gold file is empty.
print('Errors: {:.2f}%'.format(error_count * 100 / max(len(gold), 1)))
print('Positive data size: {}'.format(len(positive_data)))

Errors: 0.09%
Positive data size: 14029


In [None]:
# To get negative samples, each of the first len(positive_data) source sentences
# is paired with a randomly drawn target sentence.
# Individual sentences may also appear in positive samples, but a
# (source, target) combination that is a known positive pair is never emitted
# as a negative sample.

import random

# Fixed seed so that re-running the notebook produces the same negative samples.
NEGATIVE_SAMPLING_SEED = 42


def draw_negative_targets(source_texts, target_pool, positive_pairs, rng, max_redraws=100):
    """Pick one random target text per source text, avoiding known positive pairs.

    Args:
        source_texts: list of source sentences, one negative sample each.
        target_pool: list of candidate target sentences.
        positive_pairs: set of (source text, target text) tuples that must not
            be produced.
        rng: ``random.Random`` instance used for all draws.
        max_redraws: maximum number of replacement draws per source sentence.

    Returns:
        list of target texts, aligned with ``source_texts``.

    Raises:
        ValueError: if ``target_pool`` has fewer items than ``source_texts``
            (raised by ``rng.sample``).
        RuntimeError: if no non-positive target is found within ``max_redraws``.
    """
    # Initial draw is without replacement, so targets start out distinct.
    targets = rng.sample(target_pool, len(source_texts))
    for position, source_text in enumerate(source_texts):
        redraws = 0
        # Replace a target that happens to be the true translation of its source.
        # A replacement may repeat a target that is already used elsewhere.
        while (source_text, targets[position]) in positive_pairs:
            if redraws >= max_redraws:
                raise RuntimeError('Could not draw a negative target for: {!r}'.format(source_text))
            targets[position] = rng.choice(target_pool)
            redraws += 1
    return targets


negative_sources = list(data_source_map.values())[:len(positive_data)]
random_targets = draw_negative_targets(
    negative_sources,
    list(data_target_map.values()),
    {(source_text, target_text) for source_text, target_text, _ in positive_data},
    # Dedicated generator: seeding it does not touch the global `random` state.
    random.Random(NEGATIVE_SAMPLING_SEED),
)
negative_data = list(zip(negative_sources, random_targets, [NEGATIVE_TAG] * len(positive_data)))
assert len(positive_data) == len(negative_data)

In [None]:
# Combine both classes, shuffle them in place so the labels are interleaved,
# then load the tuples into a frame with named columns.
labelled_pairs = positive_data + negative_data
random.shuffle(labelled_pairs)
data = pd.DataFrame(labelled_pairs, columns=['source', 'target', 'label'])
data.head()

Unnamed: 0,source,target,label
0,До окончательного подчинения Астраханского хан...,"But it's never been the case, and I'm not a Su...",0
1,В декабре 1919 штабс-ротмистр Червинский предп...,These mutualisms could lead to a decline in bo...,0
2,"По некоторым оценкам, около 20% мировых запасо...",Some estimates suggest that about 20% of the w...,1
3,В результате кризиса 1997 года была признана н...,"Following the 1997 crisis, there was a consens...",1
4,Импортируются в страну некоторые зерновые и ма...,"General Michel Sulaiman added, that victory wa...",0


In [None]:
# Write the shuffled dataset next to the notebook; the row index is not stored.
data.to_csv(
    path_or_buf='data_{}-en.csv'.format(lang),
    index=False,
)