In [1]:
import os
import numpy as np
from tqdm import tqdm

# Directory holding the AAPD split files (text_* / label_*) read below; the
# generated CSV and JSON files are written into this same directory.
DATA_DIR = '../../datasets/AAPD'

In [31]:
def get_docs(filepath):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Args:
        filepath: Path of the text file to read.

    Returns:
        list[str]: One entry per line of the file, in file order, with
        leading and trailing whitespace (including the newline) removed.
        Blank lines are kept as empty strings.
    """
    # The context manager closes the handle as soon as reading finishes
    # instead of leaving it open until garbage collection.
    with open(filepath, 'r', encoding='utf-8') as doc:
        docs = [text.strip() for text in doc]
    return docs

def get_labels(filepath):
    """Read a UTF-8 label file and return the label tokens of every line.

    Args:
        filepath: Path of the label file to read.

    Returns:
        list[list[str]]: One inner list per line of the file, in file
        order, holding that line's whitespace-separated tokens. A blank
        line yields an empty list.
    """
    # The context manager closes the handle as soon as reading finishes
    # instead of leaving it open until garbage collection.
    with open(filepath, 'r', encoding='utf-8') as label_doc:
        # str.split() without arguments already ignores leading/trailing
        # whitespace, so no separate strip() is needed.
        labels = [label_line.split() for label_line in label_doc]
    return labels

In [69]:
# Documents of each split, one stripped line per document.
train_docs, valid_docs, test_docs = [
    get_docs(os.path.join(DATA_DIR, filename))
    for filename in ('text_train', 'text_val', 'text_test')
]

# Label rows of each split, one list of label tokens per line.
train_labels, valid_labels, test_labels = [
    get_labels(os.path.join(DATA_DIR, filename))
    for filename in ('label_train', 'label_val', 'label_test')
]

In [73]:
# Sizes of the three document lists followed by the three label lists;
# each document count should equal the label count of the same split.
tuple(len(split) for split in (train_docs, valid_docs, test_docs,
                               train_labels, valid_labels, test_labels))

(53840, 1000, 1000, 53840, 1000, 1000)

In [35]:
# Collect every distinct label that appears in the training split.
labels_set = set()
for labels in train_labels:
    labels_set.update(labels)
len(labels_set)

54

In [36]:
# Sort the label names so that every label gets the same position on every
# run. Iteration order of a set of strings follows their hashes, which change
# between interpreter sessions, so list(labels_set) would give a different
# order (and different tie-breaking in the frequency ranking) per run.
label_names = np.array(sorted(labels_set))
# Provisional mapping: label name -> position in label_names.
label_to_index = dict(zip(label_names, np.arange(len(label_names))))

In [70]:
def _encode_labels(label_lists, mapping):
    """Translate each list of label names into a sorted list of indices.

    Raises KeyError if a label is not a key of `mapping`.
    """
    return [sorted(mapping[label] for label in labels)
            for labels in label_lists]


# Count how often each label occurs in the training split. Positions come
# from label_names itself rather than from label_to_index: label_to_index is
# reassigned further down, so reading it here would make a second run of this
# cell depend on the output of the first.
position_of = {label: position for position, label in enumerate(label_names)}
frequences = [0] * len(label_names)
for labels in train_labels:
    for label in labels:
        frequences[position_of[label]] += 1

# Rank the positions by descending frequency: rank 0 is the most frequent.
index_to_frequency_index = dict(zip(np.argsort(frequences)[::-1],
                                    np.arange(len(frequences))))
# Convert the ranks to plain Python ints. NumPy integer scalars inside the
# per-document lists are rendered with repr() when the lists are stringified
# (e.g. on CSV export), and NumPy >= 2 prints them as "np.int64(20)".
frequency_indexes = [int(index_to_frequency_index[position])
                     for position in range(len(label_names))]
label_to_index = dict(zip(label_names, frequency_indexes))
index_to_label = dict(zip(frequency_indexes, label_names))

# label_to_index only knows labels seen in the training split, so a label
# that occurs solely in the validation or test split raises KeyError here.
labels_train = _encode_labels(train_labels, label_to_index)
labels_valid = _encode_labels(valid_labels, label_to_index)
labels_test = _encode_labels(test_labels, label_to_index)

In [88]:
# Same mapping as index_to_label, but keyed by the string form of each index.
index_to_label_str = {str(index): index_to_label[index]
                      for index in index_to_label}

In [90]:
import json
# Store the index -> label mapping as JSON in the dataset directory.
with open(os.path.join(DATA_DIR, 'index_to_label.json'), 'w') as outfile:
    outfile.write(json.dumps(index_to_label_str))

In [67]:
import pandas as pd

In [71]:
# One frame per split: the document text next to its sorted label indices.
df_train, df_valid, df_test = [
    pd.DataFrame({'text': docs, 'labels': encoded})
    for docs, encoded in ((train_docs, labels_train),
                          (valid_docs, labels_valid),
                          (test_docs, labels_test))
]

In [72]:
# Preview only the first rows instead of rendering the whole training frame.
df_train.head(10)

Unnamed: 0,text,labels
0,the relation between pearson 's correlation co...,"[20, 45]"
1,the present work studies quantum and classical...,"[0, 1, 22]"
2,one of the most important tasks in image proce...,"[15, 44]"
3,frequency diverse \( fd \) radar waveforms are...,"[0, 1, 44]"
4,unsupervised word embeddings have been shown t...,"[3, 14]"
5,a discrete time wiener phase noise channel wit...,"[0, 1]"
6,"in this paper , we consider a particular class...","[0, 1]"
7,the gallager bound is well known in the area o...,"[0, 1]"
8,symmetric tensor operations arise in a wide va...,"[32, 46]"
9,in a basic related key attack against a block ...,"[0, 1, 16, 22]"


In [74]:
# Write each split to its own CSV file in the dataset directory, without the
# row index column.
for frame, filename in ((df_train, 'train.csv'),
                        (df_valid, 'valid.csv'),
                        (df_test, 'test.csv')):
    frame.to_csv(os.path.join(DATA_DIR, filename),
                 encoding='utf-8', index=False)

In [75]:
# Whitespace-token count of every training document, then its mean and
# standard deviation.
doc_lengths = [len(doc.split()) for doc in train_docs]
np.mean(doc_lengths), np.std(doc_lengths)

(163.4520059435364, 67.62282864261516)