In [9]:
import pandas as pd
import numpy as np
import json
import os

In [10]:
train_path = './aapd_train.json'
dev_path = './aapd_val.json'
test_path = './aapd_test.json'


def _read_json_lines(path):
    """Parse the file at ``path`` as one JSON document per line.

    Returns the decoded documents as a list, in file order.
    """
    with open(path) as fd:
        return [json.loads(line) for line in fd]


# Each split lives in its own file; load all three into memory.
train_data = _read_json_lines(train_path)
val_data = _read_json_lines(dev_path)
test_data = _read_json_lines(test_path)
        

In [11]:
# Peek at the first training record to see which fields each JSON line carries.
train_data[0]

{'token': ['relation',
  'pearson',
  "'s",
  'correlation',
  'coefficient',
  'salton',
  "'s",
  'cosine',
  'measure',
  'revealed',
  'based',
  'different',
  'possible',
  'values',
  'division',
  'l1',
  'norm',
  'l2',
  'norm',
  'vector',
  'different',
  'values',
  'yield',
  'sheaf',
  'increasingly',
  'straight',
  'lines',
  'form',
  'together',
  'cloud',
  'points',
  'investigated',
  'relation',
  'theoretical',
  'results',
  'tested',
  'author',
  'co',
  'citation',
  'relations',
  'among',
  '24',
  'informetricians',
  'two',
  'matrices',
  'constructed',
  'based',
  'co',
  'citations',
  'asymmetric',
  'occurrence',
  'matrix',
  'symmetric',
  'co',
  'citation',
  'matrix',
  'examples',
  'completely',
  'confirm',
  'theoretical',
  'results',
  'results',
  'enable',
  'us',
  'specify',
  'algorithm',
  'provides',
  'threshold',
  'value',
  'cosine',
  'none',
  'corresponding',
  'pearson',
  'correlations',
  'would',
  'negative',
  'using'

In [3]:
# Number of labels attached to every record, across train, validation and test.
labels = [
    len(record['label'])
    for split in (train_data, val_data, test_data)
    for record in split
]

longest = max(labels)
shortest = min(labels)
average = np.mean(labels)

print('max label length: ', longest)
print('min label length: ', shortest)
print('mean label length: ', average)

max label length:  13
min label length:  3
mean label length:  4.086515042979943


In [8]:
# Spot-check the display name stored for the 'stat' code.
# NOTE(review): in the cells visible here, label_mapping is first assigned in a
# later cell (loaded from ./label_mapping.json), so on a fresh top-to-bottom
# run this lookup would raise NameError — confirm the intended cell order.
label_mapping['stat']

'Statistics'

In [9]:
# label_mapping.json maps each label code (e.g. 'cs.cl') to a display name.
with open('./label_mapping.json', 'r') as fd:
    label_mapping = json.load(fd)

# Invert the mapping (display name -> list of codes) so that display names
# shared by more than one code can be spotted.
inv_mapping = {}
for code, name in label_mapping.items():
    inv_mapping.setdefault(name, []).append(code)

# A notebook cell only displays its last expression, so a bare expression in
# the middle of the cell is silently discarded. Keep the result in a name and
# print it so the duplicates are actually visible.
duplicated_names = {
    name: codes for name, codes in inv_mapping.items() if len(codes) > 1
}
print('display names shared by several codes:', duplicated_names)

# Give the colliding codes distinct, archive-prefixed display names.
label_mapping['cs.cl'] = 'CS - Computation and Language'
label_mapping['cs.it'] = 'CS - Information Theory'
label_mapping['math.it'] = 'Math - Information Theory'
label_mapping['cs.lg'] = 'CS - Machine Learning'
label_mapping['stat.ml'] = 'Statistics - Machine Learning'
label_mapping['cs.na'] = 'CS - Numerical Analysis'
label_mapping['math.na'] = 'Math - Numerical Analysis'
label_mapping['math.st'] = 'Math - Statistics Theory'
label_mapping['stat.th'] = 'Statistics Theory'

In [11]:
# Overwrite ./label_mapping.json with the current in-memory label_mapping.
with open('./label_mapping.json', 'w') as fd:
    fd.write(json.dumps(label_mapping))

In [5]:
import nltk
import re

def filter_non_tokens(sentence):
    """Drop tokens that contain no word characters (pure punctuation or empty).

    Args:
        sentence: Either a raw string, which is first split with
            ``nltk.word_tokenize``, or an already tokenized iterable of
            strings, which is filtered as-is.

    Returns:
        list of str: the tokens containing at least one regex word
        character, in their original order.
    """
    # The records in this notebook already store a list under 'token', and
    # nltk.word_tokenize only accepts text, so only raw strings are tokenized.
    if isinstance(sentence, str):
        tokens = nltk.word_tokenize(sentence)
    else:
        tokens = list(sentence)

    # Matches tokens made up solely of non-word characters, including ''.
    punctuation_pattern = re.compile(r'^\W*$')

    return [token for token in tokens if not punctuation_pattern.match(token)]

# Lazily filled by _english_stop_words(); None until first use.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def filter_stop_words(tokens, stop_words=None):
    """Return ``tokens`` without stop words, preserving order.

    Args:
        tokens: Iterable of string tokens.
        stop_words: Optional iterable of words to remove. When omitted,
            NLTK's English stop-word list is used.

    Returns:
        list of str: the tokens that are not stop words.
    """
    # Resolve the stop-word collection once per call rather than once per
    # token, and use a set so each membership test is a hash lookup.
    if stop_words is None:
        stop_words = _english_stop_words()
    else:
        stop_words = frozenset(stop_words)

    return [token for token in tokens if token not in stop_words]

# Strip stop words from the 'token' list of every record, in place, for all
# three splits. Lower-casing, punctuation filtering and label renaming are
# not applied in this step.
for split in (train_data, val_data, test_data):
    for record in split:
        record['token'] = filter_stop_words(record['token'])

In [6]:
# Write each split back to disk, one JSON document per line.
for out_path, records in (
    ('./aapd_train.json', train_data),
    ('./aapd_val.json', val_data),
    ('./aapd_test.json', test_data),
):
    with open(out_path, 'w') as fd:
        fd.writelines(json.dumps(record) + '\n' for record in records)

In [17]:
# Read the taxonomy file: one entry per line, fields separated by tabs.
with open('./aapd.taxonomy', 'r') as fd:
    raw_lines = fd.readlines()

# Replace every field except the literal 'Root' by its label_mapping entry.
# NOTE(review): this cell overwrites its own input, so running it again would
# look up the already-renamed fields in label_mapping — presumably a KeyError
# unless those names are also keys; confirm before re-running.
taxonomy = []
for raw_line in raw_lines:
    fields = raw_line.strip('\n').split('\t')
    renamed = [field if field == 'Root' else label_mapping[field] for field in fields]
    taxonomy.append('\t'.join(renamed))

# Overwrite the taxonomy file with the renamed entries.
with open('./aapd.taxonomy', 'w') as fd:
    fd.writelines(entry + '\n' for entry in taxonomy)

In [8]:
from collections import Counter
# Flatten the training split into one list of labels and one list of tokens.
labels = [label for record in train_data for label in record['label']]
vocabs = [token for record in train_data for token in record['token']]

# Frequency tables as (item, count) pairs, most frequent first; items with
# equal counts keep their first-seen order.
label_count = Counter(labels).most_common()
vocab_count = Counter(vocabs).most_common()

# Both dictionaries are written below ./vocab_aapd, created if missing.
if not os.path.exists('./vocab_aapd'):
    os.mkdir('./vocab_aapd')

# One "<item>\t<count>" line per entry.
for dict_path, counts in (
    ('./vocab_aapd/label.dict', label_count),
    ('./vocab_aapd/vocab.dict', vocab_count),
):
    with open(dict_path, 'w') as fd:
        fd.writelines('\t'.join((item, str(freq))) + '\n' for item, freq in counts)