In [1]:
import numpy as np
import pandas as pd
import json
import pickle
from topics import topic_num_map

In [2]:
topics = list(topic_num_map.keys())

df = pd.DataFrame({'label': topics})


df['first_label'] = df['label'].apply(lambda x: x.split('.')[0])
df['second_label'] = df['label'].apply(lambda x: x.split('.')[1] if len(x.split('.')) > 1 else None)

# change the first_label and second_label for the following labels
# 'quant-ph' -> 'physics', 'quant-ph'
# 'cmp-lg' -> 'cs', 'cmp-lg'

df.loc[df['label'] == 'quant-ph', 'first_label'] = 'quant-ph'
df.loc[df['label'] == 'quant-ph', 'second_label'] = 'quant-ph'
df.loc[df['label'] == 'cmp-lg', 'first_label'] = 'cmp-lg'
df.loc[df['label'] == 'cmp-lg', 'second_label'] = 'cmp-lg'
df

Unnamed: 0,label,first_label,second_label
0,cs.it,cs,it
1,math.it,math,it
2,cs.lg,cs,lg
3,cs.ai,cs,ai
4,stat.ml,stat,ml
5,cs.ds,cs,ds
6,cs.si,cs,si
7,cs.dm,cs,dm
8,physics.soc-ph,physics,soc-ph
9,cs.lo,cs,lo


In [3]:
# print the unqiue first labels and second labels
print(df['first_label'].unique(), len(df['first_label'].unique()))
print(df['second_label'].unique(), len(df['second_label'].unique()))
first_labels = df['first_label'].unique()
second_labels = df['second_label'].unique()

['cs' 'math' 'stat' 'physics' 'quant-ph' 'cmp-lg' 'cond-mat' 'q-bio'
 'nlin'] 9
['it' 'lg' 'ai' 'ml' 'ds' 'si' 'dm' 'soc-ph' 'lo' 'co' 'cc' 'oc' 'ni' 'cv'
 'cl' 'cr' 'sy' 'dc' 'ne' 'ir' 'quant-ph' 'gt' 'cy' 'pl' 'se' 'pr' 'db'
 'cg' 'na' 'hc' 'ce' 'ma' 'ro' 'fl' 'st' 'th' 'dl' 'cmp-lg' 'mm'
 'stat-mech' 'pf' 'ap' 'ms' 'me' 'sc' 'dis-nn' 'nc' 'data-an' 'ao' 'qm'
 'nt'] 51


In [4]:
# request the content from https://arxiv.org/list/{first_label}

import requests
from bs4 import BeautifulSoup

def get_label_text(row):
    first_label = row['first_label']
    label = row['label']

    req_url = 'https://arxiv.org/list/{}'
    r = requests.get(req_url.format(first_label))
    soup = BeautifulSoup(r.text, 'html.parser')

    # get the title that's enclosed by <title> tag
    parent_tag = soup.title.text.replace('authors/titles', '').strip('\"' ).strip()

    r = requests.get(req_url.format(label))
    soup = BeautifulSoup(r.text, 'html.parser')
    child_tag = soup.title.text.replace('authors/titles', '').strip('\"' ).strip()

    return parent_tag, child_tag

In [5]:
df['Parent_label'] = df.apply(lambda x : get_label_text(x)[0], axis=1)
df['Child_label'] = df.apply(lambda x : get_label_text(x)[1], axis=1)

df

KeyboardInterrupt: 

In [6]:
# construct the taxonomy
hiera = {}

# for first_label in first_labels:
#     hiera[first_label] = set()
    
#     # get the unique labels with the first_label
#     sub_df = df[df['first_label'] == first_label].unique()

labels_set = {}
count = 0
for first_label in first_labels:
    labels_set[first_label] = count
    count += 1

for label in df['label'].unique():
    if not label in labels_set:
        labels_set[label] = count
        count += 1

labels_set
# save as label_dict.pkl
with open('new_label_dict.pkl', 'wb') as f:
    pickle.dump(labels_set, f)

In [7]:
# help to convert the label to the internal label
internal_label_mapping = {}

for k, v in topic_num_map.items():
    internal_label_mapping[v] = {k, k.split('.')[0]}
internal_label_mapping

{0: {'cs', 'cs.it'},
 1: {'math', 'math.it'},
 2: {'cs', 'cs.lg'},
 3: {'cs', 'cs.ai'},
 4: {'stat', 'stat.ml'},
 5: {'cs', 'cs.ds'},
 6: {'cs', 'cs.si'},
 7: {'cs', 'cs.dm'},
 8: {'physics', 'physics.soc-ph'},
 9: {'cs', 'cs.lo'},
 10: {'math', 'math.co'},
 11: {'cs', 'cs.cc'},
 12: {'math', 'math.oc'},
 13: {'cs', 'cs.ni'},
 14: {'cs', 'cs.cv'},
 15: {'cs', 'cs.cl'},
 16: {'cs', 'cs.cr'},
 17: {'cs', 'cs.sy'},
 18: {'cs', 'cs.dc'},
 19: {'cs', 'cs.ne'},
 20: {'cs', 'cs.ir'},
 21: {'quant-ph'},
 22: {'cs', 'cs.gt'},
 23: {'cs', 'cs.cy'},
 24: {'cs', 'cs.pl'},
 25: {'cs', 'cs.se'},
 26: {'math', 'math.pr'},
 27: {'cs', 'cs.db'},
 28: {'cs', 'cs.cg'},
 29: {'cs', 'cs.na'},
 30: {'cs', 'cs.hc'},
 31: {'math', 'math.na'},
 32: {'cs', 'cs.ce'},
 33: {'cs', 'cs.ma'},
 34: {'cs', 'cs.ro'},
 35: {'cs', 'cs.fl'},
 36: {'math', 'math.st'},
 37: {'stat', 'stat.th'},
 38: {'cs', 'cs.dl'},
 39: {'cmp-lg'},
 40: {'cs', 'cs.mm'},
 41: {'cond-mat', 'cond-mat.stat-mech'},
 42: {'cs', 'cs.pf'},
 43: {'

In [8]:
# create hiera
hiera = {}
for first_label in first_labels:
    if first_label in ['quant-ph', 'cmp-lg']:
        continue
    hiera[first_label] = set()
    # get the unique labels with the first_label
    sub_df = df[df['first_label'] == first_label]
    for _, row in sub_df.iterrows():
        hiera[first_label].add(row['label'])
hiera['Root'] = set(first_labels)

# save as aapd.taxonomy 
with open('aapd.taxonomy', 'w') as f:
    for k, v in hiera.items():
        f.write('{}\t{}\n'.format(k, '\t'.join(v)))

In [9]:
hiera

{'cs': {'cs.ai',
  'cs.cc',
  'cs.ce',
  'cs.cg',
  'cs.cl',
  'cs.cr',
  'cs.cv',
  'cs.cy',
  'cs.db',
  'cs.dc',
  'cs.dl',
  'cs.dm',
  'cs.ds',
  'cs.fl',
  'cs.gt',
  'cs.hc',
  'cs.ir',
  'cs.it',
  'cs.lg',
  'cs.lo',
  'cs.ma',
  'cs.mm',
  'cs.ms',
  'cs.na',
  'cs.ne',
  'cs.ni',
  'cs.pf',
  'cs.pl',
  'cs.ro',
  'cs.sc',
  'cs.se',
  'cs.si',
  'cs.sy'},
 'math': {'math.co',
  'math.it',
  'math.lo',
  'math.na',
  'math.nt',
  'math.oc',
  'math.pr',
  'math.st'},
 'stat': {'stat.ap', 'stat.me', 'stat.ml', 'stat.th'},
 'physics': {'physics.data-an', 'physics.soc-ph'},
 'cond-mat': {'cond-mat.dis-nn', 'cond-mat.stat-mech'},
 'q-bio': {'q-bio.nc', 'q-bio.qm'},
 'nlin': {'nlin.ao'},
 'Root': {'cmp-lg',
  'cond-mat',
  'cs',
  'math',
  'nlin',
  'physics',
  'q-bio',
  'quant-ph',
  'stat'}}

In [10]:
import torch
slot = {labels_set[k]: {labels_set[_v] for _v in v} for k, v in hiera.items()}
# save as slot.pt
# torch.save(slot, 'slot.pt')

  from .autonotebook import tqdm as notebook_tqdm


KeyError: 'Root'

In [62]:
parent_label_mapping = df[['first_label', 'Parent_label']].set_index('first_label').to_dict()['Parent_label']
child_label_mapping = df[['label', 'Child_label']].set_index('label').to_dict()['Child_label']

# combine the parent_label_mapping and child_label_mapping
label_mapping = {**parent_label_mapping, **child_label_mapping}

# save as label_mapping.json
with open('label_mapping.json', 'w') as f:
    json.dump(label_mapping, f)

In [83]:
from tqdm import tqdm
for mode in ['train', 'val', 'test']:
    text_path = f"./data/text_{mode}"
    label_path = f"./data/label_{mode}"
    texts = []
    labels = []
    output = []
    with open(text_path, 'r') as f:
        for line in f:
            texts.append(line.strip())
    with open(label_path, 'r') as f:
        for line in f:
            labels.append(line.strip())

    for label, text in tqdm(zip(labels, texts), total=len(labels)):
        _label = label.split(' ')
        _label = [internal_label_mapping[topic_num_map[str.lower(l)]] for l in _label]
        # flat the _label which is a list of set as set
        _label = list(set.union(*_label))
        output.append({'token': text, 'label': _label})

    # print(len(output))
    with open(f'./{mode}.jsonl', 'w') as f:
        for line in output:
            json.dump(line, f)
            f.write('\n')

100%|██████████| 53840/53840 [00:00<00:00, 544100.58it/s]
100%|██████████| 1000/1000 [00:00<00:00, 935810.80it/s]
100%|██████████| 1000/1000 [00:00<00:00, 951952.79it/s]


In [77]:
len(output)

1000

In [92]:
import sys
sys.path.append('../../')
from utils import get_hierarchy_info

hiera, _label_dict, r_hiera, label_depth = get_hierarchy_info('aapd.taxonomy')
r_hiera

{'cs.cr': 'cs',
 'cs.cc': 'cs',
 'cs.mm': 'cs',
 'cs.sy': 'cs',
 'cs.cy': 'cs',
 'cs.ro': 'cs',
 'cs.ai': 'cs',
 'cs.pf': 'cs',
 'cs.na': 'cs',
 'cs.ds': 'cs',
 'cs.dl': 'cs',
 'cs.ir': 'cs',
 'cs.hc': 'cs',
 'cs.gt': 'cs',
 'cs.fl': 'cs',
 'cs.it': 'cs',
 'cs.se': 'cs',
 'cs.dm': 'cs',
 'cs.ni': 'cs',
 'cs.db': 'cs',
 'cs.sc': 'cs',
 'cs.lg': 'cs',
 'cs.cg': 'cs',
 'cs.ms': 'cs',
 'cs.dc': 'cs',
 'cs.cl': 'cs',
 'cs.si': 'cs',
 'cs.pl': 'cs',
 'cs.ne': 'cs',
 'cs.ce': 'cs',
 'cs.lo': 'cs',
 'cs.cv': 'cs',
 'cs.ma': 'cs',
 'math.nt': 'math',
 'math.na': 'math',
 'math.oc': 'math',
 'math.st': 'math',
 'math.co': 'math',
 'math.lo': 'math',
 'math.pr': 'math',
 'math.it': 'math',
 'stat.me': 'stat',
 'stat.th': 'stat',
 'stat.ml': 'stat',
 'stat.ap': 'stat',
 'physics.data-an': 'physics',
 'physics.soc-ph': 'physics',
 'cond-mat.dis-nn': 'cond-mat',
 'cond-mat.stat-mech': 'cond-mat',
 'q-bio.qm': 'q-bio',
 'q-bio.nc': 'q-bio',
 'nlin.ao': 'nlin',
 'math': 'Root',
 'quant-ph': 'Root',
 '

In [93]:
from transformers import AutoTokenizer
import os

deta_path = '../nyt/'
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

label_dict = torch.load(os.path.join(deta_path, 'bert_value_dict.pt'))
label_dict = {i: tokenizer.decode(v) for i, v in label_dict.items()}
label_dict

{0: 'news',
 1: 'business',
 2: 'new york and region',
 3: 'new jersey',
 4: 'classifieds',
 5: 'paid death notices',
 6: 'connecticut',
 7: 'opinion',
 8: 'opinion',
 9: 'corrections',
 10: 'editorials',
 11: 'sports',
 12: 'pro basketball',
 13: 'world',
 14: 'americas',
 15: 'countries and territories',
 16: 'features',
 17: 'travel',
 18: 'guides',
 19: 'destinations',
 20: 'central and south america',
 21: 'u. s.',
 22: 'health',
 23: 'diseases, conditions, and health topics',
 24: 'europe',
 25: 'europe',
 26: 'france',
 27: 'the city',
 28: 'activities and interests',
 29: 'north america',
 30: 'united states',
 31: 'new york',
 32: 'style',
 33: 'fashion and style',
 34: 'u. s. states, territories and possessions',
 35: 'new york',
 36: 'mid - atlantic',
 37: 'job market',
 38: 'job categories',
 39: 'banking, finance and insurance',
 40: 'new york city',
 41: 'columns',
 42: 'your money',
 43: 'front page',
 44: 'south',
 45: 'education',
 46: 'education',
 47: 'middle east',


In [13]:
import os
with open(os.path.join('./', 'new_label_dict.pkl'), 'rb') as f:
    label_dict = pickle.load(f)
# lable_dict = {v : k for k, v in label_dict.items()}

label_dict

{'cs': 0,
 'math': 1,
 'stat': 2,
 'physics': 3,
 'quant-ph': 4,
 'cmp-lg': 5,
 'cond-mat': 6,
 'q-bio': 7,
 'nlin': 8,
 'cs.it': 9,
 'math.it': 10,
 'cs.lg': 11,
 'cs.ai': 12,
 'stat.ml': 13,
 'cs.ds': 14,
 'cs.si': 15,
 'cs.dm': 16,
 'physics.soc-ph': 17,
 'cs.lo': 18,
 'math.co': 19,
 'cs.cc': 20,
 'math.oc': 21,
 'cs.ni': 22,
 'cs.cv': 23,
 'cs.cl': 24,
 'cs.cr': 25,
 'cs.sy': 26,
 'cs.dc': 27,
 'cs.ne': 28,
 'cs.ir': 29,
 'cs.gt': 30,
 'cs.cy': 31,
 'cs.pl': 32,
 'cs.se': 33,
 'math.pr': 34,
 'cs.db': 35,
 'cs.cg': 36,
 'cs.na': 37,
 'cs.hc': 38,
 'math.na': 39,
 'cs.ce': 40,
 'cs.ma': 41,
 'cs.ro': 42,
 'cs.fl': 43,
 'math.st': 44,
 'stat.th': 45,
 'cs.dl': 46,
 'cs.mm': 47,
 'cond-mat.stat-mech': 48,
 'cs.pf': 49,
 'math.lo': 50,
 'stat.ap': 51,
 'cs.ms': 52,
 'stat.me': 53,
 'cs.sc': 54,
 'cond-mat.dis-nn': 55,
 'q-bio.nc': 56,
 'physics.data-an': 57,
 'nlin.ao': 58,
 'q-bio.qm': 59,
 'math.nt': 60}

In [15]:
value_dict = {v: k for k, v in label_dict.items()}
# save as value_dict.pt
torch.save(value_dict, 'value_dict.pt')

In [114]:
data_path = '../aapd/'
hiera, label_dict, r_hiera, label_depth = get_hierarchy_info(os.path.join(data_path, 'aapd.taxonomy'))
label_dict

{'cs.cc': 0,
 'cs.cr': 1,
 'cs.mm': 2,
 'cs.sy': 3,
 'cs.cy': 4,
 'cs.ro': 5,
 'cs.ai': 6,
 'cs.pf': 7,
 'cs.na': 8,
 'cs.ds': 9,
 'cs.dl': 10,
 'cs.ir': 11,
 'cs.hc': 12,
 'cs.gt': 13,
 'cs.fl': 14,
 'cs.it': 15,
 'cs.se': 16,
 'cs.dm': 17,
 'cs.ni': 18,
 'cs.db': 19,
 'cs.sc': 20,
 'cs.lg': 21,
 'cs.cg': 22,
 'cs.ms': 23,
 'cs.dc': 24,
 'cs.cl': 25,
 'cs.si': 26,
 'cs.pl': 27,
 'cs.ne': 28,
 'cs.ce': 29,
 'cs.lo': 30,
 'cs.cv': 31,
 'cs.ma': 32,
 'math.nt': 33,
 'math.na': 34,
 'math.oc': 35,
 'math.st': 36,
 'math.co': 37,
 'math.lo': 38,
 'math.pr': 39,
 'math.it': 40,
 'stat.me': 41,
 'stat.ap': 42,
 'stat.ml': 43,
 'stat.th': 44,
 'physics.data-an': 45,
 'physics.soc-ph': 46,
 'cond-mat.dis-nn': 47,
 'cond-mat.stat-mech': 48,
 'q-bio.qm': 49,
 'q-bio.nc': 50,
 'nlin.ao': 51,
 'math': 52,
 'quant-ph': 53,
 'physics': 54,
 'q-bio': 55,
 'nlin': 56,
 'stat': 57,
 'cs': 58,
 'cond-mat': 59,
 'cmp-lg': 60}

In [112]:
r_hiera

{'Nonfiction': 'Root',
 'Children’s Books': 'Root',
 'Teen & Young Adult': 'Root',
 'Fiction': 'Root',
 'Humor': 'Root',
 'Classics': 'Root',
 'Poetry': 'Root',
 'Literary Collections': 'Classics',
 'Literary Criticism': 'Classics',
 'Fiction Classics': 'Classics',
 'Nonfiction Classics': 'Classics',
 'Teen & Young Adult Fiction': 'Teen & Young Adult',
 'Teen & Young Adult Fantasy Fiction': 'Teen & Young Adult',
 'Teen & Young Adult Historical Fiction': 'Teen & Young Adult',
 'Teen & Young Adult Romance': 'Teen & Young Adult',
 'Teen & Young Adult Social Issues': 'Teen & Young Adult',
 'Teen & Young Adult Nonfiction': 'Teen & Young Adult',
 'Teen & Young Adult Science Fiction': 'Teen & Young Adult',
 'Teen & Young Adult Mystery & Suspense': 'Teen & Young Adult',
 'Teen & Young Adult Action & Adventure': 'Teen & Young Adult',
 'Children’s Middle Grade Books': 'Children’s Books',
 'Children’s Boxed Sets': 'Children’s Books',
 'Children’s Board Books': 'Children’s Books',
 'Childrens Medi