In [18]:
from tokenizers.processors import TemplateProcessing
from tokenizers import Tokenizer, normalizers
from tokenizers.normalizers import NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

import pandas as pd 
import numpy as np 
import pickle as pkl
import json

from sklearn.model_selection import train_test_split

# Data Pre-Processing and vocabulary building

## 1 - Data Splitting 

In [10]:
path_to_data = './data/arxiv.csv'
arxiv_dataset = pd.read_csv(path_to_data, index_col=[0])
arxiv_dataset

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,id,title,abstract
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-..."
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...
...,...,...,...
1934880,supr-con/9608008,On the origin of the irreversibility line in t...,We report on measurements of the angular dep...
1934881,supr-con/9609001,Nonlinear Response of HTSC Thin Film Microwave...,The non-linear microwave surface impedance o...
1934882,supr-con/9609002,Critical State Flux Penetration and Linear Mic...,The vortex contribution to the dc field (H) ...
1934883,supr-con/9609003,Density of States and NMR Relaxation Rate in A...,We show that the density of states in an ani...


In [33]:
abstract = arxiv_dataset['abstract'].to_numpy()
title = arxiv_dataset['title'].to_numpy()

subsample_abstract, _ , subsample_title, _  = train_test_split(abstract, title, test_size = 0.99)

train_abstract, test_abstract, train_title, test_title = train_test_split(subsample_abstract, subsample_title, test_size = 0.2)

train_abstract, dev_abstract, train_title, dev_title = train_test_split(train_abstract, train_title, test_size = 0.2)


train = {'abstract' : train_abstract, 'title': train_title}
test = {'abstract' : test_abstract, 'title': test_title}
dev = {'abstract' : dev_abstract, 'title': dev_title}


train_df = pd.DataFrame(train)
test_df = pd.DataFrame(test)
dev_df = pd.DataFrame(dev)

train_df = train_df.head(512)
dev_df = dev_df.head(64)
test_df = test_df.head(64)

In [34]:
path_to_preprocess_data = './data/preprocess/'
pkl.dump(train_df, open(path_to_preprocess_data + 'train.pkl', 'wb'))
pkl.dump(test_df, open(path_to_preprocess_data + 'test.pkl', 'wb'))
pkl.dump(dev_df, open(path_to_preprocess_data + 'dev.pkl', 'wb'))

## 2 - Pre-tokenization, Tokenization and Learn the Vocabulary 

In [35]:
wp_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

wp_tokenizer.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[
        ("[PAD]", 0),
        ("[SOS]", 1),
        ("[EOS]", 2),
        ("[UNK]", 3),
    ],
)

wp_tokenizer.pre_tokenizer = Whitespace()
wp_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
wp_tokenizer.decoder = decoders.WordPiece()

In [36]:
wp_trainer = WordPieceTrainer(
    vocab_size=30522, special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"]
)

train_df = pkl.load(open(path_to_preprocess_data + 'train.pkl', 'rb'))

train = np.concatenate((train_df['abstract'].to_numpy(), train_df['title'].to_numpy()), axis = 0)
train

wp_tokenizer.train_from_iterator(train, wp_trainer)
wp_tokenizer.save("./data/preprocess/vocab.json")

17454