# 1. Data ingestion
- `get_metadata()` is a Python generator that reads one record at a time, so the whole dataset is never loaded into memory at once.
- `category_map` maps each paper-category abbreviation to its full name. The mapping was collected by scraping the official arXiv website.

In [2]:
from tqdm.notebook import tqdm
import tensorflow as tf
import pandas as pd
import numpy as np
import requests
import json
import sys
import re
import gc
import os

# Make sure the cyclic garbage collector is switched on, then log the
# TensorFlow version this notebook is running against.
gc.enable()
print(tf.__version__)

# Location of the raw metadata file, read from the DATA_PATH environment
# variable (None when the variable is not set).
data_path = os.environ.get("DATA_PATH")

## RAW DATA
def get_metadata(path=None):
    """Lazily yield the raw metadata file one line (one record) at a time.

    Args:
        path: File to read. Defaults to the module-level ``data_path`` so
            existing zero-argument calls keep working.

    Yields:
        Each line of the file as a ``str``, trailing newline included.
    """
    source = data_path if path is None else path
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between systems.
    with open(source, 'r', encoding='utf-8') as f:
        yield from f

# Create the output folder for the per-paper abstract files; `exist_ok=True`
# makes re-running the cell a no-op without a separate existence check.
os.makedirs('abstracts', exist_ok=True)

## CATEGORY MAPPING
# Download the abbreviation -> full-name mapping. The timeout keeps the cell
# from hanging forever on a stalled connection, and raise_for_status() turns
# an HTTP error into an explicit exception instead of a confusing JSON
# decoding failure on an error page.
response = requests.get(
    "https://raw.githubusercontent.com/samsatp/topimo-v2/main/data/mapping.json",
    timeout=30,
)
response.raise_for_status()
category_map = response.json()
category_map_val = list(category_map.values())   # full names only, used for random sampling

2.6.0


In [3]:
# Create the lazy record generator. A generator is single-use: once it has been
# consumed, re-run this cell before iterating over the file again.
metadata = get_metadata()

`getContent()` checks the quality of each data record and extracts only the necessary parts.

In [4]:
def getContent(paper, valid_categories=None):
    """Validate one raw metadata record and extract its abstract and categories.

    A record is accepted when it is valid JSON, has "title", "abstract" and
    "categories" entries, and every whitespace-separated category code is a
    known one.

    Args:
        paper: One raw JSON line from the metadata file.
        valid_categories: Container of accepted category codes. Defaults to
            the module-level ``category_map``.

    Returns:
        ``(abstract, categories)`` for an accepted record, where ``categories``
        is the list of category codes; ``(False, False)`` for a rejected one.
    """
    known = category_map if valid_categories is None else valid_categories
    try:
        record = json.loads(paper)
        record["title"]                      # presence check only
        abstract = record["abstract"]
        cats = record["categories"].split()
    except (ValueError, KeyError, TypeError, AttributeError):
        # Malformed JSON, a missing field, or a field of the wrong type.
        return False, False

    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if any(cat not in known for cat in cats):
        return False, False
    return abstract, cats

The `for` loop below iterates through all records in the data and collects only the necessary information.

In [5]:
## STORE DATASET
categories = []    # Each paper's categories (full names)
paths = []         # Path of the file each abstract was saved to
target = []        # 1 = the paper's real categories, 0 = randomly drawn categories

len_abstract = []  # Character length of each abstract (just for viz)


for ind, paper in tqdm(enumerate(metadata)):
    # Subsample: keep only the first 6 records out of every 100.
    if ind % 100 > 5:
        continue

    abstract, paperCategories = getContent(paper)
    if not abstract:
        continue

    ## ABSTRACT
    # Flatten to a single line so each saved file holds exactly one line.
    abstract = abstract.strip().replace("\n", "<br>")
    # A whitespace-only abstract is empty after stripping. Writing it would
    # produce an empty file, i.e. zero lines when the files are later read
    # line by line, which would shift every following abstract against its
    # categories and target. Skip such records.
    if not abstract:
        continue

    ## WHETHER OR NOT THIS PAPER WILL BE CONSIDERED TRUE
    isTrue = np.random.uniform() > 0.4

    ## SAVE DATASET TO DISK
    path = f"abstracts/{ind}.txt"
    paths.append(path)
    with open(path, "w", encoding="utf-8") as f:
        f.write(abstract)

    ## CATEGORIES & TARGET
    if isTrue:
        # Positive example: the paper's own categories, as full names.
        paper_cat = [category_map[e] for e in paperCategories]
        target.append(1)
    else:
        # Negative example: a random set of category names.
        # uniform(1, 8) truncated by int() gives a size between 1 and 7.
        n_cat = np.random.uniform(low=1, high=8)
        paper_cat = np.random.choice(category_map_val, size=int(n_cat), replace=False)
        target.append(0)
    categories.append(paper_cat)

    # VIZ
    len_abstract.append(len(abstract))

0it [00:00, ?it/s]

Collected data distribution

In [6]:
# Summary of the collected examples; `target` holds only 0/1 labels, so the
# negatives can be counted directly.
print(f'total data: {len(target)}')
print(f'target distribution 1: {sum(target)} , 0: {target.count(0)}')

total data: 108460
target distribution 1: 65241 , 0: 43219


In [14]:
import zipfile
    
def zipfolder(foldername, target_dir):
    """Compress every file under ``target_dir`` into ``foldername + '.zip'``.

    Args:
        foldername: Output archive path without the ``.zip`` extension.
        target_dir: Directory whose files are archived recursively. Archive
            member names are stored relative to this directory.
    """
    # The context manager guarantees the archive is finalised and closed,
    # even if adding a file fails part-way through.
    with zipfile.ZipFile(foldername + '.zip', 'w', zipfile.ZIP_DEFLATED) as zipobj:
        for base, _dirs, files in os.walk(target_dir):
            for file in files:
                fn = os.path.join(base, file)
                # relpath instead of slicing off a fixed prefix length, which
                # breaks when target_dir ends with a path separator.
                zipobj.write(fn, os.path.relpath(fn, target_dir))

# Archive everything under 'abstracts' into 'saved_abstracts.zip'.
zipfolder('saved_abstracts', 'abstracts')

# 2. Preprocessing data
- `categories` = ragged matrix in which each row is a paper and each column is one of that paper's categories.
- `categoris_VocabRepo` = flattened `categories` (List[str]; each str is a single category)
- `categories_squeeze` = joined `categories` (List[str]; each str is all of a paper's categories joined together)

In [6]:
## Longest category list over all papers
max_len_cat = max(map(len, categories))

# Coerce every row to a plain list so `categories` is a List[List[str]]
categories = list(map(list, categories))

# Flatten one level: every category name of every paper, in order
categoris_VocabRepo = [name for row in categories for name in row]

# Tokenizer that splits on spaces and maps unseen tokens to "<unk>",
# fitted on all category names
vect = tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>", split=' ')
vect.fit_on_texts(categoris_VocabRepo)

# index -> word table from the tokenizer config (stored there as a JSON string)
vocab = json.loads(vect.get_config()['index_word'])
vocab_size_cat = len(vocab)

# One space-joined string per paper holding all of its categories
categories_squeeze = list(map(' '.join, categories))

# Integer-encode each joined string, then right-pad to a common length
tokenized = tf.keras.preprocessing.sequence.pad_sequences(
    vect.texts_to_sequences(categories_squeeze),
    padding='post',
)

### Build the Tensorflow dataset

In [10]:
BATCH_SIZE = 32

def splito(text):
    """Split one stored abstract line into its parts on the "<br>" separator."""
    return tf.strings.split(text, sep="<br>")

## INPUTS
# One row of padded category token ids per paper.
categories_dataset = tf.data.Dataset.from_tensor_slices(tokenized)

# One line per abstract file, split into a variable-length vector of strings.
abstract_dataset = tf.data.TextLineDataset(paths)
abstract_dataset = abstract_dataset.map(splito)

inputs_datasets = tf.data.Dataset.zip((abstract_dataset, categories_dataset))

## TARGETS
target_dataset = tf.data.Dataset.from_tensor_slices(target)

## FINAL DATASET
# Elements are ((abstract_parts, category_ids), target). padded_batch pads the
# variable-length abstract parts within each batch. Use the BATCH_SIZE constant
# rather than a second hard-coded 32 so the two cannot drift apart.
datasets = tf.data.Dataset.zip((inputs_datasets, target_dataset))
datasets = datasets.padded_batch(BATCH_SIZE)

Check for the shape of each batch of data.

In [11]:
# Print the shapes of the first two batches. inputs[0] is the abstract part of
# the batch and inputs[1] the category ids (the order they were zipped in).
# NOTE: `inputs` and `targets` stay bound to the last batch after the loop;
# a later cell reuses `inputs` as a sample batch.
for inputs, targets in datasets.take(2):
    print(f'inputs[abstract].shape = {inputs[0].shape}')
    print(f'inputs[categories].shape = {inputs[1].shape}')
    print()

inputs[abstract].shape = (32, 21)
inputs[categories].shape = (32, 29)

inputs[abstract].shape = (32, 19)
inputs[categories].shape = (32, 29)



# 3. Universal Sentence Encoder

In [13]:
from tensorflow.keras.layers import GRU, Dense, Embedding
from typing import List, Union
import tensorflow_text as text
import tensorflow_hub as hub
import sentencepiece

# Universal Sentence Encoder (large, version 5) loaded from TF Hub as a Keras
# layer; trainable=False keeps its weights frozen.
embed = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-large/5", trainable=False)

Check for the shape of some operation used in the model.

In [15]:
# Encode a single record (a vector of strings), then a whole batch by mapping
# the encoder over the batch dimension.
a = embed(inputs[0][0])
b = tf.map_fn(embed, elems=inputs[0], fn_output_signature=tf.float32)

print(f'embed(a_record).shape = {a.shape}')
print(f'embed(a_batch).shape  = {b.shape}')

embed(a_record).shape = (19, 512)
embed(a_batch).shape  = (32, 19, 512)


In [16]:
class UseZeroClf(tf.keras.Model):
    """Binary classifier scoring whether a set of categories fits an abstract.

    Each string of an abstract is embedded by the module-level `embed` layer
    and the resulting sequence is summarised by a GRU. The category token ids
    are embedded and mean-pooled over the non-padding positions. Both
    summaries are concatenated, passed through the ReLU dense stack and
    projected to one unnormalised logit per example.

    Args:
        gru_units: Size of the GRU state summarising the abstract.
        dense_units: Width of each hidden dense layer, in order.
        category_vocab_size: Number of distinct category tokens; one extra
            embedding row is reserved for the padding id 0.
        emb_dim: Dimension of the category token embeddings.
    """

    def __init__(self, gru_units: int, dense_units: List[int], category_vocab_size: int, emb_dim: int):
        super().__init__()
        self.gru_units = gru_units
        self.gru = GRU(units=gru_units, dropout=0.1)
        self.denses = [
            Dense(units=unit, activation='relu') for unit in dense_units
        ]
        self.n_denses = len(dense_units)
        self.final_dense = Dense(units=1)   # no activation: the model outputs logits
        self.embedding = Embedding(category_vocab_size+1, emb_dim, mask_zero=True)

    def call(self, inputs, training=None):
        """Compute one logit per example.

        Args:
            inputs: Pair ``(docs, cat_ids)``: a batch of padded string
                sequences and a batch of zero-padded category token ids.
            training: Keras training flag, forwarded to the GRU so its
                dropout is only active during training.

        Returns:
            Tensor of shape (batch_size, 1) holding unnormalised logits.
        """
        docs, cat_ids = inputs

        embedded_sentences = tf.map_fn(fn=embed, elems=docs, fn_output_signature=tf.float32)  # (batch_size, padded_len, 512)
        # NOTE(review): padding strings added by padded_batch are embedded and
        # fed to the GRU like real text; consider masking them as well.
        gru_outputs = self.gru(embedded_sentences, training=training)    # (batch_size, gru_units)

        all_emb = self.embedding(cat_ids)                                # (batch_size, n_tokens, emb_dim)
        # tf.reduce_mean does not honour the Keras mask produced by
        # mask_zero=True, so average explicitly over the non-padding ids only.
        pad_mask = tf.cast(tf.not_equal(cat_ids, 0), all_emb.dtype)[..., tf.newaxis]
        summed = tf.reduce_sum(all_emb * pad_mask, axis=1)
        n_tokens = tf.reduce_sum(pad_mask, axis=1)
        avg_emb = tf.math.divide_no_nan(summed, n_tokens)                # (batch_size, emb_dim)

        output = tf.concat([gru_outputs, avg_emb], axis=-1)  # (batch_size, emb_dim+gru_units)

        for dense in self.denses:
            output = dense(output)

        return self.final_dense(output)

In [17]:
# Deliberately small configuration: 4 GRU units, one hidden layer of 10 units,
# 12-dimensional category embeddings.
model = UseZeroClf(
    gru_units=4,
    dense_units=[10],
    category_vocab_size=vocab_size_cat,
    emb_dim=12,
)

# 4. Training loop

In [19]:
optimizer = tf.keras.optimizers.Adam()
# The model outputs raw logits, hence from_logits=True.
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# The plain 'accuracy' string resolves to binary accuracy with a 0.5 threshold,
# which assumes probabilities. On logits the decision boundary is 0, so set the
# threshold explicitly (the metric name stays 'accuracy' in logs and history).
model.compile(
    optimizer=optimizer,
    loss=loss_object,
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

In [20]:
## Test model
for inputs, y_true in datasets.take(1):
    p = model(inputs)
print(f'output.shape: {p.shape}')
print(loss_object(y_true, p).numpy())

output.shape: (32, 1)
0.68775654


In [None]:
# No `steps_per_epoch`: `datasets` is a finite, non-repeating tf.data pipeline,
# so Keras iterates it once per epoch on its own. Passing an explicit step count
# for such a dataset makes the iterator run out of data after the first epoch
# and cuts training short.
model.fit(datasets, epochs=5)