# 1. Data ingestion

In [9]:
from tqdm.notebook import tqdm
import tensorflow as tf
import numpy as np
import requests
import json
import sys
import re
import gc
import os

# Make sure the cyclic garbage collector is switched on, then report the
# TensorFlow version this notebook is running against.
gc.enable()
print(tf.__version__)

# Location of the raw metadata file, read from the environment.
# This is None when the FullData_PATH variable is not set.
FullData_PATH = os.environ.get("FullData_PATH")

## RAW DATA
def get_metadata():
    """Lazily yield the raw metadata file one line at a time.

    Reads the file named by the module-level ``FullData_PATH``. Nothing is
    opened until the generator is first iterated, and the file is closed once
    the generator is exhausted or garbage-collected.

    The file is opened explicitly as UTF-8 (each line is later parsed as
    JSON) so the result does not depend on the platform's default encoding.

    Yields:
        str: the next line of the file, trailing newline included.
    """
    with open(FullData_PATH, 'r', encoding='utf-8') as f:
        yield from f

# Directory for abstract text files. makedirs also creates the parent "data"
# directory when it is missing and is a no-op when the directory exists.
os.makedirs('data/abstracts', exist_ok=True)

## CATEGORY MAPPING
# Download the mapping from category code (as found in a paper's "categories"
# field) to category name. The timeout keeps the cell from hanging on a dead
# connection, and raise_for_status() fails fast on an HTTP error instead of
# trying to parse an error page as JSON.
response = requests.get(
    "https://raw.githubusercontent.com/samsatp/topimo-v2/main/data/mapping.json",
    timeout=30,
)
response.raise_for_status()
category_map = response.json()
category_map_val = list(category_map.values())

2.6.0


In [10]:
# get_metadata is a generator function, so this only creates the generator:
# no file is read until it is iterated, and it can be consumed only once.
metadata = get_metadata()

In [11]:
def getContent(paper):
    """Parse one metadata line and return its abstract and category codes.

    Args:
        paper: a JSON document (one line of the metadata file) as a string.

    Returns:
        ``(abstract, cats)`` where ``cats`` is the whitespace-split list of
        category codes, when the record has "title", "abstract" and
        "categories" fields and every code is a key of ``category_map``.
        ``(False, False)`` otherwise.

    Raises:
        json.JSONDecodeError: if ``paper`` is not valid JSON.
    """
    paper = json.loads(paper)
    try:
        paper["title"]                      # presence check only
        abstract = paper["abstract"]
        cats = paper["categories"].split()
    except (KeyError, TypeError, AttributeError):
        # Missing field, record that is not a JSON object, or a "categories"
        # value that is not a string: treat the record as unusable.
        return False, False

    # Explicit membership test instead of `assert`, which is stripped when
    # Python runs with -O.
    if any(cat not in category_map for cat in cats):
        return False, False
    return abstract, cats

In [12]:
## STORE DATASET
categories = []    # Each paper's categories (category names)
paths = []         # Path of the text file each abstract was saved to
target = []        # 1. = the paper's real categories, 0. = randomly drawn ones

len_abstract = []  # Character length of each saved abstract (just for viz)

# The abstracts are written under "abstracts/"; create it up front so the
# open() below cannot fail with FileNotFoundError on a fresh checkout.
os.makedirs("abstracts", exist_ok=True)

# NOTE(review): no NumPy seed is set in the visible cells, so the positive /
# negative split and the random categories differ from run to run.
for ind, paper in tqdm(enumerate(metadata)):
    # Subsample: keep only records whose index modulo 100 is 0..5.
    if ind % 100 > 5:
        continue

    abstract, paperCategories = getContent(paper)
    if not abstract:
        continue

    ## ABSTRACT
    # Flatten to a single line, marking the original line breaks with "<br>".
    abstract = abstract.strip().replace("\n", "<br>")
    if not abstract:
        # Whitespace-only abstract: an empty file would contribute no line to
        # TextLineDataset and shift every later abstract against its
        # categories / target.
        continue

    ## WHETHER OR NOT THIS PAPER WILL BE CONSIDERED TRUE
    isTrue = np.random.uniform() > 0.4

    ## SAVE DATASET TO DISK
    path = f"abstracts/{ind}.txt"
    with open(path, "w", encoding="utf-8") as f:
        f.write(abstract)
    paths.append(path)

    ## CATEGORIES & TARGET
    if isTrue:
        paper_cat = [category_map[e] for e in paperCategories]
        target.append(1.)
    else:
        # Negative example: 1 to 7 distinct category names drawn at random.
        n_cat = np.random.randint(low=1, high=8)
        paper_cat = np.random.choice(category_map_val, size=n_cat, replace=False)
        target.append(0.)
    categories.append(paper_cat)

    # VIZ
    len_abstract.append(len(abstract))

0it [00:00, ?it/s]

In [13]:
# Summarise the size of the dataset and the split between the two labels.
n_total = len(target)
n_positive = sum(target)
print(f'total data: {n_total}')
print(f'target distribution 1: {n_positive} , 0:{n_total - n_positive}')

total data: 108460
target distribution 1: 64998.0 , 0:43462.0


# 2. Preprocessing data
## 2.1 Abstract text
> Tokenized at the sentence level

In [10]:
# Peek at the first two saved abstract paths.
paths[:2]

['abstracts/0.txt', 'abstracts/1.txt']

In [45]:
def splito(text):
    """Split ``text`` on the "<br>" marker using ``tf.strings.split``."""
    return tf.strings.split(text, sep="<br>")


# Line-oriented text dataset over the abstract files listed in `paths`.
abstract_dataset = tf.data.TextLineDataset(paths)

In [46]:
## Sample data
# Print the first element of the abstract dataset.
for sample_abstract in abstract_dataset.take(1):
    print(sample_abstract)

tf.Tensor(b'A fully differential calculation in perturbative quantum chromodynamics is<br>presented for the production of massive photon pairs at hadron colliders. All<br>next-to-leading order perturbative contributions from quark-antiquark,<br>gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as<br>all-orders resummation of initial-state gluon radiation valid at<br>next-to-next-to-leading logarithmic accuracy. The region of phase space is<br>specified in which the calculation is most reliable. Good agreement is<br>demonstrated with data from the Fermilab Tevatron, and predictions are made for<br>more detailed tests with CDF and DO data. Predictions are shown for<br>distributions of diphoton pairs produced at the energy of the Large Hadron<br>Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs<br>boson are contrasted with those produced from QCD processes at the LHC, showing<br>that enhanced sensitivity to the signal can be obtained with ju

## 2.2 Categories
> Tokenized at the word level

> All of a paper's categories are concatenated together, and their embedding vectors are then averaged.

In [25]:
# Number of categories attached to each paper, and the largest such count.
len_cat = list(map(len, categories))
max_len_cat = max(len_cat)

# Make sure every entry is a plain Python list.
categories = list(map(list, categories))

# Flat list of every category name across all papers: the tokenizer corpus.
categoris_VocabRepo = [name for paper_cats in categories for name in paper_cats]

# Word-level tokenizer over the category names; unseen words map to "<unk>".
vect = tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>", split=' ')
vect.fit_on_texts(categoris_VocabRepo)

In [26]:
# Read the tokenizer's index -> word table back from its JSON-encoded config.
vocab = json.loads(vect.get_config()['index_word'])
# Number of entries in the index -> word table.
# NOTE(review): presumably token ids start at 1 with 0 left for padding, so an
# embedding over these ids would need vocab_size_cat + 1 rows — confirm
# against the Keras Tokenizer / Embedding docs.
vocab_size_cat = len(vocab)

vocab_size_cat

204

In [32]:
# One space-separated string of category names per paper.
categories_squeeze = list(map(' '.join, categories))
categories_squeeze[:3]

['Rings and Algebras Programming Languages General Mathematics Other Statistics',
 'Combinatorics Computational Geometry',
 'General Physics']

In [34]:
# Map each paper's category string to a list of word ids, then right-pad the
# lists with zeros into one rectangular integer array.
# (The original passed an undefined name `tok` to pad_sequences, which raises
# NameError on a fresh kernel.)
tokenized = vect.texts_to_sequences(categories_squeeze)
tokenized = tf.keras.preprocessing.sequence.pad_sequences(tokenized, padding='post')

In [37]:
# Shape of the padded id array: one row per paper, one column per token slot.
tokenized.shape

(108460, 29)

In [47]:
BATCH_SIZE = 32

# Rebuilt from `paths` instead of re-batching the existing dataset, so that
# re-running this cell does not stack a second batch dimension.
abstract_dataset = tf.data.TextLineDataset(paths).batch(BATCH_SIZE)

# Same batch size and no shuffling, so batch k of each dataset covers the same
# papers as batch k of the others.
categories_dataset = tf.data.Dataset.from_tensor_slices(tokenized).batch(BATCH_SIZE)

target_dataset = tf.data.Dataset.from_tensor_slices(target).batch(BATCH_SIZE)

# 3. `Universal Sentence Encoder`

In [38]:
from tensorflow.keras.layers import GRU, Dense, Embedding
from typing import List, Union
import tensorflow_text as text
import tensorflow_hub as hub
import sentencepiece

# Universal Sentence Encoder (large, v5) wrapped as a Keras layer from the
# TF Hub handle below; trainable=False keeps its weights frozen.
embed = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-large/5", trainable=False)

In [104]:
class UseZeroClf(tf.keras.Model):
    """Scores how well a set of category tokens matches an abstract.

    Each abstract is split into "<br>"-separated pieces, every piece is
    embedded with the module-level ``embed`` layer, and a GRU summarises the
    piece embeddings into one vector per document. The category token ids are
    embedded and averaged over their non-padding positions. Both vectors are
    concatenated and passed through the ReLU dense stack; the final layer has
    a single unit and no activation, so the output is a raw score.
    """

    def __init__(self, gru_units: int, dense_units: List[int], category_vocab_size: int, emb_dim: int):
        """
        Args:
            gru_units: size of the GRU state, i.e. of the per-document vector.
            dense_units: width of each hidden ReLU layer, in order.
            category_vocab_size: number of rows of the category embedding
                table; must be larger than the largest token id (id 0 is
                treated as padding).
            emb_dim: size of each category token embedding.
        """
        super().__init__()
        self.gru_units = gru_units
        self.gru = GRU(units=gru_units, dropout=0.1)
        self.denses = [
            Dense(units=unit, activation='relu') for unit in dense_units
        ]
        self.n_denses = len(dense_units)
        self.final_dense = Dense(units=1)
        self.embedding = Embedding(category_vocab_size, emb_dim, mask_zero=True)

    def call(self, doc: List[str], cat_ids: List[List[int]]):
        """Return one score per document, shape (batch_size, 1).

        Args:
            doc: batch of abstracts, pieces separated by "<br>".
            cat_ids: batch of zero-padded category token ids.
        """
        # One row of pieces per document.
        sentences = self.split_doc(doc)

        doc_vectors = []
        for sentence in sentences:
            # Embed every piece of this document, then add a leading batch
            # axis so the GRU reads the pieces as a single sequence.
            piece_embeddings = embed(sentence)
            doc_vectors.append(self.gru(tf.expand_dims(piece_embeddings, 0)))  # (1, gru_units)

        embedded_sentences = tf.concat(doc_vectors, axis=0)   # (batch_size, gru_units)

        all_emb = self.embedding(cat_ids)                     # (batch_size, n_tokens, emb_dim)
        # Average over real tokens only: a plain reduce_mean would also count
        # the embedding of the padding id 0, diluting short category lists.
        pad_mask = tf.cast(tf.not_equal(cat_ids, 0), all_emb.dtype)[..., tf.newaxis]
        token_counts = tf.maximum(tf.reduce_sum(pad_mask, axis=1), 1.0)   # avoid 0/0 on all-padding rows
        avg_emb = tf.reduce_sum(all_emb * pad_mask, axis=1) / token_counts  # (batch_size, emb_dim)

        output = tf.concat([embedded_sentences, avg_emb], axis=-1)  # (batch_size, gru_units + emb_dim)

        for dense in self.denses:
            output = dense(output)

        output = self.final_dense(output)

        return output

    def split_doc(self, text):
        """Split each abstract in ``text`` on the "<br>" marker."""
        texts = tf.strings.split(text, sep="<br>")
        return texts

In [106]:
## Test model
# Token ids run from 1 to vocab_size_cat and 0 is the padding id, so the
# embedding table needs vocab_size_cat + 1 rows; with only vocab_size_cat rows
# the highest id falls outside the table.
model = UseZeroClf(7, [10], vocab_size_cat + 1, 12)
for ab, cat in zip(abstract_dataset.take(1), categories_dataset.take(1)):
    p = model(ab, cat)

(32, 7)
