## Imports

In [1]:
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from ast import literal_eval
import pandas as pd
import numpy as np

## Read data and perform basic EDA

In [67]:
arxiv_data = pd.read_csv(
    "https://github.com/soumik12345/multi-label-text-classification/releases/download/v0.1/arxiv_data.csv"
)
arxiv_data.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [69]:
print(f"There are {len(arxiv_data)} rows in the dataset.")

There are 12999 rows in the dataset.


In [18]:
# There are some terms who occurence is as low as 1.
sum(arxiv_data["terms"].value_counts() == 1)

552

In [19]:
# How many unique terms?
arxiv_data["terms"].nunique()

960

In [20]:
# Filtering the rare terms.
arxiv_data_filtered = arxiv_data.groupby("terms").filter(lambda x: len(x) > 1)
arxiv_data_filtered.shape

(12447, 3)

## Convert the string labels to list of strings. 

The initial labels are represented as raw strings. Here we make them `List[str]` for a more compact representation. 

In [22]:
arxiv_data_filtered["terms"] = arxiv_data_filtered["terms"].apply(
    lambda x: literal_eval(x)
)
arxiv_data_filtered["terms"].values[:5]

array([list(['cs.CV', 'cs.LG']), list(['cs.CV', 'cs.AI', 'cs.LG']),
       list(['cs.CV', 'cs.AI']), list(['cs.CV']),
       list(['cs.CV', 'cs.LG'])], dtype=object)

## Stratified splits because of class imbalance

In [23]:
val_split = 0.1

train_df, val_df = train_test_split(
    arxiv_data_filtered,
    test_size=val_split,
    stratify=arxiv_data_filtered["terms"].values,
)

train_df.shape, val_df.shape

((11202, 3), (1245, 3))

## Multi-label binarization

In [27]:
mlb = MultiLabelBinarizer()
mlb.fit_transform(train_df["terms"])
mlb.classes_

array(['05B45, 62H30, 54E05, 68T10', '26A33',
       '49-06 (Primary), 49-11(Secondary)', '49Q10, 62H35', '54E40',
       '60D05, 62C99', '60G40, 65C60, 68T99', '62H30', '62P10, 62F15',
       '62P99', '65K05, 62F10, 65D19', '65K10, 68T45', '65Kxx, 65Yxx',
       '65Z05', '68', '68-06', '68R10, 05C50, 65F15, 65T50, 68T05, 62H30',
       '68T01', '68T05', '68T05, 68T45', '68T07', '68T10', '68T30',
       '68T45', '68T45 (Primary) 68T07 (Secondary)', '68T50, 68T05',
       '68U01', '68U10', '68U10 (Primary) 94A08, 54H30 (Secondary)',
       '68U10, 05C85', '68U10, 62M05, 62H30, 65C20', '68U10, 68W99',
       '94A08, 68U10, 65K10, 35K55, 49Q10', '97R40', 'C.1.3',
       'C.4; I.2.6; I.2.10; I.4.6; I.4.9; J.4',
       'Computing methodologies for image processing',
       'E.1; I.4; I.5; I.6', 'F.2.2', 'G.1.0; G.1.6', 'G.1.6', 'I.2',
       'I.2.0; I.5.0', 'I.2.1, I.4.6,', 'I.2.10',
       'I.2.10; I.2.10; I.2.1; I.4.6; I.4.7; I.4.8; I.4.9; I.5.2; I.5.5',
       'I.2.10; I.2.6', 'I.2.10; I

## Data preprocessing and `tf.data.Dataset` objects

Get percentile estimates of the sequence lengths. 

In [24]:
train_df["summaries"].apply(lambda x: len(x.split(" "))).describe()

count    11202.000000
mean       159.733976
std         40.835108
min          5.000000
25%        132.000000
50%        158.000000
75%        186.000000
max        292.000000
Name: summaries, dtype: float64

Notice that 50% of the abstracts have a length of 158. So, any number near that is a good approximate for the maximum sequence length. 

In [30]:
max_seqlen = 150
batch_size = 128


def unify_text_length(text, label):
    unified_text = tf.strings.substr(text, 0, max_seqlen)
    return tf.expand_dims(unified_text, -1), label


def make_dataset(dataframe, train=True):
    label_binarized = mlb.transform(dataframe["terms"].values)
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["summaries"].values, label_binarized)
    )
    if train:
        dataset = dataset.shuffle(batch_size * 10)
    dataset = dataset.map(unify_text_length).cache()
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [31]:
train_dataset = make_dataset(train_df)
validation_dataset = make_dataset(val_df)

## Dataset preview

In [46]:
text_batch, label_batch = next(iter(train_dataset))

for i, text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()[None, ...]
    print(f"Abstract: {text[0]}")
    print(f"Label(s): {mlb.inverse_transform(label)[0]}")
    print(" ")

Abstract: b'We study the effect of the stochastic gradient noise on the training of\ngenerative adversarial networks (GANs) and show that it can prevent the\nconver'
Label(s): ('cs.LG', 'math.OC', 'stat.ML')
 
Abstract: b'Sensitive medical data is often subject to strict usage constraints. In this\npaper, we trained a generative adversarial network (GAN) on real-world\nel'
Label(s): ('cs.LG',)
 
Abstract: b'Popular rotated detection methods usually use five parameters (coordinates of\nthe central point, width, height, and rotation angle) to describe the ro'
Label(s): ('cs.CV',)
 
Abstract: b'FPN is a common component used in object detectors, it supplements\nmulti-scale information by adjacent level features interpolation and summation.\nHow'
Label(s): ('cs.CV',)
 
Abstract: b'Data for Image segmentation models can be costly to obtain due to the\nprecision required by human annotators. We run a series of experiments showing\nt'
Label(s): ('cs.CV',)
 


## Vocabulary size for vectorization

In [61]:
train_df["total_words"] = train_df["summaries"].str.split().str.len()
train_df["total_words"].max()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


316

## Create model with `TextVectorization`

In [65]:
vocabulary_size = train_df["total_words"].max()
text_vectorizer = layers.TextVectorization(
    max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf"
)

with tf.device("/CPU:0"):
    text_vectorizer.adapt(train_dataset.map(lambda text, label: text))


def make_model():
    shallow_mlp_model = keras.Sequential(
        [
            keras.Input(shape=(), dtype=tf.string),
            text_vectorizer,
            layers.Dense(512, activation="relu"),
            layers.Dense(256, activation="relu"),
            layers.Dense(len(mlb.classes_), activation="softmax"),
        ]
    )
    return shallow_mlp_model

With the CPU placement, we run into: 

```
(1) Invalid argument: During Variant Host->Device Copy: non-DMA-copy attempted of tensor type: string
```

In [66]:
shallow_mlp_model = make_model()
shallow_mlp_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_7 (TextVe (None, 316)               1         
_________________________________________________________________
dense_11 (Dense)             (None, 512)               162304    
_________________________________________________________________
dense_12 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_13 (Dense)             (None, 158)               40606     
Total params: 334,239
Trainable params: 334,238
Non-trainable params: 1
_________________________________________________________________


## Train the model

In [64]:
epochs = 20

shallow_mlp_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["categorical_accuracy"]
)

shallow_mlp_model.fit(train_dataset, validation_data=validation_dataset, epochs=epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f95f6706510>