<a href="https://colab.research.google.com/github/sayakpaul/A-B-testing-with-Machine-Learning/blob/master/multi_label_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from ast import literal_eval
import pandas as pd

## Read data and perform basic EDA

In [None]:
arxiv_data = pd.read_csv(
    "https://github.com/soumik12345/multi-label-text-classification/releases/download/v0.2/arxiv_data.csv"
)
arxiv_data.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [None]:
print(f"There are {len(arxiv_data)} rows in the dataset.")

There are 51774 rows in the dataset.


In [None]:
total_duplicate_titles = sum(arxiv_data["titles"].duplicated())
print(f"There are {total_duplicate_titles} duplicate titles.")

There are 12802 duplicate titles.


In [None]:
arxiv_data = arxiv_data[~arxiv_data["titles"].duplicated()]
print(f"There are {len(arxiv_data)} rows in the deduplicated dataset.")

There are 38972 rows in the deduplicated dataset.


In [None]:
# There are some terms with occurence as low as 1.
sum(arxiv_data["terms"].value_counts() == 1)

2321

In [None]:
# How many unique terms?
arxiv_data["terms"].nunique()

3157

In [None]:
# Filtering the rare terms.
arxiv_data_filtered = arxiv_data.groupby("terms").filter(lambda x: len(x) > 1)
arxiv_data_filtered.shape

(36651, 3)

## Convert the string labels to list of strings. 

The initial labels are represented as raw strings. Here we make them `List[str]` for a more compact representation. 

In [None]:
arxiv_data_filtered["terms"] = arxiv_data_filtered["terms"].apply(
    lambda x: literal_eval(x)
)
arxiv_data_filtered["terms"].values[:5]

array([list(['cs.CV', 'cs.LG']), list(['cs.CV', 'cs.AI', 'cs.LG']),
       list(['cs.CV', 'cs.AI']), list(['cs.CV']),
       list(['cs.CV', 'cs.LG'])], dtype=object)

## Stratified splits because of class imbalance

In [None]:
test_split = 0.1

# Initial train and test split.
train_df, test_df = train_test_split(
    arxiv_data_filtered,
    test_size=test_split,
    stratify=arxiv_data_filtered["terms"].values,
)

# Splitting the test set further into validation
# and new test sets.
val_df = test_df.sample(frac=0.5)
test_df.drop(val_df.index, inplace=True)

print(f"Number of rows in training set: {len(train_df)}")
print(f"Number of rows in validation set: {len(val_df)}")
print(f"Number of rows in test set: {len(test_df)}")

Number of rows in training set: 32985
Number of rows in validation set: 1833
Number of rows in test set: 1833


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Multi-label binarization

In [None]:
mlb = MultiLabelBinarizer()
mlb.fit_transform(train_df["terms"])
mlb.classes_

array(['14J60 (Primary) 14F05, 14J26 (Secondary)', '62H30', '62H35',
       '62H99', '65D19', '68', '68Q32', '68T01', '68T05', '68T07',
       '68T10', '68T30', '68T45', '68T99', '68Txx', '68U01', '68U10',
       'E.5; E.4; E.2; H.1.1; F.1.1; F.1.3', 'F.2.2; I.2.7', 'G.3',
       'H.3.1; H.3.3; I.2.6; I.2.7', 'H.3.1; I.2.6; I.2.7', 'I.2',
       'I.2.0; I.2.6', 'I.2.1', 'I.2.10', 'I.2.10; I.2.6',
       'I.2.10; I.4.8', 'I.2.10; I.4.8; I.5.4', 'I.2.10; I.4; I.5',
       'I.2.10; I.5.1; I.4.8', 'I.2.1; J.3', 'I.2.6', 'I.2.6, I.5.4',
       'I.2.6; I.2.10', 'I.2.6; I.2.7', 'I.2.6; I.2.7; H.3.1; H.3.3',
       'I.2.6; I.2.8', 'I.2.6; I.2.9', 'I.2.6; I.5.1', 'I.2.6; I.5.4',
       'I.2.7', 'I.2.8', 'I.2; I.2.6; I.2.7', 'I.2; I.4; I.5', 'I.2; I.5',
       'I.2; J.2', 'I.4', 'I.4.0', 'I.4.3', 'I.4.4', 'I.4.5', 'I.4.6',
       'I.4.6; I.4.8', 'I.4.8', 'I.4.9', 'I.4.9; I.5.4', 'I.4; I.5',
       'I.5.4', 'K.3.2', 'astro-ph.IM', 'cond-mat.dis-nn',
       'cond-mat.mtrl-sci', 'cond-mat.soft', 'c

## Data preprocessing and `tf.data.Dataset` objects

Get percentile estimates of the sequence lengths. 

In [None]:
train_df["summaries"].apply(lambda x: len(x.split(" "))).describe()

count    32985.000000
mean       156.502471
std         41.538054
min          5.000000
25%        128.000000
50%        154.000000
75%        183.000000
max        462.000000
Name: summaries, dtype: float64

Notice that 50% of the abstracts have a length of 158. So, any number near that is a good enough approximate for the maximum sequence length. 

In [None]:
max_seqlen = 150
batch_size = 128


def unify_text_length(text, label):
    unified_text = tf.strings.substr(text, 0, max_seqlen)
    return tf.expand_dims(unified_text, -1), label


def make_dataset(dataframe, train=True):
    label_binarized = mlb.transform(dataframe["terms"].values)
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["summaries"].values, label_binarized)
    )
    if train:
        dataset = dataset.shuffle(batch_size * 10)
    dataset = dataset.map(unify_text_length).cache()
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
train_dataset = make_dataset(train_df)
validation_dataset = make_dataset(val_df, False)
test_dataset = make_dataset(test_df, False)

## Dataset preview

In [None]:
text_batch, label_batch = next(iter(train_dataset))

for i, text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()[None, ...]
    print(f"Abstract: {text[0]}")
    print(f"Label(s): {mlb.inverse_transform(label)[0]}")
    print(" ")

Abstract: b'We study the effect of the stochastic gradient noise on the training of\ngenerative adversarial networks (GANs) and show that it can prevent the\nconver'
Label(s): ('cs.LG', 'math.OC', 'stat.ML')
 
Abstract: b'Sensitive medical data is often subject to strict usage constraints. In this\npaper, we trained a generative adversarial network (GAN) on real-world\nel'
Label(s): ('cs.LG',)
 
Abstract: b'Popular rotated detection methods usually use five parameters (coordinates of\nthe central point, width, height, and rotation angle) to describe the ro'
Label(s): ('cs.CV',)
 
Abstract: b'FPN is a common component used in object detectors, it supplements\nmulti-scale information by adjacent level features interpolation and summation.\nHow'
Label(s): ('cs.CV',)
 
Abstract: b'Data for Image segmentation models can be costly to obtain due to the\nprecision required by human annotators. We run a series of experiments showing\nt'
Label(s): ('cs.CV',)
 


## Vocabulary size for vectorization

In [None]:
train_df["total_words"] = train_df["summaries"].str.split().str.len()
vocabulary_size = train_df["total_words"].max()
print(f"Vocabulary size: {vocabulary_size}")

Vocabulary size: 498


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Create model with `TextVectorization`

In [None]:
text_vectorizer = layers.TextVectorization(
    max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf"
)

with tf.device("/CPU:0"):
    text_vectorizer.adapt(train_dataset.map(lambda text, label: text))


def make_model():
    shallow_mlp_model = keras.Sequential(
        [
            keras.Input(shape=(), dtype=tf.string),
            text_vectorizer,
            layers.Dense(512, activation="relu"),
            layers.Dense(256, activation="relu"),
            layers.Dense(len(mlb.classes_), activation="softmax"),
        ]
    )
    return shallow_mlp_model

With the CPU placement, we run into: 

```
(1) Invalid argument: During Variant Host->Device Copy: non-DMA-copy attempted of tensor type: string
```

In [None]:
shallow_mlp_model = make_model()
shallow_mlp_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 498)               1         
_________________________________________________________________
dense (Dense)                (None, 512)               255488    
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 152)               39064     
Total params: 425,881
Trainable params: 425,880
Non-trainable params: 1
_________________________________________________________________


## Train the model

In [None]:
epochs = 20

shallow_mlp_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["categorical_accuracy"]
)

shallow_mlp_model.fit(train_dataset, validation_data=validation_dataset, epochs=epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f70b6e88810>

## Evaluate the model

In [None]:
_, categorical_acc = shallow_mlp_model.evaluate(test_dataset)
print(f"Categorical accuracy on the test set: {round(categorical_acc * 100, 2)}%.")

Categorical accuracy on the test set: 62.3%.
