In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training CSV; later cells read its TITLE and ABSTRACT columns and the label columns.
df = pd.read_csv("/kaggle/input/multilabel-classification-dataset/train.csv")

In [None]:
def preprocess_input(df):
    """Placeholder for a text preprocessing pipeline; currently a no-op.

    The nested helpers are unimplemented stubs and none of them is invoked,
    so calling this function does nothing with ``df`` and returns ``None``.
    """

    def tokenize():
        """Stub: not implemented."""

    def stemming():
        """Stub: not implemented."""

    def get_id_from_text():
        """Stub: not implemented."""

    def get_text_from_id():
        """Stub: not implemented."""
        

In [None]:
# Names of the label columns extracted from the dataframe as training targets.
label_tags = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]
# Names of the text columns (not referenced by the other visible cells).
input_tags = ['TITLE', 'ABSTRACT']

**Data Preprocessing**

In [None]:
import tensorflow as tf
import tensorflow_text as tf_text
from sklearn.model_selection import train_test_split

In [None]:
from tensorflow.keras.layers import TextVectorization
import re
import string
# Custom standardization: lowercase the text, replace HTML break tags '<br />' with a space, and strip punctuation.
def custom_standardization(input_data):
  """Lowercase the text, swap '<br />' tags for a space, and delete punctuation.

  Args:
    input_data: String tensor (or value accepted by ``tf.strings.lower``).

  Returns:
    The standardized string tensor.
  """
  # Character class matching any single character from string.punctuation.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  # Lowercasing happens first, so the lowercase tag pattern also catches
  # upper-case variants of the break tag.
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')


# Vocabulary size and number of words in a sequence.

def get_vectorize_layer(vocab_size=10000, sequence_length=100):
    """Build a ``TextVectorization`` layer that emits integer token ids.

    The layer uses ``custom_standardization`` and is returned without being
    adapted; the caller must call ``adapt`` on it before use.

    Args:
        vocab_size: Passed as ``max_tokens``.
        sequence_length: Passed as ``output_sequence_length``.

    Returns:
        The configured ``TextVectorization`` layer.
    """
    layer_config = {
        'standardize': custom_standardization,
        'max_tokens': vocab_size,
        'output_mode': 'int',
        'output_sequence_length': sequence_length,
    }
    return TextVectorization(**layer_config)

def get_text_from_tokens(tensor, vocab):
    """Map token ids back to vocabulary entries and join them into one string.

    Args:
        tensor: Object whose ``numpy()`` method yields an iterable of token ids.
        vocab: Sequence indexed by token id.

    Returns:
        The kept entries concatenated, each preceded by a single space (so a
        non-empty result starts with a space). Ids whose vocabulary entry is
        falsy, such as an empty string, are skipped.
    """
    pieces = []
    for token_id in tensor.numpy():
        word = vocab[token_id]
        if word:
            pieces.append(f" {word}")
    return "".join(pieces)
        
    

In [None]:
def get_labels(df, tags):
    """Return the label values of every row as a list of lists.

    Args:
        df: DataFrame containing every column named in ``tags``.
        tags: Iterable of column names to extract, in output order.

    Returns:
        A list with one entry per row of ``df`` (in row order); each entry is
        the list of that row's values for the ``tags`` columns. An empty
        frame yields an empty list.
    """
    # Select the label columns once and convert in bulk rather than walking
    # the frame with ``iterrows``, which materializes a Series for every row.
    return df[list(tags)].to_numpy().tolist()

def get_train_val_input_data(df, label_cols, train_size=0.8, random_state=None):
    """Shuffle-split ``df`` and package the text columns and labels per split.

    Args:
        df: DataFrame with 'TITLE' and 'ABSTRACT' columns plus every column
            named in ``label_cols``.
        label_cols: Label column names, forwarded to ``get_labels``.
        train_size: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded 0.8.
        random_state: Forwarded to ``train_test_split``. Pass an integer to
            make the split reproducible across runs; the default ``None``
            keeps the original unseeded behavior.

    Returns:
        A dict with keys:
          'train_data'  -> (list of titles, list of abstracts) for training rows
          'train_label' -> labels for training rows, from ``get_labels``
          'valid_data'  -> (list of titles, list of abstracts) for validation rows
          'valid_label' -> labels for validation rows, from ``get_labels``
    """
    train_data, valid_data = train_test_split(
        df, train_size=train_size, shuffle=True, random_state=random_state)

    return {
        'train_data': (list(train_data['TITLE']), list(train_data['ABSTRACT'])),
        'train_label': get_labels(train_data, label_cols),
        'valid_data': (list(valid_data['TITLE']), list(valid_data['ABSTRACT'])),
        'valid_label': get_labels(valid_data, label_cols),
    }

In [None]:
def get_dataset(input_data, labels, title_vectorize_layer, abstract_vectorize_layer, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of vectorized (title, abstract) pairs and labels.

    Args:
        input_data: Pair ``(titles, abstracts)`` of equal-length string
            sequences, e.g. the 'train_data' entry built by
            ``get_train_val_input_data``.
        labels: Per-example labels, aligned with ``input_data``.
        title_vectorize_layer: Adapted layer applied to the titles.
        abstract_vectorize_layer: Adapted layer applied to the abstracts.
        batch_size: Number of examples per batch.

    Returns:
        A dataset whose elements are ``((title_ids, abstract_ids), labels)``
        batches.
    """
    def vectorize(texts, targets):
        # ``texts`` is the (titles, abstracts) pair for one batch.
        return (title_vectorize_layer(texts[0]), abstract_vectorize_layer(texts[1])), targets

    dataset = tf.data.Dataset.from_tensor_slices((input_data, labels))
    # Batch before mapping so each vectorizer call processes a whole batch of
    # strings instead of being invoked once per example.
    dataset = dataset.batch(batch_size).map(vectorize, num_parallel_calls=tf.data.AUTOTUNE)
    # Overlap input preparation with model execution.
    return dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# One vectorizer per text field. The vocabulary size and sequence lengths here
# are the same values get_model() uses for its embedding and input shapes.
title_vectorize_layer = get_vectorize_layer(vocab_size=50000, sequence_length=100)
abstract_vectorize_layer = get_vectorize_layer(vocab_size=50000, sequence_length=200)

In [None]:
# Fit the title vocabulary on every title in the training frame
# (this runs before the train/validation split).
title_vectorize_layer.adapt(df['TITLE'].tolist())

In [None]:
# Fit the abstract vocabulary on every abstract in the training frame
# (this runs before the train/validation split).
abstract_vectorize_layer.adapt(df['ABSTRACT'].tolist())

In [None]:
# Sanity check: display the token-id vector the title vectorizer produces for the title at index label 400.
title_vectorize_layer(df['TITLE'][400])

In [None]:
# Split the frame into training/validation text lists and label lists.
input_data = get_train_val_input_data(df=df, label_cols=label_tags)

In [None]:
# Training pipeline: vectorized (title, abstract) pairs with labels, 64 per batch.
train_data = get_dataset(
    input_data['train_data'],
    input_data['train_label'],
    title_vectorize_layer,
    abstract_vectorize_layer,
    batch_size=64,
)

In [None]:
# Validation pipeline: same construction as the training one, 64 per batch.
valid_data = get_dataset(
    input_data['valid_data'],
    input_data['valid_label'],
    title_vectorize_layer,
    abstract_vectorize_layer,
    batch_size=64,
)

In [None]:
def get_model(vocab_size=50000, embedding_dim=64, title_length=100, abstract_length=200,
              lstm_units=6, num_labels=6, mask_zero=False):
    """Build the two-branch (title / abstract) LSTM multi-label classifier.

    Each input is a sequence of token ids that is embedded and reduced to the
    final output of an LSTM. The two branch outputs are added element-wise and
    fed to a sigmoid-activated dense layer with one unit per label.

    Args:
        vocab_size: ``input_dim`` of both embedding layers. It should be at
            least the ``max_tokens`` used for the vectorizers.
        embedding_dim: Output width of both embedding layers.
        title_length: Length of the title id sequence; should equal the title
            vectorizer's ``output_sequence_length``.
        abstract_length: Length of the abstract id sequence; should equal the
            abstract vectorizer's ``output_sequence_length``.
        lstm_units: Units in each LSTM. Both branches must share this width
            because their outputs are summed.
        num_labels: Number of sigmoid outputs.
        mask_zero: Forwarded to both ``Embedding`` layers. The default
            ``False`` preserves the original behavior, in which id 0 is
            embedded and processed like any other token.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``(title_ids, abstract_ids)``
        and producing ``num_labels`` sigmoid scores per example.
    """
    title_input = tf.keras.layers.Input(shape=(title_length,), name='title_input')
    abstract_input = tf.keras.layers.Input(shape=(abstract_length,), name='abstract_input')

    title_embs = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, mask_zero=mask_zero,
        name='title_embedding_layer')(title_input)
    abstract_embs = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, mask_zero=mask_zero,
        name='abstract_embedding_layer')(abstract_input)

    # Only the final output of each LSTM is kept (no sequences, no states).
    title_lstm = tf.keras.layers.LSTM(lstm_units, return_sequences=False, return_state=False)
    abstract_lstm = tf.keras.layers.LSTM(lstm_units, return_sequences=False, return_state=False)
    title_encoding = title_lstm(title_embs)
    abstract_encoding = abstract_lstm(abstract_embs)

    merged = tf.keras.layers.Add()([title_encoding, abstract_encoding])
    outs = tf.keras.layers.Dense(num_labels, activation='sigmoid')(merged)

    return tf.keras.Model((title_input, abstract_input), outs)
    
def predict(model, title, abstract):
    """Vectorize raw title/abstract text and return the model's predictions.

    Relies on the module-level ``title_vectorize_layer`` and
    ``abstract_vectorize_layer`` having been adapted already.

    Args:
        model: Model whose ``predict`` accepts a ``(title_ids, abstract_ids)``
            tuple.
        title: A single title string or a batch of title strings.
        abstract: A single abstract string or a batch of abstract strings,
            matching ``title``.

    Returns:
        The value returned by ``model.predict`` for the vectorized inputs.
    """
    title_ids = title_vectorize_layer(title)
    abstract_ids = abstract_vectorize_layer(abstract)

    # A lone string vectorizes to a rank-1 tensor (sequence axis only). Add
    # the batch axis so the model receives one sample instead of treating
    # each token position as a separate sample.
    if len(title_ids.shape) == 1:
        title_ids = tf.expand_dims(title_ids, 0)
    if len(abstract_ids.shape) == 1:
        abstract_ids = tf.expand_dims(abstract_ids, 0)

    return model.predict((title_ids, abstract_ids))


In [None]:
# Instantiate the two-input classifier with get_model()'s default settings.
model = get_model()

In [None]:
# The targets are multi-label (several sigmoid outputs per example). Keras
# chooses the concrete metric behind the 'accuracy' string from the tensor
# shapes, so request per-label thresholded accuracy explicitly. The metric is
# still named 'accuracy', so the history keys 'accuracy' / 'val_accuracy'
# read by the plotting cell are unchanged.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Render a diagram of the model graph as the cell output.
tf.keras.utils.plot_model(model)

In [None]:
# Train for 10 epochs, evaluating on the validation pipeline after each one;
# the returned History object feeds the plots below.
history = model.fit(
    train_data,
    epochs=10,
    validation_data=valid_data,
)

In [None]:
import matplotlib.pyplot as plt
# Training vs. validation loss per epoch.
# The x-axis is derived from the number of recorded values rather than the
# requested epoch count, so the plot still works if training ended early.
loss_epochs = list(range(len(history.history['loss'])))
plt.plot(loss_epochs, history.history['loss'], label='train_loss')
plt.plot(loss_epochs, history.history['val_loss'], label='val_loss')
plt.xlabel("epochs")
plt.ylabel("loss")
plt.legend()


In [None]:
# Training vs. validation accuracy per epoch.
# The x-axis is derived from the number of recorded values rather than the
# requested epoch count, so the plot still works if training ended early.
acc_epochs = list(range(len(history.history['accuracy'])))
plt.plot(acc_epochs, history.history['accuracy'], label='train_acc')
plt.plot(acc_epochs, history.history['val_accuracy'], label='val_acc')
# Label the axes so the figure matches the loss plot and stands on its own.
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.legend()
plt.show()

**TEST DATASET PREDICTION**

In [None]:
# Load the test CSV; its TITLE and ABSTRACT columns are passed to predict() below.
test_df = pd.read_csv('/kaggle/input/multilabel-classification-dataset/test.csv')

In [None]:
# Score every test row; the result is turned into one column per label in the next cell.
test_preds = predict(model, test_df['TITLE'], test_df['ABSTRACT'])

In [None]:
# Threshold the predicted scores at 0.5 to get a 0/1 decision per label, using
# one vectorized comparison instead of a per-cell Python lambda on each column.
results = pd.DataFrame(
    (np.asarray(test_preds) >= 0.5).astype('int64'),
    columns=label_tags,
)

In [None]:
# Attach the source text next to the predicted labels. Column assignment
# aligns on the index; both frames were created with the default index.
results['TITLE'] = test_df['TITLE']
results['ABSTRACT'] = test_df['ABSTRACT']

In [None]:
# Display the assembled predictions table as the cell output.
results