In [None]:
import torch
from torch import nn
import pandas as pd
import nltk
import re
from torchinfo import summary
import gensim
import numpy as np
import torchtext
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import time
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import SpectralEmbedding

# nltk.download('punkt')

## Load Data

In [None]:
# Load the abstracts CSV (comma separated) and take a quick look at it.
data_abstracts = pd.read_csv('data/arxiv_data_210930-054931.csv', sep=',')
print(data_abstracts.shape)  # (number of rows, number of columns)
print(data_abstracts.columns)
data_abstracts.head(3)  # last expression of the cell: rendered as its output

In [None]:
# Load a second CSV export and take a quick look at it.
# NOTE(review): `data_summaries` is not referenced again in the visible part of
# this notebook — confirm it is still needed.
data_summaries = pd.read_csv('data/arxiv_data.csv', sep=',')
print(data_summaries.shape)  # (number of rows, number of columns)
print(data_summaries.columns)
data_summaries.head(3)  # last expression of the cell: rendered as its output

## Prepare data

In [None]:
# Example patterns
# patterns = re.compile(r'([^\w\s]\s+)|(this badge holder)|(this badge earner)|(this earner)|(the badge holder)|(the badge earner)', flags=re.IGNORECASE)
# Matches one punctuation character (neither word character nor whitespace)
# together with the whitespace that follows it.
patterns = re.compile(r'([^\w\s]\s+)', flags=re.IGNORECASE)


def clean_abstract(text):
    """
    Normalize a raw abstract into a lower-cased, space-separated token string.

    Steps: punctuation followed by whitespace is replaced by a single space,
    runs of spaces are collapsed, the text is tokenized with
    ``nltk.word_tokenize`` and tokens are lower-cased. Tokens shorter than two
    characters and numeric tokens longer than four characters are dropped.

    :param text: raw text to clean
    :return: cleaned text with tokens separated by exactly one space
    """
    text = patterns.sub(r' ', text)
    text = re.sub(r' +', ' ', text).strip()
    tk_text = nltk.word_tokenize(text)
    # Filter rejected tokens out instead of replacing them by '' so the result
    # has no leading/trailing or doubled spaces.
    kept = [
        token.lower()
        for token in tk_text
        if len(token) >= 2 and not (token.isnumeric() and len(token) > 4)
    ]
    return ' '.join(kept)

def yield_tokens(data_iter):
    """
    Lazily tokenize every text of an iterable.

    :param data_iter: iterable of strings
    :return: generator producing, for each text, its ``nltk.word_tokenize`` token list
    """
    yield from (nltk.word_tokenize(document) for document in data_iter)

def generate_vocabulary(data):
    """
    Build a torchtext vocabulary from an iterable of texts.

    The special token "<unk>" is registered and used as the default index, so
    looking up an unknown token yields the index of "<unk>".

    :param data: iterable of strings, tokenized through ``yield_tokens``
    :return: the vocabulary object
    """
    unknown_token = "<unk>"
    token_stream = yield_tokens(data)
    vocabulary = torchtext.vocab.build_vocab_from_iterator(token_stream, specials=[unknown_token])
    vocabulary.set_default_index(vocabulary[unknown_token])
    return vocabulary


def generate_embedding_matrix(vocabulary: torchtext.vocab.Vocab, embed_model):
    """
    Build an embedding matrix aligned with the vocabulary indices.

    :param vocabulary: vocabulary providing ``get_stoi()`` (token -> index) and ``len()``
    :param embed_model: pre-trained embedding lookup exposing ``vector_size`` and
        ``embed_model[word]``
    :return: array of shape ``(len(vocabulary), embed_model.vector_size)``; rows
        of tokens missing from ``embed_model`` are left as zeros
    """
    embedding_matrix = np.zeros((len(vocabulary), embed_model.vector_size))
    for word, index in vocabulary.get_stoi().items():
        try:
            embedding_matrix[index] = embed_model[word]
        except KeyError:
            # Token has no pre-trained vector: keep its all-zero row. Only the
            # missing-key case is ignored so that real errors (e.g. a shape
            # mismatch) are not silently swallowed.
            continue
    return embedding_matrix

In [None]:
# Clean every abstract and keep the result in a new column.
data_abstracts['abs_cleaned'] = data_abstracts['abstracts'].apply(clean_abstract)
display(data_abstracts.head(3))
# Load the GloVe vectors from a plain-text file that has no word2vec header line.
glove_model = gensim.models.KeyedVectors.load_word2vec_format('data/glove.6B.50d.txt', binary=False, no_header=True)
# NOTE(review): `weights` is not referenced again in the visible part of this notebook.
weights = torch.FloatTensor(glove_model.vectors)
# Vocabulary built from the cleaned abstracts.
vocab = generate_vocabulary(data_abstracts['abs_cleaned'].values)
# One row per vocabulary index, filled from GloVe where the word is available.
embed_matrix = generate_embedding_matrix(vocab, glove_model)

In [None]:
# Length of the longest cleaned abstract, counted in whitespace-separated words.
# NOTE(review): texts are converted to indices with nltk.word_tokenize, which may
# split differently from str.split — confirm both token counts agree.
max_words_text = data_abstracts['abs_cleaned'].apply(lambda x: len(x.split())).max()

In [None]:
def binary_codes(target):
    """
    Binarize each row of a 2-D array around that row's median.

    :param target: 2-D array of values to convert to binary codes
    :return: float array of the same shape holding 1 where the value is
        strictly greater than the median of its row and 0 elsewhere
    """
    n_rows = target.shape[0]
    # Column vector of per-row medians so the comparison broadcasts row-wise.
    row_median = np.median(target, axis=1).reshape(n_rows, 1)
    above_median = target > row_median
    return above_median.astype(np.float64)

def low_dimensional_vector(term_freq_mat, embedding_matrix=None, lwdv='LSA'):
    """
    Generate a low dimensional representation of a document-term matrix.

    :param term_freq_mat: document-term matrix, one row per document
    :param embedding_matrix: embedding matrix with one row per term; required
        only when ``lwdv='AE'``
    :param lwdv: type of low dimensional vector:
        'LE' (Laplacian Eigenmaps, memory expensive),
        'AE' (average embedding, memory expensive) or
        'LSA' (truncated SVD, default)
    :return: low dimensional vectors, one row per document
    :raises ValueError: if ``lwdv`` is not one of the supported values, or if
        ``lwdv='AE'`` is requested without ``embedding_matrix``
    """
    if lwdv == 'LE':  # Laplacian Eigenmaps (LE) # MEMORY EXPENSIVE
        le = SpectralEmbedding(n_components=300)  # explore parameters
        return le.fit_transform(term_freq_mat)
    if lwdv == 'AE':  # Average embedding (AE) # MEMORY EXPENSIVE
        if embedding_matrix is None:
            raise ValueError("embedding_matrix is required when lwdv='AE'")
        # The +1 keeps the denominator non-zero for all-zero rows.
        denom = 1 + np.sum(term_freq_mat, axis=1)[:, None]
        normed_tfidf = term_freq_mat / denom
        return np.dot(normed_tfidf, embedding_matrix)
    if lwdv == 'LSA':  # LSA
        lsa = TruncatedSVD(n_components=300, algorithm="arpack")
        return lsa.fit_transform(term_freq_mat)
    # Previously an unknown value fell through and silently returned None.
    raise ValueError("Unknown lwdv {!r}; expected 'LE', 'AE' or 'LSA'".format(lwdv))

In [None]:
# Text -> list of vocabulary indices (tokenize with nltk, then look up in `vocab`).
text_pipeline = lambda x: vocab(nltk.tokenize.word_tokenize(x))
# TF-IDF followed by a 300-component truncated SVD (LSA).
# NOTE(review): both are fitted on ALL cleaned abstracts, i.e. including rows
# that are later assigned to the validation/test splits — confirm this is intended.
tfidf = TfidfVectorizer()
tfidf.fit(data_abstracts["abs_cleaned"].values)
lsa = TruncatedSVD(n_components=300, algorithm="arpack")
lsa.fit(tfidf.transform(data_abstracts["abs_cleaned"].values))

In [None]:
# If CUDA available, we can send to the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch_stc(batch: list):
    """
    Collate a batch of cleaned texts into (binary target codes, token indices).

    Relies on the module-level ``text_pipeline``, ``max_words_text``, ``tfidf``,
    ``lsa`` and ``device``.

    :param batch: list of cleaned text strings
    :return: tuple ``(B, text_list)`` moved to ``device``, where ``B`` holds the
        float32 binary codes of the LSA projection of each text and
        ``text_list`` holds the int64 token indices, one row per text
    """
    text_list = []
    for _text in batch:
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        # Right-pad with zeros up to max_words_text: pad=(padding_left, padding_right).
        processed_text = nn.functional.pad(processed_text, pad=(0, max_words_text-processed_text.shape[0]))
        text_list.append(processed_text)
    Y = lsa.transform(tfidf.transform(batch)) # Low dimensionality reduction (LSA)
    B = torch.tensor(binary_codes(Y), dtype=torch.float32)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return  B.to(device), text_list.to(device)

In [None]:
# First split: 70% of the row indices for training, 30% held out.
x_train_idx, x_test_idx = train_test_split(data_abstracts['abs_cleaned'].index,test_size=0.3, shuffle=True, random_state=42)
# Second split of the held-out part: 66% becomes the test set, the rest validation.
x_val_idx, x_test_idx = train_test_split(x_test_idx, test_size=0.66, shuffle=True, random_state=42)
BATCH_SIZE = 25
# Each loader iterates over the raw cleaned strings; collate_batch_stc turns a
# batch of strings into tensors.
train_dataloader_stc = DataLoader(data_abstracts.loc[x_train_idx]["abs_cleaned"].values, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch_stc, num_workers=0)
test_dataloader_stc = DataLoader(data_abstracts.loc[x_test_idx]["abs_cleaned"].values, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch_stc, num_workers=0)
validate_dataloader_stc = DataLoader(data_abstracts.loc[x_val_idx]["abs_cleaned"].values, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch_stc, num_workers=0)

In [None]:
# Persist the three DataLoader objects to disk.
# NOTE(review): presumably loading these files back requires collate_batch_stc
# (and the globals it reads) to be defined in the loading session — confirm.
torch.save(train_dataloader_stc, "train_dataloader_stc.bin")
torch.save(test_dataloader_stc, "test_dataloader_stc.bin")
torch.save(validate_dataloader_stc, "validate_dataloader_stc.bin")

## Models

### Simple model

In [None]:
class LSTM(torch.nn.Module):
    """
    LSTM followed by a two-layer dense head producing one value per sequence.

    :param input_size: number of features of each time step
    :param hidden_size: size of the LSTM hidden state
    :param num_layers: number of stacked LSTM layers
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.num_layers = num_layers #number of layers
        self.input_size = input_size #input size
        self.hidden_size = hidden_size #hidden state
        self.dropout = torch.nn.Dropout(p= 0.05) # Dropout
        self.lstm = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True) #lstm

        self.fc_1 =  torch.nn.Linear(hidden_size, hidden_size//2) #fully connected 1
        self.fc = torch.nn.Linear(hidden_size//2, 1) #fully connected last layer
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """
        :param x: input of shape (batch, sequence, input_size) (``batch_first=True``)
        :return: tensor of shape (batch, 1); values are non-negative because a
            ReLU is applied after the last dense layer
        """
        batch_size = x.size(0)
        # Zero initial states created with the dtype and device of the input, so
        # the module also works on CUDA / float64 inputs.
        h_0 = x.new_zeros(self.num_layers, batch_size, self.hidden_size) #hidden state
        c_0 = x.new_zeros(self.num_layers, batch_size, self.hidden_size) #internal state
        # Propagate input through LSTM
        _, (hn, _) = self.lstm(x, (h_0, c_0))
        # hn is (num_layers, batch, hidden_size): keep the last layer's final state.
        out = self.dropout(hn[-1]) # Dropout
        out = self.fc_1(out) # Dense
        out = self.relu(out) # Activation
        out = self.fc(out) # Dense
        out = self.relu(out) # Activation
        return out

In [None]:
class SimpleModelCNN(torch.nn.Module):
    """
    Conv1d -> tanh -> max pooling -> dropout -> linear -> sigmoid.

    The linear layer is fixed to 20 input features, so the pooled length of the
    input must be 20.

    :param in_channels: number of input channels of the convolution
    :param out_channels: number of output channels of the convolution
    :param kernel_size: kernel size used for both the convolution and the pooling
    """

    def __init__(self, in_channels, out_channels, kernel_size) -> None:
        super().__init__()
        # Define options for every element
        self.conv1d = torch.nn.Conv1d(in_channels, out_channels, kernel_size)
        self.maxpool = torch.nn.MaxPool1d(kernel_size=kernel_size)
        self.fc = torch.nn.Linear(in_features=20, out_features=20)
        self.sigmoid = torch.nn.Sigmoid()
        self.dropout = torch.nn.Dropout(0.5)
        self.tanh = torch.nn.Tanh()

    def forward(self, x):
        """
        :param x: input of shape (batch, in_channels, length)
        :return: tensor of shape (batch, out_channels, 20) with values in [0, 1]
        """
        x = self.conv1d(x)
        # Use the registered activation modules rather than the deprecated
        # torch.nn.functional.tanh / sigmoid helpers.
        x = self.tanh(x)
        x = self.maxpool(x)
        x = self.dropout(x)
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

In [None]:
# Instantiate with in_channels=100, out_channels=100, kernel_size=5 and print the layers.
summary(SimpleModelCNN(100, 100, 5)) # model summary

### CNN

In [None]:
class CNN_STC(nn.Module):
    """
    Embedding -> Conv1d -> tanh -> max pooling -> dropout -> linear -> sigmoid.

    :param vocab_size: number of rows of the embedding table
    :param embed_dim: embedding dimension (input channels of the convolution)
    :param filter_size: kernel size of the convolution and of the pooling
    :param num_filter: number of convolution filters (output channels)
    :param num_classes: number of outputs
    :param dropout: dropout probability
    :param seq_len: optional length of the padded input sequences. When given,
        the input size of the linear layer is derived from it; when omitted the
        historical hard-coded value of 3000 is used.
    :raises ValueError: if ``seq_len`` is too short to produce any pooled feature
    """

    def __init__(self, vocab_size, embed_dim, filter_size, num_filter, num_classes, dropout, seq_len=None) -> None:
        super(CNN_STC, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim) # check padding
        self.conv_1d = nn.Conv1d(in_channels=embed_dim, out_channels=num_filter, kernel_size=filter_size)
        self.pool_1d = nn.MaxPool1d(kernel_size=filter_size)
        self.dropout = nn.Dropout(dropout)

        if seq_len is None:
            in_lin = 3000 # historical value, based on the .view(size(0), -1)
        else:
            # Conv1d (stride 1, no padding) shortens the sequence by
            # filter_size - 1; MaxPool1d (stride = kernel) floor-divides it.
            conv_len = int(seq_len) - filter_size + 1
            pooled_len = conv_len // filter_size
            if pooled_len <= 0:
                raise ValueError("seq_len={} is too short for filter_size={}".format(seq_len, filter_size))
            in_lin = num_filter * pooled_len
        self.fc = nn.Linear(in_lin, num_classes)

    def forward(self, x):
        """
        :param x: integer token indices of shape (batch, sequence)
        :return: tensor of shape (batch, num_classes) with values in [0, 1]
        """
        x = self.embedding(x)
        x = x.permute(0, 2, 1) # (batch, embed_dim, sequence) as expected by Conv1d
        x = self.conv_1d(x)
        x = torch.tanh(x)
        x = self.pool_1d(x)
        x = self.dropout(x)
        x = x.view(x.size(0), -1) # flatten channels and positions
        x = self.fc(x)
        x = torch.sigmoid(x)
        return x

In [None]:
# Positional arguments: vocab_size=len(vocab), embed_dim=284, filter_size=4,
# num_filter=25, num_classes=300, dropout=0.5.
model_cnn_stc = CNN_STC(len(vocab), 284, 4, 25, 300, 0.5)
summary(model_cnn_stc)

In [None]:
LR = 0.01  # learning rate

# Module-level optimizer and loss, read as globals by train().
optimizer = torch.optim.Adam(model_cnn_stc.parameters(), lr=LR)
loss_function = torch.nn.BCELoss() # Binary cross entropy: predictions must lie in [0, 1] and have the same shape as the targets

def train(epoch, data_loader, model):
    """
    Train ``model`` for one pass over ``data_loader``.

    Uses the module-level ``optimizer`` and ``loss_function``. Every 50 batches
    the bit-wise accuracy since the previous report is printed: the fraction of
    output elements whose prediction, thresholded at 0.5, equals the target.

    :param epoch: epoch number, used only in the log line
    :param data_loader: iterable yielding ``(label, text)`` batches
    :param model: model called as ``model(text)``
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 50
    start_time = time.time()

    for idx, (label, text) in enumerate(data_loader):
        optimizer.zero_grad(set_to_none=True)
        predicted_label = model(text)
        # Align the target with the prediction without mutating the batch: an
        # in-place squeeze drops the batch dimension when the batch has one sample.
        target = label.reshape(predicted_label.shape)
        loss = loss_function(predicted_label, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        # Threshold the probabilities before comparing: raw sigmoid outputs are
        # practically never exactly equal to 0 or 1.
        predicted_bits = predicted_label.detach() >= 0.5
        total_acc += (predicted_bits == (target >= 0.5)).sum().item()
        total_count += target.numel()
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print("| epoch {:3d} | {:5d}/{:5d} batches | accuracy {:8.3f} | elapsed time {:2.2f}s".format(epoch, idx, len(data_loader),
                                                                                                        total_acc / total_count,
                                                                                                        elapsed))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(data_loader, model):
    """
    Compute the bit-wise accuracy of ``model`` over ``data_loader``.

    :param data_loader: iterable yielding ``(label, text)`` batches
    :param model: model called as ``model(text)``; it is switched to eval mode
    :return: fraction of output elements whose prediction, thresholded at 0.5,
        equals the target; 0.0 when the loader yields no data
    """
    model.eval()
    total_acc, total_count = 0, 0
    with torch.no_grad(): # no gradient tracking is needed for evaluation
        for label, text in data_loader:
            predicted_label = model(text)
            # Align the target with the prediction without mutating the batch.
            target = label.reshape(predicted_label.shape)
            # Threshold the probabilities before comparing: raw sigmoid outputs
            # are practically never exactly equal to 0 or 1.
            total_acc += ((predicted_label >= 0.5) == (target >= 0.5)).sum().item()
            total_count += target.numel()
    if total_count == 0:
        return 0.0
    return total_acc / total_count

def train_model(epochs, train_dataloader, validation_data_loader, model, total_accu=None):
    """
    Run the train/validate cycle for a number of epochs and print a summary per epoch.

    :param epochs: number of epochs to run (epochs are numbered from 1)
    :param train_dataloader: loader passed to ``train``
    :param validation_data_loader: loader passed to ``evaluate``
    :param model: model to train
    :param total_accu: not used by this function
    """
    rule = "-" * 59
    summary_line = "| end of epoch {:3d} | time: {:5.2f}s | valid accuracy {:8.3f} "
    for epoch in range(1, epochs + 1):
        started_at = time.time()
        train(epoch, train_dataloader, model)
        accu_val = evaluate(validation_data_loader, model)
        print(rule)
        print(summary_line.format(epoch, time.time() - started_at, accu_val))
        print(rule)

def predict(model, data_loader):
    """
    Run ``model`` over ``data_loader`` and return the arg-max output index per sample.

    :param model: model called as ``model(text)``; it is switched to eval mode
        so that dropout is disabled during inference
    :param data_loader: iterable yielding ``(label, text)`` batches
    :return: 1-D tensor with, for each sample, the index of its largest output
    """
    # NOTE(review): argmax returns a single index per sample; for multi-bit
    # binary-code outputs a 0.5 threshold may be what is wanted — confirm.
    model.eval()
    predicted_label = []
    with torch.no_grad(): # no gradient tracking is needed for inference
        for _, text in data_loader:
            predicted_label.append(model(text).argmax(1))
    return torch.cat(predicted_label)
    

In [None]:
# Train the CNN for a single epoch and report the validation accuracy.
train_model(1, train_dataloader_stc, validate_dataloader_stc, model_cnn_stc)

In [None]:
# Accuracy of the trained CNN on the held-out test loader.
accu_test = evaluate(test_dataloader_stc, model_cnn_stc)

### Paper model

In [None]:
class CNN_STC_V2(nn.Module):
    """
    Two-convolution variant with dynamic k-max pooling and folding.

    Pipeline: embedding -> conv -> tanh -> dynamic k-max pooling -> conv ->
    tanh -> folding -> max pooling -> dropout -> linear -> sigmoid.

    :param vocab_size: number of rows of the embedding table
    :param embed_dim: embedding dimension (input channels of the first convolution)
    :param filter_size: kernel size of both convolutions and of the max pooling;
        also the lower bound of k in the dynamic k-max pooling
    :param num_filter: output channels of the first convolution (the second
        convolution outputs ``num_filter // 2`` channels)
    :param num_classes: number of outputs
    :param dropout: dropout probability
    :param seq_len: optional length of the padded input sequences. When given,
        the input size of the linear layer is derived from it; when omitted the
        historical hard-coded value of 3000 is used.
    :raises ValueError: if ``seq_len`` is too short to produce any feature
    """

    def __init__(self, vocab_size, embed_dim, filter_size, num_filter, num_classes, dropout, seq_len=None) -> None:
        super(CNN_STC_V2, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim) # check padding
        self.conv_1d_1 = nn.Conv1d(in_channels=embed_dim, out_channels=num_filter, kernel_size=filter_size)
        self.conv_1d_2 = nn.Conv1d(in_channels=num_filter, out_channels=num_filter//2, kernel_size=filter_size)
        self.max_pool_1d = nn.MaxPool1d(kernel_size=filter_size)
        self.dropout = nn.Dropout(dropout)
        self.filter_size = filter_size
        self.L = 2 # Number of Conv defined
        if seq_len is None:
            in_features = 3000 # historical value, based on the .view(size(0), -1)
        else:
            in_features = self._flattened_size(int(seq_len), num_filter // 2)
            if in_features <= 0:
                raise ValueError("seq_len={} is too short for filter_size={}".format(seq_len, filter_size))
        self.fc = nn.Linear(in_features, num_classes)

    def _k_for_layer(self, l, length):
        """Number of values kept by the k-max pooling of layer ``l`` on an axis of ``length``."""
        # NOTE(review): the fraction is truncated with int(); a ceiling may be
        # intended — confirm against the reference model.
        k = int(max(self.filter_size, (self.L - l) / self.L * length))
        # topk cannot return more values than the axis holds.
        return min(length, k)

    def _flattened_size(self, seq_len, conv2_channels):
        """Size of the flattened features fed to ``fc`` for inputs of length ``seq_len``."""
        conv1_len = seq_len - self.filter_size + 1
        pooled1_len = self._k_for_layer(1, conv1_len)
        conv2_len = pooled1_len - self.filter_size + 1
        folded_channels = (conv2_channels + 1) // 2
        return folded_channels * (conv2_len // self.filter_size)

    def folding(self, x):
        """
        Sum every two consecutive feature rows (channels) of each sample.

        The batch dimension is preserved; with an odd number of channels the
        last channel is kept unchanged.

        :param x: tensor of shape (batch, channels, length)
        :return: tensor of shape (batch, ceil(channels / 2), length)
        """
        channels = x.size(1)
        paired = channels - (channels % 2)
        folded = x[:, 0:paired:2] + x[:, 1:paired:2]
        if channels % 2 != 0:
            folded = torch.cat((folded, x[:, -1:]), dim=1)
        return folded

    def dynamic_k_maxpooling(self, x, l, dim):
        """
        Keep the k largest values along ``dim``, preserving their original order.

        :param x: input tensor
        :param l: index of the current convolution layer (1-based)
        :param dim: dimension to pool over
        :return: tensor whose size along ``dim`` is k
        """
        kl = self._k_for_layer(l, x.size(dim))
        # Sort the selected indices so the kept values stay in sequence order.
        index = x.topk(kl, dim = dim)[1].sort(dim = dim)[0]
        return x.gather(dim, index)

    def forward(self, x):
        """
        :param x: integer token indices of shape (batch, sequence)
        :return: tensor of shape (batch, num_classes) with values in [0, 1]
        """
        x = self.embedding(x)
        x = x.permute(0, 2, 1) # (batch, embed_dim, sequence) as expected by Conv1d
        x = self.conv_1d_1(x)
        x = torch.tanh(x)
        x = self.dynamic_k_maxpooling(x, 1, 2)
        x = self.conv_1d_2(x)
        x = torch.tanh(x)
        x = self.folding(x)
        x = self.max_pool_1d(x)
        x = self.dropout(x)
        x = x.view(x.size(0), -1) # flatten channels and positions
        x = self.fc(x)
        x = torch.sigmoid(x)
        return x

In [None]:
import utils
# Cleaning patterns: one punctuation character (neither word character nor
# whitespace) together with the whitespace that follows it.
patterns = re.compile(r'([^\w\s]\s+)', flags=re.IGNORECASE)

def clean_abstract(text):
    """
    Normalize a raw abstract into a lower-cased, space-separated token string.

    Steps: punctuation followed by whitespace is replaced by a single space,
    runs of spaces are collapsed, the text is tokenized with
    ``nltk.word_tokenize`` and tokens are lower-cased. Tokens shorter than two
    characters and numeric tokens longer than four characters are dropped.

    :param text: raw text to clean
    :return: cleaned text with tokens separated by exactly one space
    """
    text = patterns.sub(r' ', text)
    text = re.sub(r' +', ' ', text).strip()
    tk_text = nltk.word_tokenize(text)
    # Filter rejected tokens out instead of replacing them by '' so the result
    # has no leading/trailing or doubled spaces.
    kept = [
        token.lower()
        for token in tk_text
        if len(token) >= 2 and not (token.isnumeric() and len(token) > 4)
    ]
    return ' '.join(kept)

# Read data
# NOTE(review): this re-reads the CSV and rebinds `data_abstracts`, the split
# indices and the three DataLoaders that earlier cells already created.
data_abstracts = pd.read_csv('data/arxiv_data_210930-054931.csv', sep=',')
# Clean data
data_abstracts['abs_cleaned'] = data_abstracts['abstracts'].apply(clean_abstract)
# Split data into training, validating, testing
# 70% train; the remaining 30% is split again: 66% of it test, the rest validation.
x_train_idx, x_test_idx = train_test_split(data_abstracts['abs_cleaned'].index,test_size=0.3, shuffle=True, random_state=42)
x_val_idx, x_test_idx = train_test_split(x_test_idx, test_size=0.66, shuffle=True, random_state=42)

# Load Glove Model if needed
# glove_model = gensim.models.KeyedVectors.load_word2vec_format('data/glove.6B.50d.txt', binary=False, no_header=True)
# weights = torch.FloatTensor(glove_model.vectors)
# embed_matrix = generate_embedding_matrix(vocab, glove_model)

# The assignments below set attributes on the imported `utils` module.
# NOTE(review): presumably utils.collate_batch_stc reads these module attributes
# — confirm against utils.py.
# Generate vocabulary
utils.vocab = utils.generate_vocabulary(data_abstracts['abs_cleaned'].values)
# Vocab + tokenization function
utils.text_pipeline = lambda x: utils.vocab(nltk.tokenize.word_tokenize(x))
# Get maximum words in a sentence, used for padding
utils.max_words_text = data_abstracts['abs_cleaned'].apply(lambda x: len(x.split())).max()
# Fit TF-IDF and LSA
# NOTE(review): fitted on all cleaned abstracts, including validation/test rows.
utils.tfidf.fit(data_abstracts["abs_cleaned"].values)
utils.lsa.fit(utils.tfidf.transform(data_abstracts["abs_cleaned"].values))
# Create DataLoaders
BATCH_SIZE = 25
train_dataloader_stc = DataLoader(data_abstracts.loc[x_train_idx]["abs_cleaned"].values, batch_size=BATCH_SIZE, shuffle=True, 
                                collate_fn=utils.collate_batch_stc)
test_dataloader_stc = DataLoader(data_abstracts.loc[x_test_idx]["abs_cleaned"].values, batch_size=BATCH_SIZE, shuffle=True, 
                                collate_fn=utils.collate_batch_stc)
validate_dataloader_stc = DataLoader(data_abstracts.loc[x_val_idx]["abs_cleaned"].values, batch_size=BATCH_SIZE, shuffle=True, 
                                        collate_fn=utils.collate_batch_stc)

In [None]:
import utils
from models import CNN_STC
import train as tr
# NOTE(review): utils.max_words_text is passed in the second positional slot; in
# the CNN_STC class defined in this notebook that slot is embed_dim — confirm
# against the signature of models.CNN_STC.
model_cnn_stc = CNN_STC(len(utils.vocab), utils.max_words_text, 4, 25, 300, 0.5)
print(summary(model_cnn_stc))
# Define Variables
# The training settings are set as attributes of the imported `tr` module.
tr.LR = 0.01 # By default, customisable
tr.optimizer = torch.optim.Adam(model_cnn_stc.parameters(), lr=tr.LR)
tr.loss_function = torch.nn.BCELoss() # Binary cross entropy: predictions must lie in [0, 1] and have the same shape as the targets

In [None]:
# Train for one epoch using the training loop of the `train` module.
tr.train_model(epochs=1, train_dataloader=train_dataloader_stc, validation_data_loader=validate_dataloader_stc, model=model_cnn_stc)

In [None]:
# Positional arguments: vocab_size, embed_dim, filter_size=4, num_filter=25,
# num_classes=300, dropout=0.5.
# NOTE(review): utils.max_words_text is passed as embed_dim, and the linear layer
# of CNN_STC_V2 is hard-coded to 3000 inputs — confirm both match the data.
model = CNN_STC_V2(len(utils.vocab), utils.max_words_text, 4, 25, 300, 0.5)

In [None]:
import time

# ------------------------------------- #
# Define training parameters
# ------------------------------------- #
LR = 0.01  # learning rate
# Use LR instead of repeating the literal so changing it above takes effect.
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# Binary cross entropy: predictions must lie in [0, 1] and have the same shape as the targets.
loss_function = torch.nn.BCELoss()

def train(epoch, data_loader, model):
    """
    Train ``model`` for one pass over ``data_loader``.

    Uses the module-level ``optimizer`` and ``loss_function``. Every 50 batches
    the bit-wise accuracy since the previous report is printed: the fraction of
    output elements whose prediction, thresholded at 0.5, equals the target.

    :param epoch: epoch number, used only in the log line
    :param data_loader: iterable yielding ``(label, text)`` batches
    :param model: model called as ``model(text)``
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 50
    start_time = time.time()

    for idx, (label, text) in enumerate(data_loader):
        optimizer.zero_grad(set_to_none=True)
        predicted_label = model(text)
        # Align the target with the prediction without mutating the batch: an
        # in-place squeeze drops the batch dimension when the batch has one sample.
        target = label.reshape(predicted_label.shape)
        loss = loss_function(predicted_label, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        # Threshold the probabilities before comparing: raw sigmoid outputs are
        # practically never exactly equal to 0 or 1.
        predicted_bits = predicted_label.detach() >= 0.5
        total_acc += (predicted_bits == (target >= 0.5)).sum().item()
        total_count += target.numel()
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print("| epoch {:3d} | {:5d}/{:5d} batches | accuracy {:8.3f} | elapsed time {:2.2f}s".format(epoch, idx, len(data_loader),
                                                                                                        total_acc / total_count,
                                                                                                        elapsed))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(data_loader, model):
    """
    Compute the bit-wise accuracy of ``model`` over ``data_loader``.

    :param data_loader: iterable yielding ``(label, text)`` batches
    :param model: model called as ``model(text)``; it is switched to eval mode
    :return: fraction of output elements whose prediction, thresholded at 0.5,
        equals the target; 0.0 when the loader yields no data
    """
    model.eval()
    total_acc, total_count = 0, 0
    with torch.no_grad(): # no gradient tracking is needed for evaluation
        for label, text in data_loader:
            predicted_label = model(text)
            # Align the target with the prediction without mutating the batch.
            target = label.reshape(predicted_label.shape)
            # Threshold the probabilities before comparing: raw sigmoid outputs
            # are practically never exactly equal to 0 or 1.
            total_acc += ((predicted_label >= 0.5) == (target >= 0.5)).sum().item()
            total_count += target.numel()
    if total_count == 0:
        return 0.0
    return total_acc / total_count

def train_model(epochs, train_dataloader, validation_data_loader, model, total_accu=None):
    """
    Run the train/validate cycle for a number of epochs and print a summary per epoch.

    :param epochs: number of epochs to run (epochs are numbered from 1)
    :param train_dataloader: loader passed to ``train``
    :param validation_data_loader: loader passed to ``evaluate``
    :param model: model to train
    :param total_accu: not used by this function
    """
    rule = "-" * 59
    summary_line = "| end of epoch {:3d} | time: {:5.2f}s | valid accuracy {:8.3f} "
    for epoch in range(1, epochs + 1):
        started_at = time.time()
        train(epoch, train_dataloader, model)
        accu_val = evaluate(validation_data_loader, model)
        print(rule)
        print(summary_line.format(epoch, time.time() - started_at, accu_val))
        print(rule)

def predict(model, data_loader):
    """
    Run ``model`` over ``data_loader`` and return the arg-max output index per sample.

    :param model: model called as ``model(text)``; it is switched to eval mode
        so that dropout is disabled during inference
    :param data_loader: iterable yielding ``(label, text)`` batches
    :return: 1-D tensor with, for each sample, the index of its largest output
    """
    # NOTE(review): argmax returns a single index per sample; for multi-bit
    # binary-code outputs a 0.5 threshold may be what is wanted — confirm.
    model.eval()
    predicted_label = []
    with torch.no_grad(): # no gradient tracking is needed for inference
        for _, text in data_loader:
            predicted_label.append(model(text).argmax(1))
    return torch.cat(predicted_label)


In [None]:
# Train the second model for one epoch and report the validation accuracy.
# train_model runs exactly the train/evaluate/print cycle that was previously
# copied inline here (train() itself switches the model to training mode).
train_model(epochs=1, train_dataloader=train_dataloader_stc, validation_data_loader=validate_dataloader_stc, model=model)

In [None]:
# Scratch check of the dynamic pooling size on a random (4, 5, 10) tensor.
x = torch.rand(4, 5, 10)
s = x.size()[2]  # length of the last dimension
k_ll = ((2 - 1) / 2) * s  # half of the length
k_l = round(max(4, np.ceil(k_ll)))  # at least 4, otherwise the rounded-up half length
out = torch.adaptive_max_pool1d(x, k_l)  # pool the last dimension down to k_l outputs
out

In [None]:
import torch
# Scratch check of k-max pooling (k=4) along dim 2 of a random (4, 5, 10) tensor.
x = torch.rand(4, 5, 10)
# Indices of the 4 largest values, sorted so the kept values stay in their original order.
index = x.topk(4, dim = 2)[1].sort(dim = 2)[0]
y = x.gather(2, index)
y