In [1]:
import ast
import json
import string
import re
import pandas as pd
import pke
import nltk
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from pandas import json_normalize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForTokenClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer


In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Set file names for data
file_abstract = '..\\data\\benchmark_data\\NUS.json'
file_summaries = '..\\data\\benchmark_data\\summarization_experiment\\NUS_summarized.csv'

# Read data
json_data = []
for line in open(file_abstract, 'r', encoding="utf8"):
    json_data.append(json.loads(line))
data_abstract = json_normalize(json_data)
data_summaries = pd.read_csv(file_summaries, encoding="utf8")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Simrath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Simrath\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Combine title and abstract, preprocess text
def preprocess_text(text):
    # Apply contractions
    def get_contractions():
        contraction_dict = {"ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because"}
        contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
        return contraction_dict, contraction_re

    def replace_contractions(text):
        contractions, contractions_re = get_contractions()

        def replace(match):
            return contractions[match.group(0)]

        return contractions_re.sub(replace, text)

    # Substitute contractions with full words
    text = replace_contractions(text)

    # Remove brackets and their contents
    text = re.sub(r'\[.*?\]', '', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    return text

data_abstract['abstract'] = data_abstract['abstract'].apply(preprocess_text)
data_summaries['abstract'] = data_summaries['abstract'].apply(preprocess_text)


In [5]:
# Extract keyphrases
def extract_keyphrases(data):
    extractor = pke.unsupervised.MultipartiteRank()
    pos = {'NOUN', 'PROPN', 'ADJ'}
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += stopwords.words('english')
    
    keyphrases = []
    for abstract in data['abstract']:
        extractor.load_document(input=abstract, normalization="stemming")
        extractor.candidate_selection(pos=pos)
        extractor.candidate_weighting(alpha=1.1, threshold=0.74, method='average')
        pred_kps = extractor.get_n_best(n=10)
        keyphrases.append([kp[0].split() for kp in pred_kps])
    
    return keyphrases

pred_keyphrases_abstract = extract_keyphrases(data_abstract)
pred_keyphrases_summaries = extract_keyphrases(data_summaries)

# Combine abstract and summaries
data_summaries['abstract'] = data_abstract['abstract'] + ' ' + data_summaries['abstract']


In [56]:
import torch
from transformers import BertForTokenClassification, BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.model_selection import train_test_split
from transformers import AdamW
from tqdm import trange

# Fine-tune BERT for keyword extraction
class BertSequenceTagger(torch.nn.Module):
    def __init__(self, num_labels, pretrained_model="bert-base-uncased"):
        super(BertSequenceTagger, self).__init__()
        self.num_labels = num_labels
        self.bert = BertForTokenClassification.from_pretrained(pretrained_model, num_labels=num_labels)
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, labels=labels)
        return outputs

# Tokenize and encode sequences
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

# Encode data
def encode_data(sentences, labels, max_len):
    input_ids = []
    attention_masks = []
    flat_labels = []

    for sentence, label_list in zip(sentences, labels):
        encoded_dict = tokenizer.encode_plus(
                            sentence,                      # Sentence to encode
                            add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
                            max_length=max_len,           # Pad & truncate all sentences.
                            padding='max_length',
                            truncation=True,              # Explicitly truncate to max length
                            return_attention_mask=True,   # Construct attn. masks.
                            return_tensors='pt',          # Return pytorch tensors.
                       )
        
        # Add the encoded sentence to the list
        input_ids.append(encoded_dict['input_ids'])
        
        # And its attention mask (simply differentiates padding from non-padding)
        attention_masks.append(encoded_dict['attention_mask'])
        
        # Encode labels for each tokenized word
        encoded_labels = []
        for sublist in label_list:
            encoded_labels.extend([label_map[label] for label in sublist])
        flat_labels.append(encoded_labels)

    # Convert the lists into tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels_tensor = torch.tensor(flat_labels)

    return input_ids, attention_masks, labels_tensor

# Define label mapping
flat_train_labels = [tuple(label) if isinstance(label, list) else label for sublist in train_labels for label in sublist]
unique_labels = set(flat_train_labels)
label_map = {label: i for i, label in enumerate(unique_labels)}

# Split the data into train and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(data_abstract['abstract'], pred_keyphrases_abstract, 
                                                                            random_state=42, test_size=0.1)

# Tokenize all of the sentences and map the tokens to their word IDs
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize and encode sequences
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(train_sentences, train_labels)]

# Split sentences and labels
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

# Check for missing labels and add them to label_map
unique_labels_in_data = set(label for sublist in labels for label in sublist if isinstance(sublist, list))
missing_labels = unique_labels_in_data - set(label_map.keys())

# Convert lists to tuples to make them hashable
labels = [tuple(sublist) if isinstance(sublist, list) else sublist for sublist in labels]

for label in missing_labels:
    label_map[label] = len(label_map)

# Encode data
input_ids, attention_masks, labels = encode_data(tokenized_texts, labels, MAX_LEN)

# Verify the shape of labels tensor
print("Labels shape:", labels.shape)





# Create the DataLoader for training set
train_dataset = TensorDataset(input_ids, attention_masks, labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=bs)

# Load BERT model for token classification
model = BertSequenceTagger(num_labels=len(label_map))

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and scheduler
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)

# Train the model
epochs = 3

for _ in trange(epochs, desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))


TypeError: unhashable type: 'list'

In [44]:
from tqdm import trange


    

for _ in trange(epochs, desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

        
        
        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))


Epoch:   0%|                                                                                     | 0/3 [00:14<?, ?it/s]


RuntimeError: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous

In [None]:
# Validation
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
predictions , true_labels = [], []

for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
    logits = outputs.logits

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.append(label_ids)

# Flatten the predictions and the true values for evaluation
pred_tags = [tag for pred in predictions for tag in pred]
valid_tags = [tag for valid in true_labels for tag in valid]

# Classification report
print(classification_report(valid_tags, pred_tags))

# Evaluate
gold_keyphrases = ast.literal_eval(data_summaries['keywords'].to_json(orient='values'))

# Convert predictions to keyword phrases
def convert_to_keyphrases(predictions, sentences):
    keyphrases = []
    for pred, sent in zip(predictions, sentences):
        keyphrase = [word for word, tag in zip(sent, pred) if tag == 1]
        keyphrases.append(keyphrase)
    return keyphrases


In [None]:
# Predict
val_sentences = val_sentences.values.tolist()
pred_keyphrases = convert_to_keyphrases(predictions, val_sentences)

# Evaluation
traditional_evaluation.evaluation(y_pred=pred_keyphrases, y_test=gold_keyphrases, x_test=data_summaries, x_filename='')