In [1]:
import ast  
import pke
import json
import string
import pandas as pd
import traditional_evaluation
import nltk
from nltk.corpus import stopwords
from pandas import json_normalize
from nltk.stem.snowball import SnowballStemmer as Stemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch
import numpy as np


In [2]:
# Preprocessing Functions
import re

def get_contractions():
    contraction_dict = {"ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
                        "could've": "could have", "couldn't": "could not", "didn't": "did not",
                        "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
                        "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is",
                        "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                        "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
                        "I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                        "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have",
                        "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
                        "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam",
                        "mayn't": "may not", "might've": "might have", "mightn't": "might not",
                        "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
                        "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                        "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
                        "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                        "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
                        "she'll've": "she will have", "she's": "she is", "should've": "should have",
                        "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                        "so's": "so as", "this's": "this is", "that'd": "that would",
                        "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                        "there'd've": "there would have", "there's": "there is", "here's": "here is",
                        "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
                        "they'll've": "they will have", "they're": "they are", "they've": "they have",
                        "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
                        "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                        "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
                        "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
                        "when've": "when have", "where'd": "where did", "where's": "where is",
                        "where've": "where have", "who'll": "who will", "who'll've": "who will have",
                        "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
                        "will've": "will have", "won't": "will not", "won't've": "will not have",
                        "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                        "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
                        "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would",
                        "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                        "you're": "you are", "you've": "you have", "nor": "not", "'s": "s", "s'": "s"}

    contractions_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contractions_re

def replace_contractions(text):
    contractions, contractions_re = get_contractions()

    def replace(match):
        return contractions[match.group(0)]

    return contractions_re.sub(replace, text)

    
newLine_tabs = '\t' + '\n'
newLine_tabs_table = str.maketrans(newLine_tabs, ' ' * len(newLine_tabs))
punctuation = string.punctuation  # + '\t' + '\n'
#punctuation = punctuation.replace("'", '')  # do not delete '
table = str.maketrans(punctuation, ' '*len(punctuation))

def remove_punct_and_non_ascii(text):
    clean_text = text.translate(table)
    clean_text = clean_text.encode("ascii", "ignore").decode()  # remove non-ascii characters
    # remove all single letter except from 'a' and 'A'
    clean_text = re.sub(r"\b[b-zB-Z]\b", "", clean_text)
    return clean_text
def remove_brackets_and_contents(doc):
    """
    remove parenthesis, brackets and their contents
    :param doc: initial text document
    :return: text document without parenthesis, brackets and their contents
    """
    ret = ''
    skip1c = 0
    # skip2c = 0
    for i in doc:
        if i == '[':
            skip1c += 1
        # elif i == '(':
        # skip2c += 1
        elif i == ']' and skip1c > 0:
            skip1c -= 1
        # elif i == ')'and skip2c > 0:
        # skip2c -= 1
        elif skip1c == 0:  # and skip2c == 0:
            ret += i
    return ret

def remove_newline_tabs(text):
    return text.replace('\n', ' ').replace('\t', ' ')

def remove_references(doc):
    """
    remove references of publications (in document text)
    :param doc: initial text document
    :return: text document without references
    """
    # delete newline and tab characters
    clear_doc = doc.translate(newLine_tabs_table)

    # remove all references of type "Author, J. et al., 2014"
    clear_doc = re.sub(r'[A-Z][a-z]+,\s[A-Z][a-z]*\. et al.,\s\d{4}', "REFPUBL", clear_doc)

    # remove all references of type "Author et al. 1990"
    clear_doc = re.sub("[A-Z][a-z]+ et al. [0-9]{4}", "REFPUBL", clear_doc)

    # remove all references of type "Author et al."
    clear_doc = re.sub("[A-Z][a-z]+ et al.", "REFPUBL", clear_doc)

    return clear_doc

def preprocessing(text):
    text = replace_contractions(text)
    text = remove_punct_and_non_ascii(text)
    text = remove_brackets_and_contents(text)
    text = remove_newline_tabs(text)
    text = remove_references(text)
    return text


In [3]:
# NLP Components
def extract_keyphrases(data, tfidf_matrix, glove_model):
    gold_keyphrases = []  
    pred_keyphrases = []  
    for indx, abstract_document in enumerate(data['abstract']):
        abstract_document = preprocessing(abstract_document)
        
        gold_keyphrases.append([[Stemmer('porter').stem(keyword) for keyword in keyphrase.split()] for keyphrase in data['keyword'][indx].split(';')])
        
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=abstract_document, normalization="stemming")
        pos = {'NOUN', 'PROPN', 'ADJ'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        extractor.candidate_selection(pos=pos)
        
        # Compute similarity scores between candidate phrases and document using GloVe embeddings
        candidate_scores = []
        for phrase in extractor.candidates:
            candidate_embedding = np.mean([glove_model.get(word, np.zeros((100,))) for word in phrase[0].split()], axis=0)
            document_embedding = np.mean([glove_model.get(word, np.zeros((100,))) for word in abstract_document.split()], axis=0)
            similarity_score = np.dot(candidate_embedding, document_embedding) / (np.linalg.norm(candidate_embedding) * np.linalg.norm(document_embedding))
            candidate_scores.append(similarity_score)  # Keep similarity scores directly
            
        # Using similarity scores for candidate phrases weighting
        extractor.candidate_weighting(method='average')  # Set method only
        pred_kps = extractor.get_n_best(n=10)
        pred_keyphrases.append([kp[0].split() for kp in pred_kps])
    return pred_keyphrases, gold_keyphrases


In [4]:
def phrase_identification_tfidf(data):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(data['abstract'])
    return tfidf_matrix

def load_glove_model(glove_file):
    word_embeddings = {}
    with open(glove_file, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            word_embeddings[word] = coefs
    return word_embeddings

def load_bert_embeddings(data):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    embeddings = []
    for abstract in data['abstract']:
        input_ids = tokenizer.encode(abstract, add_special_tokens=True, max_length=512, truncation=True)
        input_ids = torch.tensor(input_ids).unsqueeze(0)
        with torch.no_grad():
            outputs = model(input_ids)
            embeddings.append(outputs[0][:, 0, :].numpy())  # Extracting the [CLS] token embedding
    return embeddings


In [5]:
# Set file names
file_abstract = '..\\data\\benchmark_data\\NUS.json' 
file_summaries = '..\\data\\benchmark_data\\summarization_experiment\\NUS_summarized.csv'

# Read data
json_data = []
for line in open(file_abstract, 'r', encoding="utf8"):
    json_data.append(json.loads(line))

data_abstract = json_normalize(json_data)
data_summaries = pd.read_csv(file_summaries, encoding="utf8")


In [6]:
# Combine title and abstract
import re
def combine_text(data):
    for index, abstract in enumerate(data['abstract']):
        title_abstract_summary = data['title'][index] + '. ' + abstract
        title_abstract_summary = preprocessing(title_abstract_summary)
        data['abstract'].iat[index] = title_abstract_summary
    if 'keywords' in data.columns:
        data.rename(columns={"keywords": "keyword"}, inplace=True)
    return data

data_abstract = combine_text(data_abstract)
data_summaries = combine_text(data_summaries)


In [7]:
# Extract keyphrases
tfidf_matrix = phrase_identification_tfidf(data_abstract)
glove_model = load_glove_model('GloVe\\glove.6B\\glove.6B.100d.txt')
pred_keyphrases_abstract, gold_keyphrases = extract_keyphrases(data_abstract, tfidf_matrix, glove_model)


In [8]:
print(pred_keyphrases_abstract[0])
print(gold_keyphrases[0])

[['source', 'dispersers'], ['sub', 'polynomial', 'entropy'], ['ramsey', 'graphs'], ['frankl', 'wilson', 'construction'], ['extractor', 'ideas'], ['independent', 'sources'], ['main', 'result'], ['explicit', 'construction'], ['boolean', 'matrices'], ['explicit', 'disperser']]
[['sum-product', 'theorem'], ['distribut'], ['explicit', 'dispers'], ['construct', 'of', 'dispers'], ['extractor'], ['recurs'], ['subsourc', 'somewher', 'extractor'], ['structur'], ['bipartit', 'graph'], ['extractor'], ['independ', 'sourc'], ['extractor'], ['tool'], ['ramsey', 'graph'], ['dispers'], ['polynomi', 'time', 'comput', 'dispers'], ['resili'], ['theorem'], ['ramsey', 'graph'], ['block-sourc'], ['defici'], ['termin'], ['entropi'], ['ramsey', 'graph'], ['independ', 'sourc'], ['algorithm'], ['independ', 'sourc'], ['subsourc'], ['dispers'], ['random', 'extract']]


In [11]:
pred_keyphrases_summaries, _ = extract_keyphrases(data_summaries, tfidf_matrix, glove_model)

In [12]:
print(pred_keyphrases_summaries[0])

[['source', 'dispersers'], ['ramsey', 'graphs'], ['sub', 'polynomial', 'entropy'], ['extractor', 'ideas'], ['entropy'], ['frankl', 'wilson', 'construction'], ['extractor'], ['independent', 'sources'], ['explicit', 'construction'], ['main', 'result']]


In [13]:
pred_keyphrases = [pred_abstract + pred_summaries  for pred_abstract, pred_summaries in zip(pred_keyphrases_abstract, pred_keyphrases_summaries)]

In [14]:
traditional_evaluation.evaluation(y_pred=pred_keyphrases, y_test=gold_keyphrases, x_test=data_summaries, x_filename='')

                                                 title  \
0    2-Source Dispersers for Sub-Polynomial Entropy...   
1    A Frequency-based and a Poisson-based Definiti...   
2                     High Performance Crawling System   
3    Hiperlan/2 Public Access Interworking with 3G ...   
4                              2D Information Displays   
..                                                 ...   
206                   Formally Deriving an STG Machine   
207      Building Bridges for Web Query Classification   
208      Geographically Focused Collaborative Crawling   
209  GraalBench: A 3D Graphics Benchmark Suite for ...   
210  Handoff Trigger Table for Integrated 3G/WLAN N...   

                                              abstract  \
0    [2, Source, Dispersers, for, Sub, Polynomial, ...   
1    [A, Frequency, based, and, a, Poisson, based, ...   
2    [High, Performance, Crawling, System, In, the,...   
3    [Hiperlan, 2, Public, Access, Interworking, wi...   
4    [2D, Inf

In [22]:
# Read data
json_data = []
for line in open(file_abstract, 'r', encoding="utf8"):
    json_data.append(json.loads(line))
data_abstract = json_normalize(json_data)
data_summaries = pd.read_csv(file_summaries, encoding="utf8")

# Combine title and abstract, preprocess text
def preprocess_text(text):
    # Apply contractions
    def get_contractions():
        contraction_dict = {"ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because"}
        contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
        return contraction_dict, contraction_re

    def replace_contractions(text):
        contractions, contractions_re = get_contractions()

        def replace(match):
            return contractions[match.group(0)]

        return contractions_re.sub(replace, text)

    # Substitute contractions with full words
    text = replace_contractions(text)

    # Remove brackets and their contents
    text = re.sub(r'\[.*?\]', '', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    return text

data_abstract['abstract'] = data_abstract['abstract'].apply(preprocess_text)
data_summaries['abstract'] = data_summaries['abstract'].apply(preprocess_text)


In [23]:
# ======================================================================================================================
# Semantic Similarity
# ======================================================================================================================
def calculate_semantic_similarity(texts):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return similarity_matrix

# Combine abstracts and summaries for semantic similarity calculation
combined_texts = data_abstract['abstract'].tolist() + data_summaries['abstract'].tolist()
semantic_similarity_matrix = calculate_semantic_similarity(combined_texts)


In [24]:
import ast
import json
import string
import re
import pandas as pd
import pke
import nltk
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from pandas import json_normalize
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import traditional_evaluation
from numpy import savez_compressed
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx

stemmer = PorterStemmer()

def tokenize_and_stem(text):
    tokens = word_tokenize(text)
    stems = [stemmer.stem(t) for t in tokens]
    return stems

# Tokenize and create n-grams
cv = CountVectorizer(tokenizer=tokenize_and_stem, ngram_range=(1, 2))
cv.fit(data_abstract['abstract'])

X = cv.transform(data_abstract['abstract'])




In [18]:
nlp = spacy.load("en_core_web_sm")

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Simrath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Simrath\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
def extract_keyphrases(data):
    extractor = pke.unsupervised.MultipartiteRank()
    pos = {'NOUN', 'PROPN', 'ADJ'}
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += stopwords.words('english')
    
    keyphrases = []
    for abstract in data['abstract']:
        extractor.load_document(input=abstract, normalization="stemming")
        extractor.candidate_selection(pos=pos)
        extractor.candidate_weighting(alpha=1.1, threshold=0.74, method='average')
        pred_kps = extractor.get_n_best(n=10)
        keyphrases.append([kp[0].split() for kp in pred_kps])
    
    return keyphrases

pred_keyphrases_abstract1 = extract_keyphrases(data_abstract)
pred_keyphrases_summaries1 = extract_keyphrases(data_summaries)

# Combine abstract and summaries
data_summaries['abstract'] = data_abstract['abstract'] + ' ' + data_summaries['abstract']

In [27]:
# ======================================================================================================================
# Named Entity Recognition (NER)
# ======================================================================================================================
import spacy

def extract_named_entities(text, filter_entities=True):
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        if filter_entities:
            # Exclude named entities that are person names or dates
            if ent.label_ not in ['PERSON', 'DATE']:
                entities.append(ent.text)
        else:
            entities.append(ent.text)
    return entities

data_abstract['abstract'] = data_abstract['abstract'].apply(lambda x: ' '.join(x))
data_summaries['abstract'] = data_summaries['abstract'].apply(lambda x: ' '.join(x))

# Apply the modified function to filter out specific named entities
data_abstract['abstract'] = data_abstract['abstract'].apply(lambda x: extract_named_entities(x, filter_entities=True))
data_summaries['abstract'] = data_summaries['abstract'].apply(lambda x: extract_named_entities(x, filter_entities=True))

# ======================================================================================================================
# Named Entity Recognition (NER)
# ======================================================================================================================
def column_named_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents]
    return entities
# Ensure the 'abstract' column contains strings
data_abstract['abstract'] = data_abstract['abstract'].apply(lambda x: ' '.join(x))
data_summaries['abstract'] = data_summaries['abstract'].apply(lambda x: ' '.join(x))

# Apply the modified function to filter out specific named entities
data_abstract['named_entities'] = data_abstract['abstract'].apply(column_named_entities)
data_summaries['named_entities'] = data_summaries['abstract'].apply(column_named_entities)


In [28]:
# ======================================================================================================================
# Topic Modeling
# ======================================================================================================================
def perform_topic_modeling(texts):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
    lda_model.fit(tfidf_matrix)
    return lda_model

# Fit LDA model on combined abstracts and summaries
lda_model = perform_topic_modeling(combined_texts)

In [29]:
# ======================================================================================================================
# Graph-based Methods (TextRank)
# ======================================================================================================================
def textrank_keyphrases(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    keyword_phrases = []

    for sentence in sentences:
        words = sentence.split()
        graph = nx.Graph()

        # Add nodes to the graph
        for word in words:
            graph.add_node(word)

        # Add edges to the graph
        for i in range(len(words)):
            for j in range(i+1, len(words)):
                similarity = semantic_similarity_matrix[i][j]
                if similarity > 0.2:  # Threshold for edge creation
                    graph.add_edge(words[i], words[j])

        # Calculate TextRank scores
        scores = nx.pagerank(graph)

        # Select top keywords based on TextRank scores
        keywords = sorted(scores, key=scores.get, reverse=True)[:5]
        keyword_phrases.append(keywords)

    return keyword_phrases

pred_keyphrases_abstract_textrank = data_abstract['abstract'].apply(textrank_keyphrases)
pred_keyphrases_summaries_textrank=data_summaries['abstract'].apply(textrank_keyphrases)

In [30]:
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize
# Tokenize the research paper text
def word_disambiguation(text):
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    
    # Apply WSD to disambiguate ambiguous terms
    disambiguated_tokens = []
    for token in filtered_tokens:
        # Apply WSD only to domain-specific terms or ambiguous terms
        if token in ['Java', 'algorithm','python', 'ruby', 'shell', 'kernel', 'address', 'query', 'table', 'object', 'stream']:  # Example ambiguous terms
            synset = lesk(filtered_tokens, token)
            if synset:
                disambiguated_tokens.append((token, synset.definition()))
            else:
                disambiguated_tokens.append((token, None))
        else:
            disambiguated_tokens.append((token, None))
    
    # Extract keyphrases based on disambiguated terms
    keyphrases = []
    for token, definition in disambiguated_tokens:
        if definition:
            keyphrase = f"{token} ({definition})"
        else:
            keyphrase = token
        keyphrases.append(keyphrase)
    return keyphrases

pred_keyphrases_abstract_wd = data_abstract['abstract'].apply( word_disambiguation)
pred_keyphrases_summaries_wd=data_summaries['abstract'].apply (word_disambiguation)

In [37]:
#using bert
from transformers import BertTokenizer, BertModel
# Set file names for data
file_abstract = '..\\data\\benchmark_data\\NUS.json'
file_summaries = '..\\data\\benchmark_data\\summarization_experiment\\NUS_summarized.csv'

# Read data
json_data = []
for line in open(file_abstract, 'r', encoding="utf8"):
    json_data.append(json.loads(line))
data_abstract = json_normalize(json_data)
data_summaries = pd.read_csv(file_summaries, encoding="utf8")

# Combine title and abstract, preprocess text
def preprocess_text(text):
    # Apply contractions
    def get_contractions():
        contraction_dict = {"ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because"}
        contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
        return contraction_dict, contraction_re

    def replace_contractions(text):
        contractions, contractions_re = get_contractions()

        def replace(match):
            return contractions[match.group(0)]

        return contractions_re.sub(replace, text)

    # Substitute contractions with full words
    text = replace_contractions(text)

    # Remove brackets and their contents
    text = re.sub(r'\[.*?\]', '', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    return text

data_abstract['abstract'] = data_abstract['abstract'].apply(preprocess_text)
data_summaries['abstract'] = data_summaries['abstract'].apply(preprocess_text)

In [38]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize the text
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True) for text in data_abstract['abstract']]

# Pad sequences to a fixed length
max_length = max(len(tokens) for tokens in tokenized_texts)
padded_tokenized_texts = [tokens + [0] * (max_length - len(tokens)) for tokens in tokenized_texts]

# Convert token IDs to tensors
input_ids = torch.tensor(padded_tokenized_texts)

# Obtain BERT embeddings
with torch.no_grad():
    outputs = model(input_ids)
    bert_embeddings = outputs.last_hidden_state  # Extract embeddings from the last layer


def extract_keywords_bert(embeddings):

    avg_embeddings = np.mean(embeddings.numpy(), axis=1)
    
    # Perform keyword extraction using the averaged embeddings
    keywords = []
    for emb in avg_embeddings:
        # Example: Find top words based on the highest values in the embedding vector
        keywords.append(" ".join([str(token) for token in emb.argsort()[-5:]]))  # Extract top 5 tokens as keywords
    
    return keywords

# Apply keyword extraction function to BERT embeddings
extracted_keywords = extract_keywords_bert(bert_embeddings)


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [42]:
pred_keyphrases_textrank_s = data_summaries['abstract'].apply(textrank_keyphrases)

In [44]:
pred_keyphrases2 = [pred_abstract + pred_summaries + pred_ner + pred_ner_s+pred_textrank+pred_textrank_s for pred_abstract, pred_summaries, pred_ner, pred_ner_s, pred_textrank, pred_textrank_s in zip(pred_keyphrases_abstract, pred_keyphrases_summaries, data_abstract['named_entities'],data_summaries['named_entities'], pred_keyphrases_textrank,pred_keyphrases_textrank_s)]

In [48]:
pred_final_keyphrases=[pred_k1+pred_k2 for pred_k1,pred_k2 in zip(pred_keyphrases,pred_keyphrases2)]