In [2]:
import ast
import json
import string
import re
import pandas as pd
import pke
import nltk
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from pandas import json_normalize
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import traditional_evaluation
from numpy import savez_compressed

In [3]:
# Fetch the NLTK data packages used below: the 'punkt' tokenizer data and the
# 'stopwords' corpus. Packages that are already installed are reported as
# up-to-date instead of being downloaded again.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Simrath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Simrath\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Set file names for data
file_abstract = '..\\data\\benchmark_data\\NUS.json'
file_summaries = '..\\data\\benchmark_data\\summarization_experiment\\NUS_summarized.csv'

# Read data
# The benchmark file holds one JSON document per line. It is read inside a `with`
# block so the file handle is closed as soon as parsing finishes, and blank lines
# (e.g. a trailing newline) are skipped because json.loads would reject them.
with open(file_abstract, 'r', encoding="utf8") as abstract_file:
    json_data = [json.loads(line) for line in abstract_file if line.strip()]
data_abstract = json_normalize(json_data)
data_summaries = pd.read_csv(file_summaries, encoding="utf8")


In [5]:
# Preprocess text: expand contractions, drop bracketed spans, drop digits
def preprocess_text(text):
    """Return a cleaned copy of `text`.

    Three substitutions are applied in order:
      1. the contractions listed below are replaced by their expanded form,
      2. every `[...]` span (shortest match, brackets included) is removed,
      3. every run of digits is removed.
    Whitespace left behind by the removals is not collapsed.
    """
    expansions = {"ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because"}
    # One alternation over all contraction keys; the matched text is the lookup key.
    contraction_pattern = re.compile('(%s)' % '|'.join(expansions.keys()))

    expanded = contraction_pattern.sub(lambda found: expansions[found.group(0)], text)
    without_brackets = re.sub(r'\[.*?\]', '', expanded)
    return re.sub(r'\d+', '', without_brackets)

# Run the same cleaning routine over the 'abstract' column of both frames.
for frame in (data_abstract, data_summaries):
    frame['abstract'] = frame['abstract'].apply(preprocess_text)


In [6]:
# Additional NLP processing
stemmer = PorterStemmer()


def tokenize_and_stem(text):
    """Tokenize `text` with word_tokenize and return the Porter stem of every token as a list."""
    return list(map(stemmer.stem, word_tokenize(text)))

# Tokenize and create n-grams
# The vectorizer counts stemmed unigrams and bigrams. fit_transform learns the
# vocabulary and builds the document-term matrix in a single pass; calling fit()
# and then transform() on the same column ran tokenize_and_stem over every
# abstract twice for the same result.
cv = CountVectorizer(tokenizer=tokenize_and_stem, ngram_range=(1, 2))
X = cv.fit_transform(data_abstract['abstract'])




In [8]:
# Extract keyphrases
def extract_keyphrases(data, n_best=10):
    """Rank keyphrase candidates for every abstract with pke's MultipartiteRank.

    Parameters
    ----------
    data : object indexable by 'abstract' (e.g. a DataFrame)
        `data['abstract']` is iterated; each item is handed to pke as one document.
    n_best : int, default 10
        How many top-ranked candidates to keep per document.

    Returns
    -------
    list
        One entry per document, in input order. Each entry is a list of
        keyphrases, and each keyphrase is the list of tokens obtained by
        splitting the returned phrase on whitespace.
    """
    # Only nouns, proper nouns and adjectives may form candidates.
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # NOTE(review): an earlier version built a custom stoplist (punctuation,
    # bracket tokens, NLTK English stopwords) here but never passed it to pke, so
    # it had no effect and has been removed. If a custom stoplist is wanted, pass
    # it through the argument the installed pke version provides for it.

    keyphrases = []
    for abstract in data['abstract']:
        # A fresh extractor per document, so no candidates or graph state from a
        # previous document can carry over into this one.
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=abstract, normalization="stemming")
        extractor.candidate_selection(pos=pos)
        extractor.candidate_weighting(alpha=1.1, threshold=0.74, method='average')
        pred_kps = extractor.get_n_best(n=n_best)
        # get_n_best yields (phrase, score) pairs; keep only the phrase, tokenized.
        keyphrases.append([phrase.split() for phrase, *_ in pred_kps])

    return keyphrases

# Extract keyphrases from the abstracts first, then from the summaries.
pred_keyphrases_abstract, pred_keyphrases_summaries = (
    extract_keyphrases(frame) for frame in (data_abstract, data_summaries)
)


In [9]:
# Combine abstract and summaries
# Element-wise concatenation: abstract, a single space, then the summary text.
# The result overwrites data_summaries['abstract'], which is also read on the
# right-hand side, so running this cell a second time prepends the abstract again.
data_summaries['abstract'] = data_abstract['abstract'] + ' ' + data_summaries['abstract']

In [11]:
# ======================================================================================================================
# GloVe Word Embeddings
# ======================================================================================================================

# Load GloVe word embeddings
gloveFile = 'GloVe\\glove.6B\\glove.6B.100d.txt'

print("Loading GloVe Model")
glove_model = {}
with open(gloveFile, 'r', encoding="utf8") as f:
    for line in f:
        # Whitespace-separated record: the first field is the token, the
        # remaining fields are the components of its vector.
        fields = line.split()
        glove_model[fields[0]] = np.array(fields[1:], dtype='float32')

print("Found %s word vectors." % len(glove_model))

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_abstract['abstract'])
word_index = tokenizer.word_index

# Convert the abstracts to index sequences once and reuse the result both for the
# longest-sequence length and for padding (previously the conversion ran twice).
X_tokenized = tokenizer.texts_to_sequences(data_abstract['abstract'])
max_sequence_length = max(len(seq) for seq in X_tokenized)
# Shorter sequences are padded at the end up to the longest sequence length.
X_padded = pad_sequences(X_tokenized, maxlen=max_sequence_length, padding='post')

# Create embedding matrix
embedding_dim = 100
num_tokens = len(word_index)
# One row more than the vocabulary size; rows are addressed by the tokenizer's
# word index (presumably indices start at 1, leaving row 0 unused - confirm).
embedding_matrix = np.zeros((num_tokens + 1, embedding_dim))
for token, row in word_index.items():
    vector = glove_model.get(token)
    if vector is None:
        # No pre-trained vector for this token: its row stays all zeros.
        continue
    embedding_matrix[row] = vector

# Save embedding matrix
np.savez_compressed('embedding_matrix.npz', embedding_matrix=embedding_matrix)


Loading GloVe Model
Found 400000 word vectors.


In [17]:
# Labeling words as keyword or non-keyword
#
# For every abstract a 0/1 label list is built with one entry per word token:
# 1 if the token is part of a gold keyphrase occurrence, 0 otherwise.
#
# Earlier versions iterated `abstract` and `keywords` directly. When those cells
# hold plain strings (the abstract text, a ';'-separated keyword string), that
# loops over single characters, so characters rather than words were labelled.
# The helpers below tokenize strings first and pass already-tokenized input
# through unchanged.
# NOTE(review): labels follow word_tokenize's tokenization; if they must line up
# with the Keras Tokenizer sequences, confirm both tokenizations agree.
count_KP = 0        # keyphrase occurrences found (one per matching start position)
count_KP_words = 0  # tokens labelled 1
count_NON_KP = 0    # tokens still labelled 0 when they are visited
y = []

# Initialize the Porter Stemmer
stemmer = PorterStemmer()


def _abstract_tokens(abstract):
    """Return the abstract as a list of word tokens.

    A string is split with word_tokenize; any other value is assumed to be an
    already-tokenized sequence and is copied into a list as-is.
    """
    if isinstance(abstract, str):
        return word_tokenize(abstract)
    return list(abstract)


def _stemmed_keyphrases(keywords):
    """Return the gold keyphrases as a list of stemmed-token lists.

    A single string is split on ';' into keyphrases. Each keyphrase given as a
    string is tokenized and stemmed; a keyphrase that is already a token sequence
    is used unchanged (assumed to be stemmed already). Empty keyphrases are
    dropped, and a missing value (None / NaN) yields no keyphrases.
    """
    if keywords is None or isinstance(keywords, float):
        return []
    if isinstance(keywords, str):
        keywords = keywords.split(';')
    phrases = []
    for keyphrase in keywords:
        if isinstance(keyphrase, str):
            tokens = [stemmer.stem(token) for token in word_tokenize(keyphrase)]
        else:
            tokens = list(keyphrase)
        if tokens:
            phrases.append(tokens)
    return phrases


# zip pairs the two columns positionally, so the loop does not depend on the
# frame having a default 0..n-1 index.
for abstract, keywords in tqdm(zip(data_abstract['abstract'], data_abstract['keywords']),
                               total=len(data_abstract)):
    tokens = _abstract_tokens(abstract)
    # Stem every token once instead of re-stemming inside the matching loops.
    stems = [stemmer.stem(token) for token in tokens]
    keyphrases = _stemmed_keyphrases(keywords)

    abstract_word_labels = [0] * len(tokens)
    for i in range(len(stems)):
        for keyphrase in keyphrases:
            # A slice running past the end is shorter than the keyphrase and
            # therefore never compares equal.
            if stems[i:i + len(keyphrase)] == keyphrase:
                for x in range(len(keyphrase)):
                    abstract_word_labels[i + x] = 1
                count_KP += 1
                break  # first matching keyphrase wins for this start position
        if not abstract_word_labels[i]:
            count_NON_KP += 1
    count_KP_words += abstract_word_labels.count(1)
    y.append(abstract_word_labels)



100%|████████████████████████████████████████████████████████████████████████████████| 211/211 [00:05<00:00, 41.93it/s]


In [18]:
# Evaluate
# Predictions: per document, the keyphrases extracted from the abstract followed
# by those extracted from the summary.
pred_keyphrases = [
    pred_abstract + pred_summaries
    for pred_abstract, pred_summaries in zip(pred_keyphrases_abstract, pred_keyphrases_summaries)
]
# Gold labels: take the column values directly. The previous round trip through
# to_json() and ast.literal_eval() parsed JSON as a Python literal, which leaves
# a stray backslash in every value containing '/' (JSON writes it as '\/') and
# raises on missing values (JSON 'null' is not a Python literal).
gold_keyphrases = data_summaries['keywords'].tolist()




In [19]:
# Compare the predicted keyphrases with the gold ones using the project's
# evaluation helper; the combined abstract + summary frame is passed as x_test.
traditional_evaluation.evaluation(
    y_pred=pred_keyphrases,
    y_test=gold_keyphrases,
    x_test=data_summaries,
    x_filename='',
)

                                                 title  \
0    2-Source Dispersers for Sub-Polynomial Entropy...   
1    A Frequency-based and a Poisson-based Definiti...   
2                     High Performance Crawling System   
3    Hiperlan/2 Public Access Interworking with 3G ...   
4                              2D Information Displays   
..                                                 ...   
206                   Formally Deriving an STG Machine   
207      Building Bridges for Web Query Classification   
208      Geographically Focused Collaborative Crawling   
209  GraalBench: A 3D Graphics Benchmark Suite for ...   
210  Handoff Trigger Table for Integrated 3G/WLAN N...   

                                              abstract  \
0    [The, main, result, of, this, paper, is, an, e...   
1    [This, paper, reports, on, theoretical, invest...   
2    [In, the, present, paper,, we, will, describe,...   
3    [This, paper, presents, a, technical, overview...   
4    [Many, e

In [20]:
# Show only the first few documents; printing the whole list dumps every gold
# keyphrase of the corpus into the notebook output.
print(gold_keyphrases[:3])

['sum-product theorem;distribution;explicit disperser;construction of disperser;Extractors;recursion;subsource somewhere extractor;structure;bipartite graph;extractors;independent sources;extractor;tools;Ramsey Graphs;disperser;polynomial time computable disperser;resiliency;Theorem;Ramsey graphs;block-sources;deficiency;termination;entropy;Ramsey graph;Independent Sources;algorithms;independent source;subsource;Dispersers;randomness extraction', 'inverse document frequency (idf);independent and disjoint documents;computer science;information search;probability theories;Poisson based probability;Term frequency;probabilistic retrieval models;Probability of being informative;Independent documents;Disjoint documents;Normalisation;relevance-based ranking of retrieved objects;information theory;Noise probability;frequency-based term noise probability;Poisson-based probability of being informative;Assumptions;Collection space;Poisson distribution;Probabilistic information retrieval;Document 

In [21]:
# Show only the first few documents; printing the whole list dumps every
# predicted keyphrase of the corpus into the notebook output.
print(pred_keyphrases[:3])

[[['independent', 'sources'], ['extractor', 'ideas'], ['bipartite', 'graphs'], ['extractor'], ['explicit', 'construction'], ['entropy', 'rate'], ['main', 'result'], ['adjacency', 'matrices'], ['classical'], ['explicit', 'disperser'], ['independent', 'sources'], ['extractor', 'ideas'], ['extractor'], ['entropy', 'rate'], ['bipartite', 'graphs'], ['explicit', 'construction'], ['extraction'], ['year', 'record'], ['boolean', 'matrices'], ['mechanism']], [['inverse', 'document', 'frequency'], ['informative', 'assumes'], ['probability'], ['idf'], ['assumptions'], ['term'], ['theoretical', 'investigations'], ['intuitive', 'idf', '-based', 'probability', 'function'], ['paper', 'reports'], ['document', 'events'], ['inverse', 'document', 'frequency'], ['idf'], ['term'], ['documents'], ['informative', 'assumes'], ['parameter', 'estimation'], ['useful'], ['hard', 'copies'], ['combination'], ['probability']], [['system'], ['web', 'crawling'], ['real-time'], ['machines'], ['cluster'], ['implementati