In [2]:
import ast
import json
import string
import re
import pandas as pd
import pke
import nltk
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from pandas import json_normalize
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import traditional_evaluation
from numpy import savez_compressed
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx


In [3]:
nlp = spacy.load("en_core_web_sm")

nltk.download('punkt')
nltk.download('stopwords')

# Set file names for data
file_abstract = '..\\data\\benchmark_data\\NUS.json'
file_summaries = '..\\data\\benchmark_data\\summarization_experiment\\NUS_summarized.csv'


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Simrath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Simrath\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Read data
json_data = []
for line in open(file_abstract, 'r', encoding="utf8"):
    json_data.append(json.loads(line))
data_abstract = json_normalize(json_data)
data_summaries = pd.read_csv(file_summaries, encoding="utf8")

# Combine title and abstract, preprocess text
def preprocess_text(text):
    # Apply contractions
    def get_contractions():
        contraction_dict = {"ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because"}
        contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
        return contraction_dict, contraction_re

    def replace_contractions(text):
        contractions, contractions_re = get_contractions()

        def replace(match):
            return contractions[match.group(0)]

        return contractions_re.sub(replace, text)

    # Substitute contractions with full words
    text = replace_contractions(text)

    # Remove brackets and their contents
    text = re.sub(r'\[.*?\]', '', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    return text

data_abstract['abstract'] = data_abstract['abstract'].apply(preprocess_text)
data_summaries['abstract'] = data_summaries['abstract'].apply(preprocess_text)


In [5]:
# Additional NLP processing
stemmer = PorterStemmer()

def tokenize_and_stem(text):
    tokens = word_tokenize(text)
    stems = [stemmer.stem(t) for t in tokens]
    return stems

# Tokenize and create n-grams
cv = CountVectorizer(tokenizer=tokenize_and_stem, ngram_range=(1, 2))
cv.fit(data_abstract['abstract'])

X = cv.transform(data_abstract['abstract'])




In [7]:
# Extract keyphrases
def extract_keyphrases(data):
    extractor = pke.unsupervised.MultipartiteRank()
    pos = {'NOUN', 'PROPN', 'ADJ'}
    stoplist = list(string.punctuation)
    stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    stoplist += stopwords.words('english')
    
    keyphrases = []
    for abstract in data['abstract']:
        extractor.load_document(input=abstract, normalization="stemming")
        extractor.candidate_selection(pos=pos)
        extractor.candidate_weighting(alpha=1.1, threshold=0.74, method='average')
        pred_kps = extractor.get_n_best(n=10)
        keyphrases.append([kp[0].split() for kp in pred_kps])
    
    return keyphrases

pred_keyphrases_abstract = extract_keyphrases(data_abstract)
pred_keyphrases_summaries = extract_keyphrases(data_summaries)

# Combine abstract and summaries
data_summaries['abstract'] = data_abstract['abstract'] + ' ' + data_summaries['abstract']


In [8]:
# ======================================================================================================================
# Named Entity Recognition (NER)
# ======================================================================================================================
def extract_named_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents]
    return entities

data_abstract['named_entities'] = data_abstract['abstract'].apply(extract_named_entities)
data_summaries['named_entities'] = data_summaries['abstract'].apply(extract_named_entities)


In [9]:
# ======================================================================================================================
# Semantic Role Labeling (SRL)
# ======================================================================================================================
# Semantic Role Labeling is not directly supported by spaCy, so it would require additional libraries or models.

# ======================================================================================================================
# Semantic Similarity
# ======================================================================================================================
def calculate_semantic_similarity(texts):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return similarity_matrix

# Combine abstracts and summaries for semantic similarity calculation
combined_texts = data_abstract['abstract'].tolist() + data_summaries['abstract'].tolist()
semantic_similarity_matrix = calculate_semantic_similarity(combined_texts)


In [10]:
# ======================================================================================================================
# Topic Modeling
# ======================================================================================================================
def perform_topic_modeling(texts):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
    lda_model.fit(tfidf_matrix)
    return lda_model

# Fit LDA model on combined abstracts and summaries
lda_model = perform_topic_modeling(combined_texts)


In [11]:
# ======================================================================================================================
# Graph-based Methods (TextRank)
# ======================================================================================================================
def textrank_keyphrases(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    keyword_phrases = []

    for sentence in sentences:
        words = sentence.split()
        graph = nx.Graph()

        # Add nodes to the graph
        for word in words:
            graph.add_node(word)

        # Add edges to the graph
        for i in range(len(words)):
            for j in range(i+1, len(words)):
                similarity = semantic_similarity_matrix[i][j]
                if similarity > 0.2:  # Threshold for edge creation
                    graph.add_edge(words[i], words[j])

        # Calculate TextRank scores
        scores = nx.pagerank(graph)

        # Select top keywords based on TextRank scores
        keywords = sorted(scores, key=scores.get, reverse=True)[:5]
        keyword_phrases.append(keywords)

    return keyword_phrases

pred_keyphrases_textrank = data_abstract['abstract'].apply(textrank_keyphrases)


In [1]:
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize
# Tokenize the research paper text
def word_disambiguation(text):
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    
    # Apply WSD to disambiguate ambiguous terms
    disambiguated_tokens = []
    for token in filtered_tokens:
        # Apply WSD only to domain-specific terms or ambiguous terms
        if token in ['Java', 'algorithm','python', 'ruby', 'shell', 'kernel', 'address', 'query', 'table', 'object', 'stream']:  # Example ambiguous terms
            synset = lesk(filtered_tokens, token)
            if synset:
                disambiguated_tokens.append((token, synset.definition()))
            else:
                disambiguated_tokens.append((token, None))
        else:
            disambiguated_tokens.append((token, None))
    
    # Extract keyphrases based on disambiguated terms
    keyphrases = []
    for token, definition in disambiguated_tokens:
        if definition:
            keyphrase = f"{token} ({definition})"
        else:
            keyphrase = token
        keyphrases.append(keyphrase)
    return keyphrases



In [12]:
pred_keyphrases = [pred_abstract + pred_summaries + pred_ner + pred_textrank for pred_abstract, pred_summaries, pred_ner, pred_textrank in zip(pred_keyphrases_abstract, pred_keyphrases_summaries, data_abstract['named_entities'], pred_keyphrases_textrank)]
gold_keyphrases = ast.literal_eval(data_summaries['keywords'].to_json(orient='values'))

traditional_evaluation.evaluation(y_pred=pred_keyphrases, y_test=gold_keyphrases, x_test=data_summaries, x_filename='')



                                                 title  \
0    2-Source Dispersers for Sub-Polynomial Entropy...   
1    A Frequency-based and a Poisson-based Definiti...   
2                     High Performance Crawling System   
3    Hiperlan/2 Public Access Interworking with 3G ...   
4                              2D Information Displays   
..                                                 ...   
206                   Formally Deriving an STG Machine   
207      Building Bridges for Web Query Classification   
208      Geographically Focused Collaborative Crawling   
209  GraalBench: A 3D Graphics Benchmark Suite for ...   
210  Handoff Trigger Table for Integrated 3G/WLAN N...   

                                              abstract  \
0    [The, main, result, of, this, paper, is, an, e...   
1    [This, paper, reports, on, theoretical, invest...   
2    [In, the, present, paper,, we, will, describe,...   
3    [This, paper, presents, a, technical, overview...   
4    [Many, e

In [13]:
pred_keyphrases = [pred_abstract + pred_summaries + pred_ner + pred_textrank for pred_abstract, pred_summaries, pred_ner, pred_textrank in zip(pred_keyphrases_abstract, pred_keyphrases_summaries, data_abstract['named_entities'], pred_keyphrases_textrank)]
gold_keyphrases = ast.literal_eval(data_summaries['keywords'].to_json(orient='values'))


['sum-product theorem;distribution;explicit disperser;construction of disperser;Extractors;recursion;subsource somewhere extractor;structure;bipartite graph;extractors;independent sources;extractor;tools;Ramsey Graphs;disperser;polynomial time computable disperser;resiliency;Theorem;Ramsey graphs;block-sources;deficiency;termination;entropy;Ramsey graph;Independent Sources;algorithms;independent source;subsource;Dispersers;randomness extraction', 'inverse document frequency (idf);independent and disjoint documents;computer science;information search;probability theories;Poisson based probability;Term frequency;probabilistic retrieval models;Probability of being informative;Independent documents;Disjoint documents;Normalisation;relevance-based ranking of retrieved objects;information theory;Noise probability;frequency-based term noise probability;Poisson-based probability of being informative;Assumptions;Collection space;Poisson distribution;Probabilistic information retrieval;Document 

In [14]:
print(pred_keyphrases)

[[['independent', 'sources'], ['extractor', 'ideas'], ['bipartite', 'graphs'], ['extractor'], ['explicit', 'construction'], ['entropy', 'rate'], ['main', 'result'], ['adjacency', 'matrices'], ['classical'], ['explicit', 'disperser'], ['independent', 'sources'], ['extractor', 'ideas'], ['extractor'], ['entropy', 'rate'], ['bipartite', 'graphs'], ['explicit', 'construction'], ['extraction'], ['year', 'record'], ['boolean', 'matrices'], ['mechanism'], 'two', 'N', 'N Boolean', 'no K  K', 'K-Ramsey', 'N', 'Barak', 'Kindler', 'Shaltiel', 'Sudakov', 'Wigderson', 'the -year', 'Ramsey', 'Frankl', 'Wilson', 'the last couple of years', 'Bourgain', 'one', 'entropy rate &gt', 'Rao', 'The "Challenge-Response', ['of', 'is', 'an', 'n', 'The'], ['N', 'and', 'K', 'for', 'Put'], ['graphs,', 'this', 'graphs', 'N', 'Viewed'], ['k', 'and', 'This', 'greatly', 'improves'], ['It', 'also', 'significantly', 'improves', 'the'], ['of', '.', '=', '~', 'O(n)'], ['years', "Raz's", 'independent', 'any', 'almost'], ['T