In [None]:
!pip install git+https://github.com/boudinfl/pke.git
!pip install matplotlib
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/boudinfl/pke.git
  Cloning https://github.com/boudinfl/pke.git to /tmp/pip-req-build-zxdnga_p
  Running command git clone -q https://github.com/boudinfl/pke.git /tmp/pip-req-build-zxdnga_p
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 2.1 MB/s 
Building wheels for collected packages: pke, sklearn
  Building wheel for pke (setup.py) ... [?25l[?25hdone
  Created wheel for pke: filename=pke-2.0.0-py3-none-any.whl size=6160276 sha256=69d251d456dc9a3d4db481bcd3e02070421233bf65a2a1bf8cc3796f96105c89
  Stored in directory: /tmp/pip-ephem-wheel-cache-k6wm_yhm/wheels/fa/b3/09/612ee93bf3ee4164bcd5783e742942cdfc892a86039d3e0a33
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename

In [None]:
import matplotlib.pyplot as plt

In [None]:
import pke

In [None]:
# sample document (1895.abstr from the Inspec dataset)
sample = """An algorithm combining neural networks with fundamental parameters.
An algorithm combining neural networks with the fundamental parameters equations (NNFP) is proposed for making
corrections for non-linear matrix effects in x-ray fluorescence analysis. In the algorithm, neural networks were
applied to relate the concentrations of components to both the measured intensities and the relative theoretical
intensities calculated by the fundamental parameter equations. The NNFP algorithm is compared with the classical
theoretical correction models, including the fundamental parameters approach, the Lachance-Traill model, a
hyperbolic function model and the COLA algorithm. For an alloy system with 15 measured elements, in most cases,
the prediction errors of the NNFP algorithm are lower than those of the fundamental parameters approach, the
Lachance-Traill model, the hyperbolic function model and the COLA algorithm separately. If there are the serious
matrix effects, such as matrix effects among Cr, Fe and Ni, the NNFP algorithm generally decreased predictive
errors as compared with the classical models, except for the case of Cr by the fundamental parameters approach.
The main reason why the NNFP algorithm has generally a better predictive ability than the classical theoretical
correction models might be that neural networks can better calibrate the non-linear matrix effects in a complex
multivariate system.""".replace("\n", " ")

# initialize a TopicRank model (candidates are grouped into topics, and the topics are weighted)
extractor = pke.unsupervised.TopicRank()
# load the document using the initialized model
extractor.load_document(input=sample, language='en')

In [None]:

# Walk the parsed sentences and show, for each one, its index, all of its
# tokens, and the first five stems and Part-of-Speech tags.
for i, sentence in enumerate(extractor.sentences):
    joined_words = ' '.join(sentence.words)
    leading_stems = ' '.join(sentence.stems[:5])
    leading_tags = ' '.join(sentence.pos[:5])

    print("sentence {}:".format(i))
    print(" - words: {} ...".format(joined_words))
    print(" - stems: {} ...".format(leading_stems))
    print(" - PoS: {} ...".format(leading_tags))

sentence 0:
 - words: An algorithm combining neural networks with fundamental parameters . ...
 - stems: an algorithm combin neural network ...
 - PoS: DET NOUN VERB ADJ NOUN ...
sentence 1:
 - words: An algorithm combining neural networks with the fundamental parameters equations ( NNFP ) is proposed for making corrections for non-linear matrix effects in x-ray fluorescence analysis . ...
 - stems: an algorithm combin neural network ...
 - PoS: DET NOUN VERB ADJ NOUN ...
sentence 2:
 - words: In the algorithm , neural networks were applied to relate the concentrations of components to both the measured intensities and the relative theoretical intensities calculated by the fundamental parameter equations . ...
 - stems: in the algorithm , neural ...
 - PoS: ADP DET NOUN PUNCT ADJ ...
sentence 3:
 - words: The NNFP algorithm is compared with the classical theoretical correction models , including the fundamental parameters approach , the Lachance-Traill model , a hyperbolic function mod

In [None]:
# identify the keyphrase candidates using TopicRank's default strategy
# i.e. the longest sequences of nouns and adjectives `(Noun|Adj)*`
extractor.candidate_selection()

In [None]:
extractor.candidates

defaultdict(pke.data_structures.Candidate,
            {'algorithm': Candidate(),
             'neural network': Candidate(),
             'fundament paramet': Candidate(),
             'fundament paramet equat': Candidate(),
             'nnfp': Candidate(),
             'correct': Candidate(),
             'non-linear matrix effect': Candidate(),
             'x-ray fluoresc analysi': Candidate(),
             'concentr': Candidate(),
             'compon': Candidate(),
             'measur intens': Candidate(),
             'rel theoret intens': Candidate(),
             'nnfp algorithm': Candidate(),
             'classic theoret correct model': Candidate(),
             'lachance-trail model': Candidate(),
             'hyperbol function model': Candidate(),
             'cola algorithm': Candidate(),
             'alloy system': Candidate(),
             'element': Candidate(),
             'predict error': Candidate(),
             'lower': Candidate(),
             'matrix effe

In [None]:
# identifying keyphrase candidates populates the extractor.candidates dictionary
# let's have a look at the keyphrase candidates

# Show every keyphrase candidate: its index, its stemmed form (the dictionary
# key), the surface forms it was seen with, and their PoS patterns.
# The candidate's offsets and sentence ids are intentionally not printed.
for i, candidate in enumerate(extractor.candidates):
    entry = extractor.candidates[candidate]
    readable_forms = [" ".join(u) for u in entry.surface_forms]

    print("candidate {}: {} (stemmed form)".format(i, candidate))
    print(" - surface forms:", readable_forms)
    print(" - pos_patterns:", entry.pos_patterns)

candidate 0: algorithm (stemmed form)
 - surface forms: ['algorithm', 'algorithm', 'algorithm']
 - pos_patterns: [['NOUN'], ['NOUN'], ['NOUN']]
candidate 1: neural network (stemmed form)
 - surface forms: ['neural networks', 'neural networks', 'neural networks', 'neural networks']
 - pos_patterns: [['ADJ', 'NOUN'], ['ADJ', 'NOUN'], ['ADJ', 'NOUN'], ['ADJ', 'NOUN']]
candidate 2: fundament paramet (stemmed form)
 - surface forms: ['fundamental parameters', 'fundamental parameters', 'fundamental parameters', 'fundamental parameters']
 - pos_patterns: [['ADJ', 'NOUN'], ['ADJ', 'NOUN'], ['ADJ', 'NOUN'], ['ADJ', 'NOUN']]
candidate 3: fundament paramet equat (stemmed form)
 - surface forms: ['fundamental parameters equations', 'fundamental parameter equations']
 - pos_patterns: [['ADJ', 'NOUN', 'NOUN'], ['ADJ', 'NOUN', 'NOUN']]
candidate 4: nnfp (stemmed form)
 - surface forms: ['NNFP']
 - pos_patterns: [['PROPN']]
candidate 5: correct (stemmed form)
 - surface forms: ['corrections']
 - pos_p

In [None]:
extractor.candidate_weighting()

In [None]:
extractor.topics

[['measur intens'],
 ['rel theoret intens'],
 ['classic model', 'classic theoret correct model', 'lachance-trail model'],
 ['hyperbol function model'],
 ['correct'],
 ['predict abil', 'predict error'],
 ['matrix effect', 'non-linear matrix effect'],
 ['algorithm', 'cola algorithm'],
 ['nnfp', 'nnfp algorithm'],
 ['fundament paramet', 'fundament paramet equat'],
 ['alloy system'],
 ['complex multivari system'],
 ['case'],
 ['compon'],
 ['concentr'],
 ['element'],
 ['lower'],
 ['main reason'],
 ['neural network'],
 ['x-ray fluoresc analysi']]

In [None]:
# List each topic with its index and its member candidates, ';'-separated.
for i, topic in enumerate(extractor.topics):
    members = ';'.join(topic)
    print("topic {}: {} ".format(i, members))

topic 0: measur intens 
topic 1: rel theoret intens 
topic 2: classic model;classic theoret correct model;lachance-trail model 
topic 3: hyperbol function model 
topic 4: correct 
topic 5: predict abil;predict error 
topic 6: matrix effect;non-linear matrix effect 
topic 7: algorithm;cola algorithm 
topic 8: nnfp;nnfp algorithm 
topic 9: fundament paramet;fundament paramet equat 
topic 10: alloy system 
topic 11: complex multivari system 
topic 12: case 
topic 13: compon 
topic 14: concentr 
topic 15: element 
topic 16: lower 
topic 17: main reason 
topic 18: neural network 
topic 19: x-ray fluoresc analysi 


In [None]:
# Inspect the weights/ranks of the topics.
#
# In TopicRank a weight is computed per topic, and each topic is represented
# by a single candidate (by default the first occurring one), so
# `extractor.weights` has one entry per topic.
for candidate in extractor.weights:
    weight = extractor.weights[candidate]

    # stemmed representative candidate, followed by its weight
    print('{}: {}'.format(candidate, weight))

measur intens: 0.030046327245108993
rel theoret intens: 0.029183590002558775
classic theoret correct model: 0.09212888137678357
hyperbol function model: 0.04722314625080061
correct: 0.02954851748304181
predict error: 0.05826811246169494
non-linear matrix effect: 0.06160976716265
algorithm: 0.10129306084734259
nnfp: 0.1016672672752223
fundament paramet: 0.12148599748134195
alloy system: 0.026858039884651526
complex multivari system: 0.018915403315796886
case: 0.022538485225400532
compon: 0.03203632824183242
concentr: 0.03126608009457482
element: 0.025366989363370306
lower: 0.02847805842257687
main reason: 0.024726547192895278
neural network: 0.08561594954893767
x-ray fluoresc analysi: 0.031743451123418316


In [None]:
extractor.weights.items()

dict_items([('measur intens', 0.030046327245108993), ('rel theoret intens', 0.029183590002558775), ('classic theoret correct model', 0.09212888137678357), ('hyperbol function model', 0.04722314625080061), ('correct', 0.02954851748304181), ('predict error', 0.05826811246169494), ('non-linear matrix effect', 0.06160976716265), ('algorithm', 0.10129306084734259), ('nnfp', 0.1016672672752223), ('fundament paramet', 0.12148599748134195), ('alloy system', 0.026858039884651526), ('complex multivari system', 0.018915403315796886), ('case', 0.022538485225400532), ('compon', 0.03203632824183242), ('concentr', 0.03126608009457482), ('element', 0.025366989363370306), ('lower', 0.02847805842257687), ('main reason', 0.024726547192895278), ('neural network', 0.08561594954893767), ('x-ray fluoresc analysi', 0.031743451123418316)])

In [None]:
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd

In [None]:
 candidates = pd.DataFrame(extractor.weights.items(), columns=['Candidate', 'Weight'])

In [None]:
candidates.sort_values(by='Weight' , ascending = False )

Unnamed: 0,Candidate,Weight
9,fundament paramet,0.121486
8,nnfp,0.101667
7,algorithm,0.101293
2,classic theoret correct model,0.092129
18,neural network,0.085616
6,non-linear matrix effect,0.06161
5,predict error,0.058268
3,hyperbol function model,0.047223
13,compon,0.032036
19,x-ray fluoresc analysi,0.031743


In [None]:
candidates.reset_index(inplace = True , drop = True)


In [None]:
candidates

Unnamed: 0,Candidate,Weight
0,measur intens,0.030046
1,rel theoret intens,0.029184
2,classic theoret correct model,0.092129
3,hyperbol function model,0.047223
4,correct,0.029549
5,predict error,0.058268
6,non-linear matrix effect,0.06161
7,algorithm,0.101293
8,nnfp,0.101667
9,fundament paramet,0.121486


In [None]:
def evaluate(top_N_keyphrases, references):
    """Score predicted keyphrases against gold keyphrases by exact match.

    Matching is set-based: duplicates on either side count once in the
    intersection, while the denominators use the raw lengths of the inputs.

    Args:
        top_N_keyphrases: predicted keyphrases (same normalisation as
            ``references``, e.g. stemmed strings).
        references: gold-standard keyphrases.

    Returns:
        A ``(P, R, F)`` tuple with precision, recall and their harmonic mean.
        Any score whose denominator would be zero (no predictions, no
        references, or no overlap) is reported as 0 instead of raising
        ``ZeroDivisionError``.
    """
    matches = len(set(top_N_keyphrases) & set(references))
    # guard the denominators: an extractor may return no keyphrase at all,
    # and a document may have no gold keyphrase
    P = matches / len(top_N_keyphrases) if len(top_N_keyphrases) else 0
    R = matches / len(references) if len(references) else 0
    F = (2*P*R)/(P+R) if (P+R) > 0 else 0
    return (P, R, F)

In [None]:
# gold-standard keyphrases for the sample document (1895.abstr, keyphrases are in stemmed form)
references = ['algorithm', 'neural network', 'fundament paramet', 'fundament paramet equat',
              'nonlinear matrix effect', 'x-ray fluoresc analysi', 'intens', 'nnfp algorithm',
              'theoret correct model', 'lachance-trail model', 'hyperbol function model',
              'cola algorithm', 'alloy system', 'cr', 'fe', 'ni', 'complex multivari system']

In [None]:
keyphrases = extractor.get_n_best(n=5, stemming=True)

top5 = [candidate for candidate, weight in keyphrases]

In [None]:
top5

['fundament paramet',
 'nnfp',
 'algorithm',
 'classic theoret correct model',
 'neural network']

In [None]:
keyphrases = extractor.get_n_best(n=5, stemming=False)

# for each of the best candidates
for i, (candidate, score) in enumerate(keyphrases):
    
    # print out the its rank, phrase and score
    print("rank {}: {} ({})".format(i, candidate, score))

rank 0: fundamental parameters (0.12148599748134195)
rank 1: nnfp (0.1016672672752223)
rank 2: algorithm (0.10129306084734259)
rank 3: classical theoretical correction models (0.09212888137678357)
rank 4: neural networks (0.08561594954893767)


In [None]:
P, R, F = evaluate(top5, references)


In [None]:
print("P@5: {:.3f} R@5: {:.3f} F@5: {:.3f}".format(P, R, F))

P@5: 0.600 R@5: 0.176 F@5: 0.273


In [None]:
keyphrases = extractor.get_n_best(n=20, stemming=True)

top20 = [candidate for candidate, weight in keyphrases]

In [None]:
P20, R20, F20 = evaluate(top20, references)


In [None]:
# report the scores computed over the top-20 keyphrases
print("P@20: {:.3f} R@20: {:.3f} F@20: {:.3f}".format(P20, R20, F20))

P@2: 0.350 R@20: 0.412 F@20: 0.378


In [None]:
def max_recall(candidates, references):
    """Return the share of distinct reference keyphrases found among candidates.

    This is the best recall any ranking of ``candidates`` could reach, since a
    reference that is not a candidate can never be returned.

    Args:
        candidates: iterable of candidate keyphrases.
        references: iterable of gold keyphrases (duplicates are ignored).

    Returns:
        A value in [0, 1]; 0 when ``references`` is empty (instead of raising
        ``ZeroDivisionError``).
    """
    unique_references = set(references)
    if not unique_references:
        return 0
    return len(unique_references & set(candidates)) / len(unique_references)

In [None]:

# NOTE: `candidates` is rebound here to a plain list of stemmed candidate keys
# (earlier in the notebook the same name held a pandas DataFrame).
candidates = list(extractor.candidates)

# how many candidates were identified (one list entry per dictionary key)
print("{} keyphrase candidates were identified".format(len(candidates)))

# a short sample: the first five stemmed candidates
print("- Subsample of candidates:", ' ; '.join(candidates[:5]))

# upper bound on recall: share of gold keyphrases present among the candidates
coverage = max_recall(candidates, references)
print("- Maximum recall: {:.3f}".format(coverage))

# gold keyphrases that no candidate matches exactly
missed = set(references).difference(candidates)
print("- Missed reference keyphrases: {}".format(missed))

27 keyphrase candidates were identified
- Subsample of candidates: algorithm ; neural network ; fundament paramet ; fundament paramet equat ; nnfp
- Maximum recall: 0.647
- Missed reference keyphrases: {'nonlinear matrix effect', 'intens', 'cr', 'theoret correct model', 'ni', 'fe'}


In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 2.1 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 49.0 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 65.6 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 49.8 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 59.3 MB/s 
Installing collected p

In [None]:
from datasets import load_dataset

In [None]:
dataset = load_dataset('taln-ls2n/inspec')
#hugging face

Downloading builder script:   0%|          | 0.00/6.54k [00:00<?, ?B/s]



Downloading and preparing dataset inspec/raw to /root/.cache/huggingface/datasets/taln-ls2n___inspec/raw/1.1.0/0ae146cabe770846946b3279b4c751efe0aca2dd68b3f24427d4624cd22bb20d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/971k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/946k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset inspec downloaded and prepared to /root/.cache/huggingface/datasets/taln-ls2n___inspec/raw/1.1.0/0ae146cabe770846946b3279b4c751efe0aca2dd68b3f24427d4624cd22bb20d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'abstract', 'keyphrases', 'prmu'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'title', 'abstract', 'keyphrases', 'prmu'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['id', 'title', 'abstract', 'keyphrases', 'prmu'],
        num_rows: 500
    })
})

In [None]:
sample = dataset["train"][0]


In [None]:
sample['keyphrases']
"synergic" in ", ".join(sample['keyphrases']).lower()


True

In [None]:
# Scan the first 20 training documents and print the index and title of those
# whose keyphrases contain "net". This is a case-insensitive substring match
# on the comma-joined keyphrases, so longer words containing "net" match too.
for i in range(20):
  sample = dataset["train"][i]
  joined_keyphrases = ", ".join(sample['keyphrases']).lower()
  if "net" in joined_keyphrases:
    print(i, sample['title'])

0 Towards a NMR implementation of a quantum lattice gas algorithm
11 Dot-Net makes slow progress
12 Distributed servers approach for large-scale secure multicast
15 Grey-box model identification via evolutionary computing
16 A genetic approach to the optimization of automatic generation control parameters for power systems
18 Multi-agent collaboration for B2B workflow monitoring


In [None]:
# The preceding scan loop ends on index 19 (range(0, 20)); name that document
# explicitly instead of relying on the leaked loop variable `i`, which only
# holds 19 if that cell was the last one to bind `i`.
text = dataset["train"][19]['abstract']

In [None]:
text

"As the central nervous system for managing an organization's mission and critical business data, Enterprise Resource Planning (ERP) system has evolved to become the backbone of e-business implementation. Since an ERP system is multimodule application software that helps a company manage its important business functions, it should be versatile enough to automate every aspect of business processes, including e-business"

In [None]:
import re
import spacy
from tqdm.notebook import tqdm
from nltk.stem.snowball import SnowballStemmer as Stemmer

In [None]:
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER


In [None]:
nlp = spacy.load("en_core_web_sm")


In [None]:
nlp = spacy.load("en_core_web_sm")

# build the stemmer once: constructing it for every token is loop-invariant work
stemmer = Stemmer('porter')

# populates a docs list with spacy doc objects
# (each document is the title and the abstract joined by ". ")
docs = []
for sample in tqdm(dataset['test']):
    docs.append(nlp(sample["title"]+". "+sample["abstract"]))

# populates the references list with stemmed keyphrases
# (one list of space-joined stems per test document, aligned with `docs`)
references = []
for sample in tqdm(dataset['test']):
    sample_keyphrases = []
    for keyphrase in sample["keyphrases"]:

        # tokenize keyphrase with the same spaCy pipeline as the documents
        tokens = [token.text for token in nlp(keyphrase)]
        # normalize tokens: lowercase, then stem with the shared stemmer
        stems = [stemmer.stem(tok.lower()) for tok in tokens]
        sample_keyphrases.append(" ".join(stems))
    references.append(sample_keyphrases)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
references

[['high - fidel finit element model',
  'kidney',
  'trauma research',
  'nation librari of medicin',
  'imag segment',
  'organ reconstruct',
  'softwar packag',
  '2d vhf imag',
  '3d polygon represent',
  'nurb',
  'polygon surfac',
  '3d hexahedr finit element mesh',
  'hyperelast materi model',
  'biolog soft tissu',
  'biomechan research',
  'visibl human femal project',
  'medic data set',
  'physic base anim',
  'nonuniform ration b - spline surfac',
  'viscoelast model'],
 ['hybrid simul',
  'space plasma',
  'massless fluid represent',
  'kelvin - helmholtz instabl',
  'transit layer',
  'tangenti discontinu',
  'pressur anisotropi',
  'electromagnet hybrid model',
  'three - dimension hybrid simul',
  'magnetopaus shear layer',
  'field revers layer',
  'magnet plasma flow',
  'cylindr plasma sourc'],
 ['freeli avail java softwar',
  'databas connect',
  'imag',
  'detail stori problem',
  'feedback',
  'multipl - choic question',
  'menu select',
  'button press',
  'indivi

In [None]:

# Modify tokenizer infix patterns so that hyphenated words stay single tokens
infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # The rule that splits on hyphens between letters is deliberately left
        # out. NOTE: HYPHENS is not imported in this notebook; re-enabling the
        # line below also requires importing it.
        # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)

# install the new infix rules on the shared pipeline; every later nlp(...) call
# uses them
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

# build the stemmer once: constructing it for every token is loop-invariant work
stemmer = Stemmer('porter')

# populates a docs list with spacy doc objects (re-parsed with the new tokenizer)
docs = []
for sample in tqdm(dataset['test']):
    docs.append(nlp(sample["title"]+". "+sample["abstract"]))

# populates the references list with stemmed keyphrases
references = []
for sample in tqdm(dataset['test']):
    sample_keyphrases = []
    for keyphrase in sample["keyphrases"]:
        # tokenize keyphrase with the same (modified) tokenizer as the documents
        tokens = [token.text for token in nlp(keyphrase)]
        # normalize tokens: lowercase, then stem with the shared stemmer
        stems = [stemmer.stem(tok.lower()) for tok in tokens]
        sample_keyphrases.append(" ".join(stems))
    references.append(sample_keyphrases)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
#references

In [None]:
from tqdm.notebook import tqdm
from datasets import load_dataset

# name of the benchmark; the `dataset` object itself was loaded in an earlier cell
benchmark = "inspec"

# pre-process the training and test splits: each document is parsed from its
# title and abstract joined by ". "
train = [
    nlp(sample["title"]+". "+sample["abstract"])
    for sample in tqdm(dataset['train'])
]
test = [
    nlp(sample["title"]+". "+sample["abstract"])
    for sample in tqdm(dataset['test'])
]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
from pke import compute_document_frequency, compute_lda_model
from string import punctuation


# computing DF counts

# Document-frequency counts are computed from the pre-processed training
# documents and written to a .gz file named after the benchmark.
df_output_path = 'data/{}.df.gz'.format(benchmark)
df_stoplist = list(punctuation)  # punctuation marks only

compute_document_frequency(
    documents=train,
    output_file=df_output_path,
    language='en',             # language of the input documents
    normalization='stemming',  # count stemmed forms
    stoplist=df_stoplist,
    n=5,                       # n-grams up to 5-grams
)

In [None]:
from pke import load_document_frequency_file

df = load_document_frequency_file(input_file='data/{}.df.gz'.format(benchmark))



In [None]:
from pke.unsupervised import *

# Run every unsupervised model on the pre-processed test documents and keep,
# per model name, one list of stemmed top-10 keyphrases per document
# (aligned with `docs`).
outputs = {}
for model in [FirstPhrases, TextRank, SingleRank, TopicRank, PositionRank, MultipartiteRank]:
    outputs[model.__name__] = []

    # one extractor per model; it is re-loaded with each document in turn
    extractor = model()
    for doc in tqdm(docs):
        if model is not MultipartiteRank:
            extractor.load_document(input=doc, language='en')
            # candidates: adjectives followed by one or more nouns/proper nouns
            extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")
            extractor.candidate_weighting()
        else:
            extractor.load_document(input=doc)
            # candidates: sequences restricted to these Part-of-Speech tags.
            # No custom stoplist is built here: none is passed to
            # load_document/candidate_selection (presumably pke then falls
            # back to its own default stoplist - verify), and building one
            # required `string`/`stopwords`, which are only imported in a
            # later cell and made this cell fail on a fresh kernel.
            pos = {'VERB', 'ADJ', 'NOUN'}
            extractor.candidate_selection(pos=pos)
            # build the Multipartite graph and rank candidates using random walk,
            # alpha controls the weight adjustment mechanism, see TopicRank for
            # threshold/method parameters.
            extractor.candidate_weighting(alpha=1.1,
                                          threshold=0.5,
                                          method='average')
        # keep only the keyphrase strings (stemmed), dropping their scores
        outputs[model.__name__].append([u for u, v in extractor.get_n_best(n=10, stemming=True)])
    

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
outputs

{'FirstPhrases': [['creation',
   'high-fidel finit element model',
   'kidney',
   'use',
   'trauma research',
   'detail finit element model',
   'human kidney',
   'nation librari',
   'medicin visibl human femal',
   'vhf'],
  ['hybrid simul',
   'space plasma',
   'model',
   'massless fluid represent',
   'electron',
   'iv',
   'kelvin-helmholtz instabl',
   'pt . iii',
   'prikl',
   'mat'],
  ['on-lin homework',
   'quiz',
   'exam applet',
   'avail java softwar',
   'perform',
   'line',
   'homework',
   'avail java program',
   'student perform',
   'content'],
  ['conceptu framework',
   'evalu',
   'inform technolog invest',
   'decis',
   'new inform technolog',
   'number',
   'seriou evalu',
   'select problem',
   'technolog manag',
   'new system'],
  ['enterpris',
   'microsoft project',
   'long-tim favorit',
   'project manag',
   'enterpris debut',
   'new web-bas collabor tool',
   'scalabl',
   'olap support',
   'multipl web project',
   'dispar workgroup'],

In [None]:
import numpy as np

def evaluate(top_N_keyphrases, references):
    """Score predicted keyphrases against gold keyphrases by exact match.

    NOTE: this definition shadows the three-value ``evaluate`` defined earlier
    in the notebook; from this cell on, callers receive four values.

    Matching is set-based: duplicates count once in the intersection.

    Args:
        top_N_keyphrases: predicted keyphrases (same normalisation as
            ``references``, e.g. stemmed strings).
        references: gold-standard keyphrases.

    Returns:
        A ``(P, R, F, accuracy)`` tuple: precision and recall over the raw
        input lengths, their harmonic mean, and the share of *distinct*
        predictions that are correct. Any score whose denominator would be
        zero (no predictions, no references, or no overlap) is reported as 0
        instead of raising ``ZeroDivisionError``.
    """
    matches = len(set(top_N_keyphrases) & set(references))
    # guard the denominators: an extractor may return no keyphrase for a
    # document, and a document may have no gold keyphrase
    P = matches / len(top_N_keyphrases) if len(top_N_keyphrases) else 0
    R = matches / len(references) if len(references) else 0
    F = (2*P*R)/(P+R) if (P+R) > 0 else 0
    accuracy = matches / len(set(top_N_keyphrases)) if len(top_N_keyphrases) else 0
    return (P, R, F , accuracy)

# Score every model against the gold keyphrases and report the averages.
# `outputs[model]` holds, per test document, the top-10 keyphrases that were
# extracted, so the scores are reported at cutoff 10.
for model in outputs:

    # per-document (P, R, F, accuracy) tuples; references[i] is the gold list
    # of the i-th test document
    scores = []
    for i, output in enumerate(tqdm(outputs[model])):
        scores.append(evaluate(output, references[i]))

    # average each metric over the documents (column-wise mean)
    avg_scores = np.mean(scores, axis=0)

    # print out the performance of the model
    print("Model: {} P@10: {:.3f} R@10: {:.3f} F@10: {:.3f} acc: {:.3f}".format(model, avg_scores[0], avg_scores[1], avg_scores[2] , avg_scores[3]))

  0%|          | 0/500 [00:00<?, ?it/s]

Model: FirstPhrases P@5: 0.279 R@5: 0.331 F@5: 0.286 acc: 0.279


  0%|          | 0/500 [00:00<?, ?it/s]

Model: TextRank P@5: 0.341 R@5: 0.392 F@5: 0.344 acc: 0.341


  0%|          | 0/500 [00:00<?, ?it/s]

Model: SingleRank P@5: 0.339 R@5: 0.395 F@5: 0.344 acc: 0.339


  0%|          | 0/500 [00:00<?, ?it/s]

Model: TopicRank P@5: 0.281 R@5: 0.316 F@5: 0.282 acc: 0.281


  0%|          | 0/500 [00:00<?, ?it/s]

Model: PositionRank P@5: 0.326 R@5: 0.383 F@5: 0.332 acc: 0.326


  0%|          | 0/500 [00:00<?, ?it/s]

Model: MultipartiteRank P@5: 0.216 R@5: 0.248 F@5: 0.217 acc: 0.216


In [None]:
my_txt = "The Nile River fed Egyptian civilization for hundreds of years. It begins near the equator in Africa and flows north to the Mediterranean Sea. A delta is an area near a river’s mouth where the water deposits fine soil called silt. This soil was fertile, which means it was good for growing crops. The red land was the barren desert beyond the fertile region. When the birds arrived, the annual flood waters would soon follow. Then they used a tool called a shaduf to spread the water across the fields. These innovative, or new, techniques gave them more farmland. They were the first to grind wheat into flour and to mix the flour with yeast and water to make dough rise into bread. Egyptians often painted walls white to reflect the blazing heat. Poorer Egyptians simply went to the roof to cool off after sunset. Even during the cool season, chipping minerals out of the rock was miserable work. One ancient painting even shows a man ready to hit a catfish with a wooden hammer. A boomerang is a curved stick that returns to the person who threw it.) The river’s current was slow, so boaters used paddles to go faster when they traveled north with the current. Going south, they raised a sail and let the winds that blew in that direction push them. The Nile provided so well for Egyptians that sometimes they had surpluses, or more goods than they needed. Ancient Egypt had no money, so people exchanged goods that they grew or made. This prosperity made life easier and provided greater opportunities for many Egyptians. For example, some ancient Egyptians learned to be scribes, people whose job was to write and keep records. Some skilled artisans erected stone or brick houses and temples. A few Egyptians traveled to the upper Nile to trade with other Africans. They brought back exotic woods, animal skins, and live beasts. Egyptians created a government that divided the empire into 42 provinces. Many officials worked to keep the provinces running smoothly. 
Priests followed formal rituals and took care of the temples. Before entering a temple, a priest bathed and put on special linen garments and white sandals. Together, the priests and the ruler held ceremonies to please the gods. By doing so, they hoped to maintain the social and political order. In Egypt, people became slaves if they owed a debt, committed a crime, or were captured in war. Unlike other ancient African cultures, in Egyptian society men and women had fairly equal rights. For example, they could both own and manage their own property. Children in Egypt played with toys such as dolls, animal figures, board games, and marbles. Almost all Egyptians married when they were in their early teens. As in many ancient societies, much of the knowledge of Egypt came about as priests studied the world to find ways to please the gods. Doctors believed that the heart controlled thought and the brain circulated blood, which is the opposite of what is known now. Early Egyptians created a hieroglyphic system with about 700 characters. Legend says a king named Narmer united Upper and Lower Egypt. Some historians think Narmer actually represents several kings who gradually joined the two lands. It combined the red Crown of Lower Egypt with the white Crown of Upper Egypt. When a king died, one of his children usually took his place as ruler. Historians divide ancient Egyptian dynasties into the Old Kingdom, the Middle Kingdom, and the New Kingdom. The Old Kingdom started about 2575 B.C., when the Egyptian empire was gaining strength. In such a case, a rival might drive him from power and start a new dynasty. The first rulers of Egypt were often buried in an underground tomb topped by mud brick. They replaced the mud brick with a small pyramid of brick or stone. It is called a step pyramid because its sides rise in a series of giant steps. He ordered the construction of the largest pyramid ever built. One reason is that the pyramids drew attention to the tombs inside them. 
Grave robbers broke into the tombs to steal the treasure buried with the pharaohs. Egyptians believed that if a tomb was robbed, the person buried there could not have a happy afterlife. This way, the pharaohs hoped to protect their bodies and treasures from robbers. This was to confuse grave robbers about which passage to take. Tombs were supposed to be the palaces of pharaohs in the afterlife. Mourners filled the tomb with objects ranging from food to furniture that the mummified pharaoh would need. Such activities included growing and preparing food, caring for animals, and building boats. Only a secret tomb built for a New Kingdom pharaoh was ever found with much of its treasure untouched. The dazzling riches found in this tomb show how much wealth the pharaohs spent preparing for the afterlife. This period of Egyptian history is called the Middle Kingdom."

In [None]:
import itertools
import re
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Sanity check of the class name later used as a key in `outputs`.
# Referenced through the already-imported `pke` package so this cell does not
# depend on the star import that only happens in a later cell.
pke.unsupervised.MultipartiteRank.__name__

'MultipartiteRank'

In [None]:
from pke.unsupervised import *

# Run each unsupervised pke model on `my_txt` and store, per model name,
# a two-element list: [top-20 keyphrases, top-20 stemmed keyphrases].
outputs = {}
for model in [FirstPhrases, TextRank, SingleRank, TopicRank, PositionRank, MultipartiteRank]:
    extractor = model()
    if model is MultipartiteRank:
        # Candidates must not contain punctuation marks, bracket tokens or
        # English stopwords. The stoplist is handed to `load_document`
        # (pke 2.x API); previously it was built but never passed anywhere.
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        extractor.load_document(input=my_txt, language='en', stoplist=stoplist)
        # Select the longest sequences made of these part-of-speech tags.
        pos = {'VERB', 'ADJ', 'NOUN'}
        extractor.candidate_selection(pos=pos)
        # Build the multipartite graph and rank candidates using random walk;
        # alpha controls the weight adjustment mechanism, see TopicRank for
        # the threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.5,
                                      method='average')
    else:
        extractor.load_document(input=my_txt, language='en')
        # Noun-phrase candidates: optional adjectives followed by nouns.
        extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")
        extractor.candidate_weighting()
    # get_n_best yields (keyphrase, score) pairs; only the keyphrase is kept.
    outputs[model.__name__] = [
        [phrase for phrase, _ in extractor.get_n_best(n=20)],
        [phrase for phrase, _ in extractor.get_n_best(n=20, stemming=True)],
    ]
    


In [None]:
# Show the raw results: model name -> [top keyphrases, stemmed keyphrases].
outputs

{'FirstPhrases': [['nile river',
   'egyptian civilization',
   'hundreds',
   'years',
   'equator',
   'africa',
   'mediterranean sea',
   'delta',
   'area',
   'river ’s mouth',
   'water',
   'fine soil',
   'silt',
   'soil',
   'crops',
   'red land',
   'barren desert',
   'fertile region',
   'birds',
   'annual flood waters'],
  ['nile river',
   'egyptian civil',
   'hundr',
   'year',
   'equat',
   'africa',
   'mediterranean sea',
   'delta',
   'area',
   'river ’s mouth',
   'water',
   'fine soil',
   'silt',
   'soil',
   'crop',
   'red land',
   'barren desert',
   'fertil region',
   'bird',
   'annual flood water']],
 'TextRank': [['ancient egyptian dynasties',
   'ancient egyptians',
   'egyptian society men',
   'other ancient african cultures',
   'many ancient societies',
   'many egyptians',
   'egyptian history',
   'egyptian empire',
   'few egyptians',
   'poorer egyptians',
   'egyptian civilization',
   'new kingdom pharaoh',
   'ancient egypt',
   'egy

In [None]:
my_list = []  # initialised here; nothing in this cell reads it

# One two-column frame per model: "<model>stem" holds the stemmed keyphrases,
# "<model>" holds the unstemmed ones.
per_model_frames = []
for model_name in outputs:
    stemmed_phrases = outputs[model_name][1]
    surface_phrases = outputs[model_name][0]
    per_model_frames.append(
        pd.DataFrame(data={model_name + 'stem': stemmed_phrases, model_name: surface_phrases})
    )

# Start from the first model's frame and join the others onto it, in order,
# aligned on the row index.
if per_model_frames:
    df = per_model_frames[0]
    for frame in per_model_frames[1:]:
        df = df.join(frame)
      

**Data frame of model keyphrases and stemmed keyphrases**

In [None]:
# Render the combined table: a "<model>stem" and a "<model>" column per model.
df

Unnamed: 0,FirstPhrasesstem,FirstPhrases,TextRankstem,TextRank,SingleRankstem,SingleRank,TopicRankstem,TopicRank,PositionRankstem,PositionRank,MultipartiteRankstem,MultipartiteRank
0,nile river,nile river,ancient egyptian dynasti,ancient egyptian dynasties,ancient egyptian dynasti,ancient egyptian dynasties,egyptian civil,egyptian civilization,nile river,nile river,tomb,tombs
1,egyptian civil,egyptian civilization,ancient egyptian,ancient egyptians,ancient egyptian,ancient egyptians,ancient egypt,ancient egypt,ancient egyptian dynasti,ancient egyptian dynasties,templ,temples
2,hundr,hundreds,egyptian societi men,egyptian society men,egyptian societi men,egyptian society men,underground tomb,underground tomb,egyptian civil,egyptian civilization,pharaoh,pharaohs
3,year,years,other ancient african cultur,other ancient african cultures,mani egyptian,many egyptians,pharaoh,pharaohs,ancient egyptian,ancient egyptians,tool call,tool called
4,equat,equator,mani ancient societi,many ancient societies,egyptian empir,egyptian empire,priest,priests,river ’s mouth,river ’s mouth,water,water
5,africa,africa,mani egyptian,many egyptians,few egyptian,few egyptians,water,water,egyptian societi men,egyptian society men,ancient paint,ancient painting
6,mediterranean sea,mediterranean sea,egyptian histori,egyptian history,egyptian civil,egyptian civilization,brick hous,brick houses,mani egyptian,many egyptians,cool,cool
7,delta,delta,egyptian empir,egyptian empire,poorer egyptian,poorer egyptians,small pyramid,small pyramid,few egyptian,few egyptians,flour,flour
8,area,area,few egyptian,few egyptians,egyptian histori,egyptian history,peopl,people,egyptian empir,egyptian empire,creat,created
9,river ’s mouth,river ’s mouth,poorer egyptian,poorer egyptians,ancient egypt,ancient egypt,ruler,ruler,poorer egyptian,poorer egyptians,fertil,fertile
