This notebook was run on a Google Cloud Workbench Jupyter notebook instance with a Free Tier n2-highmem-8 (8 vCPUs, 64 GB RAM) machine.

## Table of Contents
* [pke](#pke)
* [keyVD](#keyVD)
* [computation times](#timeit)

### Note

We will predict keywords with all of our models. Our benchmark is built using https://github.com/boudinfl/pke/

The prediction cells can be re-run if they fail for any reason; they will continue from where they left off.

In [1]:
%cd /home/jupyter
!pip install -r requirements.txt
%cd /home/jupyter/notebooks

/home/jupyter
/home/jupyter/notebooks


In [2]:
import logging
import warnings

# Keep the notebook output readable: drop every Python warning and only let
# log records of level ERROR or above through the root logger.
warnings.simplefilter('ignore')
logging.root.setLevel(logging.ERROR)

In [3]:
from google.cloud import bigquery
import pandas
import json
import time

# Read the notebook settings (the USE_GCP switch plus project / dataset / table
# identifiers) from the JSON setup file.
setup_path = '/home/jupyter/setup.json'
with open(setup_path, 'r') as setup_file:
    variables = json.load(setup_file)

# A BigQuery client is only created when USE_GCP is anything but the string
# "false"; otherwise `bq_client` is left undefined.
if not variables["USE_GCP"] == "false":
    bq_client = bigquery.Client(project=variables['PROJECT_ID'], location=variables['REGION'])

In [4]:
# NLTK helpers (stemmer, tokenizer, stop words) and the prediction cut-offs.
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

try:
    remove_words = stopwords.words()
except LookupError:
    # The stop-word corpus is loaded lazily and raises LookupError when it has
    # not been downloaded yet. Only that case triggers the download, so any
    # other failure (for example NLTK not being installed) surfaces as-is.
    nltk.download('stopwords')
    nltk.download('punkt')
    remove_words = stopwords.words()

stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')

# Numbers of keywords to predict per document (one `predictions_<top>` column each).
tops = [5, 10, 20]

In [5]:
# Appending predictions requires the predictions table (CSV file or BigQuery
# table) to already exist with its columns, so it is created here when missing.
# The placeholder is a single row of empty strings, one per column.
placeholder = {"model_id": [''], "dataset_id": [''], "text_id": ['']}
placeholder.update({f"predictions_{top}": [''] for top in tops})
placeholder = pandas.DataFrame(placeholder)

if variables["USE_GCP"] == "false":
    predictions_csv = f"{variables['TABLE_PREDICTIONS_ID']}.csv"
    try:
        pandas.read_csv(predictions_csv, sep=",")
    except (FileNotFoundError, pandas.errors.EmptyDataError):
        # Missing (or completely empty) file: write the header and placeholder row.
        placeholder.to_csv(predictions_csv, sep=",", index=False, header=True)
else:
    from google.cloud.exceptions import NotFound

    predictions_table = f"{variables['PROJECT_ID']}.{variables['DATASET_ID']}.{variables['TABLE_PREDICTIONS_ID']}"
    try:
        # Probe the table directly. The previous probe query interpolated
        # `model_id`, which is not defined yet on a fresh kernel, so it always
        # failed and a placeholder row was appended even to an existing table.
        bq_client.get_table(predictions_table)
    except NotFound:
        job = bq_client.load_table_from_dataframe(
            dataframe=placeholder,
            destination=predictions_table
        )
        job.result()

# pke<a class="anchor" id="pke"></a>

To compare our model, we'll use the open-source [Python Keyphrase Extraction (pke)](https://github.com/boudinfl/pke) toolkit.

In [6]:
!pip install git+https://github.com/boudinfl/pke.git
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

Collecting git+https://github.com/boudinfl/pke.git
  Cloning https://github.com/boudinfl/pke.git to /var/tmp/pip-req-build-oqf1_0yx
  Running command git clone --filter=blob:none --quiet https://github.com/boudinfl/pke.git /var/tmp/pip-req-build-oqf1_0yx
  Resolved https://github.com/boudinfl/pke.git to commit 69871ffdb720b83df23684fea53ec8776fd87e63
  Preparing metadata (setup.py) ... [?25ldone
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting fr-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.6.0/fr_core_news_sm-3.6.0-py3-none-any.whl (1

In [7]:
# Import the benchmark models by name instead of with a star import, so it is
# clear where each name used in this notebook comes from.
from pke.unsupervised import MultipartiteRank, TfIdf, TopicRank, YAKE
import spacy

# pke extractor classes that are benchmarked; each is instantiated per document.
models = [MultipartiteRank, TfIdf, YAKE, TopicRank]

In [None]:
# Predict keywords with every pke model. For each model only the inputs that
# have no stored prediction yet are processed, so this cell can be re-run to
# resume after an interruption.
use_gcp = variables["USE_GCP"] != "false"
predictions_table = f"{variables['PROJECT_ID']}.{variables['DATASET_ID']}.{variables['TABLE_PREDICTIONS_ID']}"
predictions_csv = f"{variables['TABLE_PREDICTIONS_ID']}.csv"
key_columns = ["dataset_id", "text_id"]

# Loaded spaCy pipelines, keyed by name: each pipeline is loaded once and
# reused instead of being reloaded for every document.
spacy_models = {}

results = pandas.DataFrame()

for model in models:
    # The class name identifies the model in the predictions table.
    model_id = model.__name__

    # We only take the remaining inputs to predict for each model.
    if use_gcp:
        documents = bq_client.query(f"""
            SELECT * FROM `{variables['PROJECT_ID']}.{variables['DATASET_ID']}.{variables['TABLE_INPUT_ID']}`
            WHERE CONCAT(dataset_id, text_id) NOT IN 
            (SELECT CONCAT(dataset_id, text_id) FROM `{predictions_table}` WHERE CAST(model_id AS STRING) = '{model_id}')
        """).to_dataframe()
    else:
        predicted = pandas.read_csv(predictions_csv, sep=",")
        predicted = predicted.loc[predicted["model_id"] == model_id, key_columns].drop_duplicates()
        documents = pandas.read_csv(f"{variables['TABLE_INPUT_ID']}.csv", sep=",")
        # Anti-join on the document key. De-duplicating the predicted keys keeps
        # one output row per input document, and filtering the merged frame
        # itself avoids applying a mask built on a different index.
        documents = documents.merge(predicted, on=key_columns, how="left", indicator=True)
        documents = documents[documents["_merge"] == "left_only"].drop(columns="_merge")

    rows = []  # predictions buffered since the last snapshot

    for position, index in enumerate(documents.index, start=1):
        dataset_id = documents.loc[index, 'dataset_id']
        text_id = documents.loc[index, 'text_id']
        text_input = documents.loc[index, 'input']

        # 'termith-eval' is processed with the French pipeline, everything else with the English one.
        if dataset_id == 'termith-eval':
            language, spacy_name = "fr", "fr_core_news_sm"
        else:
            language, spacy_name = "en", "en_core_web_sm"
        if spacy_name not in spacy_models:
            spacy_models[spacy_name] = spacy.load(spacy_name)

        extractor = model()
        extractor.load_document(input=text_input, language=language, spacy_model=spacy_models[spacy_name])
        # Selection and weighting take no argument that depends on `top`, so
        # they run once per document; only the cut-off changes below.
        extractor.candidate_selection()
        extractor.candidate_weighting()

        row = {'model_id': model_id, 'dataset_id': dataset_id, 'text_id': text_id}
        for top in tops:
            # Keep the order returned by get_n_best. Passing the list through
            # set(), as was done before, discarded any ordering.
            predictions = extractor.get_n_best(n=top)
            row[f"predictions_{top}"] = ';'.join(candidate for candidate, _ in predictions)
        rows.append(row)

        # We take snapshots every 100 inputs or when the model has finished predicting all inputs.
        if position % 100 == 0 or position == len(documents):
            results = pandas.DataFrame(rows)
            if use_gcp:
                job = bq_client.load_table_from_dataframe(
                    dataframe=results,
                    destination=predictions_table
                )
                job.result()
            else:
                results.to_csv(predictions_csv, sep=",", mode='a', index=False, header=False)
            rows = []
            results = pandas.DataFrame()

# keyVD<a class="anchor" id="keyVD"></a>

In [None]:
# keyVD is imported from /home/jupyter, so change directory for the import and
# then go back to the notebooks directory.
# NOTE(review): only `KeyVD` is referenced by name in this notebook, but the
# timing cells also use `numpy`, which no visible cell imports explicitly. It
# may be reaching the namespace through this star import; confirm before
# narrowing it to `from keyVD import KeyVD`.
%cd /home/jupyter/
from keyVD import *
%cd /home/jupyter/notebooks

In [None]:
# Predict keywords with keyVD for every input that has no keyVD prediction yet.
# Rows are stored under the model id 'keyVD_<subset_id>', one row per
# (document, vocabulary subset) pair.
model_id = 'keyVD'
use_gcp = variables["USE_GCP"] != "false"
predictions_table = f"{variables['PROJECT_ID']}.{variables['DATASET_ID']}.{variables['TABLE_PREDICTIONS_ID']}"
predictions_csv = f"{variables['TABLE_PREDICTIONS_ID']}.csv"
key_columns = ["dataset_id", "text_id"]

if use_gcp:
    # Stored ids are 'keyVD_<subset_id>', so match on the prefix. An equality
    # test against 'keyVD' never matches and nothing would be skipped on re-run.
    documents = bq_client.query(f"""
        SELECT * FROM `{variables['PROJECT_ID']}.{variables['DATASET_ID']}.{variables['TABLE_INPUT_ID']}`
        WHERE CONCAT(dataset_id, text_id) NOT IN 
        (SELECT CONCAT(dataset_id, text_id) FROM `{predictions_table}` WHERE STARTS_WITH(CAST(model_id AS STRING), '{model_id}_'))
    """).to_dataframe()
    outputs = bq_client.query(f"""
        SELECT * FROM `{variables['PROJECT_ID']}.{variables['DATASET_ID']}.{variables['TABLE_OUTPUT_ID']}`
    """).to_dataframe()
else:
    predicted = pandas.read_csv(predictions_csv, sep=",")
    is_keyvd = predicted["model_id"].astype(str).str.startswith(f"{model_id}_")
    # A processed document has one stored row per subset. De-duplicating the
    # keys keeps the anti-join at one output row per input document.
    predicted = predicted.loc[is_keyvd, key_columns].drop_duplicates()
    documents = pandas.read_csv(f"{variables['TABLE_INPUT_ID']}.csv", sep=",")
    documents = documents.merge(predicted, on=key_columns, how="left", indicator=True)
    documents = documents[documents["_merge"] == "left_only"].drop(columns="_merge")
    outputs = pandas.read_csv(f"{variables['TABLE_OUTPUT_ID']}.csv", sep=",")

# One ';'-joined vocabulary string per subset, built from a single load of the
# output table.
vocabularies = {
    subset_id: ";".join(list(outputs.loc[outputs["subset_id"] == subset_id, "output"].values))
    for subset_id in ['author', 'reader', 'controlled', 'uncontrolled', 'indexer']
}

# Vocabulary subsets to predict with for each dataset.
subsets_by_dataset = {
    'semeval': ['author', 'reader'],
    'inspec': ['controlled', 'uncontrolled'],
    'termith-eval': ['indexer'],
}

rows = []  # predictions buffered since the last snapshot

for position, index in enumerate(documents.index, start=1):
    dataset_id = documents.loc[index, 'dataset_id']
    text_id = documents.loc[index, 'text_id']
    text_input = documents.loc[index, 'input']

    # A dataset without a subset mapping is skipped rather than silently
    # reusing the subsets of the previous document.
    for subset_id in subsets_by_dataset.get(dataset_id, []):
        generator = KeyVD()
        generator.load_vocabulary(vocabulary=vocabularies[subset_id])
        generator.load_text(text=text_input)

        row = {'model_id': model_id + '_' + subset_id, 'dataset_id': dataset_id, 'text_id': text_id}
        for top in tops:
            predictions = generator.keywords_generation(n_keys=top)
            row[f"predictions_{top}"] = ';'.join(predictions)
        rows.append(row)

    # We take snapshots every 100 inputs or when the model has finished predicting all inputs.
    if rows and (position % 100 == 0 or position == len(documents)):
        results = pandas.DataFrame(rows)
        if use_gcp:
            job = bq_client.load_table_from_dataframe(
                dataframe=results,
                destination=predictions_table
            )
            job.result()
        else:
            results.to_csv(predictions_csv, sep=",", mode='a', index=False, header=False)
        rows = []

# computation times<a class="anchor" id="timeit"></a>

In [76]:
%cd /home/jupyter/notebooks
documents = pandas.read_csv(f"{variables['TABLE_INPUT_ID']}.csv", sep=",")
documents[documents['dataset_id'] == 'inspec']
vocabulary = vocabularies['controlled']
spacy_model = spacy.load("en_core_web_sm")
language = "en"
top = 5
n = 0

/home/jupyter/notebooks


In [77]:
%%timeit -n 1 -r 10
# Times a full KeyVD run (vocabulary load, text load, keyword generation) on one document.
# The seed is reset to `n` on every run, so all 10 runs draw the same `index`.
numpy.random.seed(n)
index = numpy.random.randint(0, len(documents))
text_input = documents.loc[index, 'input']

generator = KeyVD()
generator.load_vocabulary(vocabulary=vocabulary)
generator.load_text(text=text_input)
predictions = generator.keywords_generation(n_keys=top)

202 ms ± 23.6 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [78]:
%%timeit -n 1 -r 10
# Times a full TopicRank run (document load, candidate selection, weighting, top-n) on one document.
# The seed is reset to `n` on every run, so all 10 runs draw the same `index`.
numpy.random.seed(n)
index = numpy.random.randint(0, len(documents))
text_input = documents.loc[index, 'input']

extractor = TopicRank()
extractor.load_document(input=text_input, language=language, spacy_model=spacy_model)
extractor.candidate_selection()
extractor.candidate_weighting()
predictions = extractor.get_n_best(n=top)

135 ms ± 21.1 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [79]:
%%timeit -n 1 -r 10
# Times a full YAKE run (document load, candidate selection, weighting, top-n) on one document.
# The seed is reset to `n` on every run, so all 10 runs draw the same `index`.
numpy.random.seed(n)
index = numpy.random.randint(0, len(documents))
text_input = documents.loc[index, 'input']

extractor = YAKE()
extractor.load_document(input=text_input, language=language, spacy_model=spacy_model)
extractor.candidate_selection()
extractor.candidate_weighting()
predictions = extractor.get_n_best(n=top)

134 ms ± 1.5 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [80]:
%%timeit -n 1 -r 10
# Times a full MultipartiteRank run (document load, candidate selection, weighting, top-n) on one document.
# The seed is reset to `n` on every run, so all 10 runs draw the same `index`.
numpy.random.seed(n)
index = numpy.random.randint(0, len(documents))
text_input = documents.loc[index, 'input']

extractor = MultipartiteRank()
extractor.load_document(input=text_input, language=language, spacy_model=spacy_model)
extractor.candidate_selection()
extractor.candidate_weighting()
predictions = extractor.get_n_best(n=top)

129 ms ± 1.39 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [81]:
%%timeit -n 1 -r 10
# Times a full TfIdf run (document load, candidate selection, weighting, top-n) on one document.
# The seed is reset to `n` on every run, so all 10 runs draw the same `index`.
numpy.random.seed(n)
index = numpy.random.randint(0, len(documents))
text_input = documents.loc[index, 'input']

extractor = TfIdf()
extractor.load_document(input=text_input, language=language, spacy_model=spacy_model)
extractor.candidate_selection()
extractor.candidate_weighting()
predictions = extractor.get_n_best(n=top)

1.31 s ± 10.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)
