# Keyword extraction using sentence embeddings

**Source:** https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea 

## Imports

In [1]:
import sys
import json
import numpy as np
import itertools

In [2]:
# Make the project's web_serv directory importable. The path is relative, so it is
# resolved against the current working directory of the notebook kernel.
sys.path.append("../web_serv/")
# NOTE(review): the star import hides which names come from embed_picker. This notebook
# uses save_embeddings, load_embeddings, closest_embeddings and spread_similar_embeddings
# without defining them — presumably they all come from this module; confirm.
from embed_picker import *

[{'keyword': 'left_ideal'}, {'keyword': 'noetherian'}, {'keyword': 'noetherian'}, {'keyword': 'polynomial_ring'}, {'keyword': 'Noetherian_ring'}]
[{'keyword': 'Noetherian_ring'}, {'keyword': 'Category:en:Polynomials'}, {'keyword': 'Alexander_polynomial'}, {'keyword': 'two-sided_ideal'}, {'keyword': 'ring_homomorphism'}]


## Utils

In [3]:
def file_to_json(file_name: str, extension: str, field_name: str) -> None:
    """
    Convert a text file to a json array by reading in each line of the file
    `file_name + extension` and turning it into a json object with a single
    field named `field_name`.

    Each line is stripped of leading and trailing whitespace (including the
    newline). Blank lines are kept and become objects whose field is the
    empty string.

    The final output is written to `file_name + '.json'`, i.e. a file with the
    same base name as the input but with the `.json` extension. If `extension`
    is itself `'.json'`, the input file is overwritten.

    Both files are opened as UTF-8 so that the result does not depend on the
    platform's default encoding.

    Parameters:
        file_name: path of the input file without its extension.
        extension: extension of the input file, including the dot (e.g. `'.txt'`).
        field_name: name of the single field in every generated json object.

    Raises:
        OSError: if the input cannot be opened or the output cannot be written.
        UnicodeDecodeError: if the input file is not valid UTF-8.
    """
    # Read everything first and close the input before the output is opened,
    # so a failure while reading never leaves a truncated output file behind.
    with open(file_name + extension, 'r', encoding='utf-8') as f:
        objs = [{field_name: line.strip()} for line in f]

    with open(file_name + '.json', 'w', encoding='utf-8') as out:
        json.dump(objs, out)

In [4]:
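# One-off preprocessing step: uncomment to rebuild
# ../data/wiktionary_mathematics_keywords.json from the .txt keyword list.
# file_to_json("../data/wiktionary_mathematics_keywords", ".txt", "keyword")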

In [5]:
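# One-off step: uncomment to run save_embeddings on the keyword JSON file.
# NOTE: `model` is only assigned in the "Parameters" section below, so run that cell
# first; otherwise this line raises NameError.
# save_embeddings("../data/wiktionary_mathematics_keywords.json", "keyword", model)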

## Parameters

In [7]:
# Model identifier passed to every embed_picker helper used in this notebook.
# NOTE(review): presumably a sentence-transformers model name — confirm in embed_picker.
model = "all-MiniLM-L6-v2"

In [8]:
# Load the keyword data for the "keyword" field of the JSON file; the two returned values
# are passed together to the search helpers below.
# NOTE(review): presumably `embs` holds the embeddings and `kwds` the matching keyword
# entries, in the same order — confirm against load_embeddings.
embs, kwds = load_embeddings("../data/wiktionary_mathematics_keywords.json", "keyword", model)

In [9]:
# Passed to both search helpers below; each recorded output lists 5 keywords, so this is
# presumably the number of results to return.
n_top = 5

In [14]:
# Only used by spread_similar_embeddings.
# NOTE(review): presumably the size of the candidate pool from which the n_top results
# are chosen — confirm in embed_picker.
n_candidate = 10

## Experiments

In [30]:
# Example mathematical statement used as the query text for both keyword searches below.
statement = "Every convergent sequence is a Cauchy sequence, and the converse is true for real numbers, and this means that the topological space of the real numbers is complete."

In [31]:
# Query the loaded keywords with `statement`; the recorded output is a list of
# {'keyword': ...} dicts with n_top entries.
# NOTE(review): presumably these are the n_top nearest keywords by embedding similarity —
# confirm the metric in embed_picker.
closest_embeddings(statement, model, embs, kwds, n_top)

[{'keyword': 'Cauchy_sequence'},
 {'keyword': 'topologicality'},
 {'keyword': 'Cauchy_space'},
 {'keyword': 'final_topology'},
 {'keyword': 'completeness_axiom'}]

In [32]:
# Same query as above, but through the variant that also takes n_candidate; the recorded
# output again has n_top entries, partly different from the closest_embeddings result.
# NOTE(review): presumably selects n_top mutually dissimilar keywords out of the
# n_candidate nearest ones — confirm in embed_picker.
spread_similar_embeddings(statement, model, embs, kwds, n_top, n_candidate)

[{'keyword': 'completeness_axiom'},
 {'keyword': 'topological'},
 {'keyword': 'Cauchy_sequence'},
 {'keyword': 'complete_measure'},
 {'keyword': 'topologicality'}]