# Keyword extraction using sentence embeddings

**Source:** https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea 

## Imports

In [1]:
import sys
import json
import numpy as np
import itertools
import yake

In [2]:
# Make modules in ../web_serv/ importable. The path is relative, so it is resolved against
# the kernel's current working directory.
sys.path.append("../web_serv/")
# NOTE(review): star imports hide where names come from. The helpers used below that are not
# defined in this notebook (load_embeddings, save_embeddings, closest_embeddings,
# spread_similar_embeddings, models) presumably come from these two modules — confirm and
# import them explicitly.
from embed_picker import *
from keyword_extractor import *

## Utils

In [14]:
def file_to_json(file_name: str, extension: str, field_name: str) -> None:
    """
    Convert a text file into a JSON array with one object per input line.

    Every line of `file_name + extension` is stripped of surrounding whitespace and wrapped
    in an object with the single field `field_name`. The resulting list is written to
    `file_name + '.json'`.

    Parameters
    ----------
    file_name : str
        Path of the input file WITHOUT its extension; also the stem of the output file.
    extension : str
        Extension of the input file, including the leading dot (e.g. ".txt").
    field_name : str
        Key under which each line is stored in its JSON object.

    Notes
    -----
    Blank lines are kept and become objects whose field is the empty string.
    Raises whatever `open` raises (e.g. FileNotFoundError) if the input is missing.
    """
    # Read explicitly as UTF-8: the output is written as UTF-8 with ensure_ascii=False, so
    # relying on the platform default encoding here could mis-decode non-ASCII keywords.
    with open(file_name + extension, 'r', encoding="utf-8") as f:
        objs = [{field_name: line.strip()} for line in f]

    # The input is fully read and closed before the output is opened, so this stays safe
    # even when `extension` is '.json' and both paths are the same file.
    with open(file_name + '.json', 'w', encoding="utf-8") as out:
        json.dump(objs, out, ensure_ascii=False)

In [15]:
#file_to_json("../data/wiktionary_mathematics_keywords", ".txt", "keyword")

In [16]:
# Disabled one-off snippet that rewrites the keyword file without duplicate lines.
# It is kept as a bare string literal, so nothing runs; the cell only echoes the string.
# NOTE(review): if re-enabled, set() does not preserve the original line order and the
# file is overwritten in place — prefer real `#` comments or delete the cell.
'''
with open("../data/wiktionary_mathematics_keywords.txt", 'r') as f:
    kwds = f.readlines()
    kwds_ = [*set(kwds)]
    
with open("../data/wiktionary_mathematics_keywords.txt", 'w') as f:
    f.writelines(kwds_)   
'''

'\nwith open("../data/wiktionary_mathematics_keywords.txt", \'r\') as f:\n    kwds = f.readlines()\n    kwds_ = [*set(kwds)]\n    \nwith open("../data/wiktionary_mathematics_keywords.txt", \'w\') as f:\n    f.writelines(kwds_)   \n'

In [17]:
#save_embeddings("../data/wiktionary_mathematics_keywords.json", "keyword", model)

## Parameters

In [18]:
# Name of the embedding model: passed to the embedding helpers and used as the key into `models` below.
# NOTE(review): presumably a sentence-transformers checkpoint name — confirm against embed_picker.
model = "all-MiniLM-L6-v2"

In [19]:
# Load `embs` and `kwds` for `model` from the keyword JSON file (field "keyword").
# NOTE(review): presumably `embs` holds one embedding per entry of `kwds`, and the file must
# first have been processed by save_embeddings (see the commented-out call) — confirm.
embs, kwds = load_embeddings("../data/wiktionary_mathematics_keywords.json", "keyword", model)

In [20]:
# Number of keywords requested from the extraction helpers below (each recorded result lists 5).
n_top = 5

In [21]:
# Size of the candidate pool passed to spread_similar_embeddings (its recorded output lists 10 candidates).
n_candidate = 10

## Experiments

In [22]:
# Input text for which keywords are extracted below.
# NOTE(review): "Every " looks truncated, while the recorded outputs are all group-related —
# confirm the intended statement before re-running.
statement = "Every "

In [23]:
# Keywords returned for `statement`; presumably the n_top keywords whose embeddings are most
# similar to the statement's embedding — confirm against keyword_extractor.
closest_embeddings(statement, model, embs, kwds, n_top)

[{'keyword': 'simple_group'},
 {'keyword': 'cyclic_group'},
 {'keyword': 'group_theory'},
 {'keyword': 'subgroup'},
 {'keyword': 'group'}]

In [24]:
# Select n_top keywords out of n_candidate candidates. The helper prints the candidate and
# selected keywords with their scores and returns the selection (see the recorded output).
# NOTE(review): presumably trades raw similarity for diversity among the picks — confirm.
spread_similar_embeddings(statement, model, embs, kwds, n_top, n_candidate)

Candidate keywords:  [{'keyword': 'trivial_group'}, {'keyword': 'normal_subgroup'}, {'keyword': 'cogroup'}, {'keyword': 'group_object'}, {'keyword': 'group_ring'}, {'keyword': 'simple_group'}, {'keyword': 'cyclic_group'}, {'keyword': 'group_theory'}, {'keyword': 'subgroup'}, {'keyword': 'group'}]
Candidate keyword scores:  [0.5301939, 0.5374895, 0.5400141, 0.54497665, 0.546558, 0.57228845, 0.59932315, 0.6401416, 0.66509676, 0.6800729]
Selected keywords:  [{'keyword': 'group'}, {'keyword': 'normal_subgroup'}, {'keyword': 'cyclic_group'}, {'keyword': 'group_object'}, {'keyword': 'cogroup'}]
Selected keyword scores:  [0.6800729, 0.5374895, 0.59932315, 0.54497665, 0.5400141]


[{'keyword': 'group'},
 {'keyword': 'normal_subgroup'},
 {'keyword': 'cyclic_group'},
 {'keyword': 'group_object'},
 {'keyword': 'cogroup'}]

## More experiments: embedding dot products between related terms

In [27]:
# Dot product between the embeddings of "finite" and its noun form "finiteness".
np.dot(*[models[model].encode(text) for text in ("finite", "finiteness")])

0.87796074

In [28]:
# Dot product between the embeddings of "finite" and its adverb form "finitely".
np.dot(*[models[model].encode(text) for text in ("finite", "finitely")])

0.9015876

In [29]:
# Dot product between the embeddings of "finite" and the compound term "finite sum".
np.dot(*[models[model].encode(text) for text in ("finite", "finite sum")])

0.53661925

In [30]:
# Dot product between the embeddings of "finite" and the compound term "finite group".
np.dot(*[models[model].encode(text) for text in ("finite", "finite group")])

0.57394695