## Possible keyword extraction algorithms

__Shown keyword extraction algorithms:__
- __Graph based:__
    - TopicRank
    - PositionRank
    - SingleRank
    - TopicalPageRank
- __Statistical:__
    - Yake

In [2]:
import pke  # Package containing several keyword extractors
import os 
import string 
from nltk.corpus import stopwords

In [5]:
# create data files
def create_data_files(major_dir: str):
    """Merge the ``.txt`` files of each sub-directory of ``major_dir``.

    For every sub-directory ``<name>`` the contents of its ``.txt`` files are
    concatenated (each followed by a newline) and written to
    ``<major_dir>/<name>.txt``. The merged texts of all sub-directories are
    additionally joined with newlines and written to ``<major_dir>/all.txt``.

    Entries of ``major_dir`` that are not directories are skipped, so files
    generated by a previous run (``<name>.txt``, ``all.txt``) do not produce
    empty ``*.txt.txt`` files or empty parts in ``all.txt`` on a re-run.
    Directory listings are sorted to make the merge order deterministic.

    Args:
        major_dir: Path of the directory whose sub-directories are merged.

    Returns:
        None. The results are written to disk.
    """
    all_together = []
    for sub_dir in sorted(os.listdir(major_dir)):
        sub_dir_path = os.path.join(major_dir, sub_dir)
        if not os.path.isdir(sub_dir_path):
            # Plain files (including earlier outputs of this function) carry
            # no documents to merge.
            continue

        parts = []
        for file_name in sorted(os.listdir(sub_dir_path)):
            if not file_name.endswith(".txt"):
                continue
            # `with` closes every input handle right after it has been read.
            with open(os.path.join(sub_dir_path, file_name), "r") as handle:
                parts.append(handle.read() + "\n")
        sub_dir_text = "".join(parts)

        all_together.append(sub_dir_text)
        with open(os.path.join(major_dir, sub_dir + ".txt"), "w") as out:
            out.write(sub_dir_text)

    with open(os.path.join(major_dir, "all.txt"), "w") as all_out:
        all_out.write("\n".join(all_together))
    
# create_data_files(major_dir="../data/datasets/")

In [6]:
# Read the three topic text files into memory. The `with` blocks close each
# file handle as soon as its content has been read.
with open("../data/datasets/clustering.txt", "r") as clustering_file:
    clustering_data = clustering_file.read()
with open("../data/datasets/prediction.txt", "r") as prediction_file:
    prediction_data = prediction_file.read()
with open("../data/datasets/frequent_pattern_mining.txt", "r") as pattern_file:
    pattern_data = pattern_file.read()

### Topic Rank
__Keyword Candidates:__ <br/>
- Sequences of Nouns and adjectives
- Division into topics -> The overlap between candidates is used for topic creation with Hierarchical Agglomerative Clustering

__Scoring:__<br/>
- TextRank Scoring 

__Final selection:__
- Multiple possibilities [First in Topic, Frequency of candidate, Centroid]

In [5]:
# Part-of-speech tags allowed in candidates (nouns, proper nouns, adjectives).
pos = {'NOUN', 'PROPN', 'ADJ'}
# Stoplist handed to the candidate selection: punctuation characters followed
# by NLTK's English stopwords.
stoplist = [*string.punctuation, *stopwords.words('english')]

topic_rank = pke.unsupervised.TopicRank()
topic_rank.load_document(
    input='../data/datasets/merged_files/clustering.txt',
    language="en",
)
topic_rank.candidate_selection(pos=pos, stoplist=stoplist)
topic_rank.candidate_weighting(
    threshold=0.74,
    method='average',
    heuristic="frequent",
)

# Print the 20 best-ranked (keyphrase, score) tuples, one per line.
topic_rank_keyphrases = topic_rank.get_n_best(n=20)
for keyphrase in topic_rank_keyphrases:
    print(keyphrase)

('abstract attribute clustering', 0.07264453569892483)
('abstract highdimensional data streams', 0.02765963626368808)
('approach similar', 0.02128196996686105)
('appropriate groups', 0.018928294256216952)
('abstract object set', 0.018589782988967022)
('abstract cluster analysis', 0.01681287419785202)
('abstract privacy preserving data mining', 0.013131806998692127)
('different parameter setting', 0.011291084984061575)
('account possible differences', 0.011034804679302686)
('introduction', 0.009827086386374377)
('certain number', 0.00865657843214086)
('classical pattern recognition problem', 0.007278997676732894)
('abstract unsupervised learning', 0.007163383361746941)
('arbitrary process', 0.007151409611048128)
('analysis data clustering', 0.0071386361384713545)
('betweengroup dissimilarity', 0.006991440551768367)
('abstract hierarchical clustering', 0.006574590709327781)
('fundamental machine learning', 0.006441595077420247)
('algorithm', 0.006397327735959678)
('data noise', 0.0063257

### PositionRank
__Scoring:__<br/>
Uses the PageRank algorithm while including the position of the keyword in the text in the weighting scheme.<br/>
The earlier a term occurs, the more relevant it is assumed to be. <br/><br/>
The inverse positions of all occurrences of the keyword are summed up: <br/><br/>
For a word occurring on the $ 2^{nd},\:5^{th} $ and $ 10^{th} $ position in the text: <br/>
$$ W_{Position} = \frac{1}{2}+\frac{1}{5}+\frac{1}{10} = 0.8 $$

In [6]:
position_rank = pke.unsupervised.PositionRank()
position_rank.load_document(
    input='../data/datasets/merged_files/clustering.txt',
    language="en",
    normalization=None,
)

# maximum_word_number=1 keeps single-word candidates only.
position_rank.candidate_selection(grammar=None, maximum_word_number=1)
# NOTE: `pos` is not defined in this cell; it is the tag set created in the
# TopicRank cell and must already exist in the kernel.
position_rank.candidate_weighting(window=5, pos=pos, normalized=True)

# Print the 20 best-ranked (keyphrase, score) tuples, one per line.
position_rank_keyphrases = position_rank.get_n_best(n=20)
for keyphrase in position_rank_keyphrases:
    print(keyphrase)

('clustering', 0.05415698811242924)
('data', 0.04531772410423927)
('cluster', 0.025506177473485103)
('analysis', 0.02208116875796451)
('clusters', 0.01720518069564107)
('similarity', 0.012921219980310828)
('study', 0.012123983623622986)
('objects', 0.009875554685460523)
('hierarchical', 0.008468824608984734)
('groups', 0.008409468248675127)
('measure', 0.007498115027469089)
('introduction', 0.00731539003028165)
('learning', 0.0072341393293188895)
('method', 0.006577973529407398)
('technique', 0.0062962157904784015)
('number', 0.005802020729141968)
('pattern', 0.005786295272576257)
('distance', 0.005358456819104482)
('process', 0.005299947407788508)
('image', 0.004856405066303605)


### YAKE
__Scoring:__<br/>
Five different measures are used for term weighting (the smaller the overall score, the more relevant the keyword):<br/>
<ol>
    <li><b>Casing:</b><br/>...Weights words with only upper case letters or starting with upper case letters more</li>
    <li><b>Word position:</b><br/>...Words occurring in the first words of a document/sentence are assumed to be more relevant</li>
    <li><b>Word frequency:</b><br/>...Frequency of the word relative to the mean frequency plus one standard deviation; more frequent words are considered more relevant</li>
    <li><b>Word relatedness to context:</b><br/>...Evaluates how strongly the term behaves like a stopword</li>
    <li><b>Word DifSentence:</b><br/>...Measures the relative sentence occurrence of terms</li>
</ol>
Formulas used:
<ol>
    <li>$ W_{Case} = \frac{max(TF(U(w)), TF(A(w)))}{log_2(TF(w))} $</li>
    <li>$ W_{Position} = log_2(log_2(2+Median(Sen_w))) $</li>
    <li>$ W_{Freq} = \frac{TF(w)}{MeanTF+1*\sigma} $</li>
    <li>$ W_{Rel} = \Big(0.5+\Big(\Big(WL*\frac{TF(w)}{MaxTf}\Big)+PL\Big)\Big)+                 \Big(0.5+\Big(\Big(WR*\frac{TF(w)}{MaxTf}\Big)+PR\Big)\Big) $</li>
    <li>$ W_{DifSentence} = \frac{SF(w)}{\#_{Sentences}} $</li>
</ol>

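__Resulting final keyword score:__ <br/>

The five measures are first combined into one score per word:

$$ S(w) = \frac{W_{Rel}*W_{Position}}{W_{Case}+\frac{W_{Freq}}{W_{Rel}}+\frac{W_{DifSentence}}{W_{Rel}}} $$

The score of a keyword $kw$ is then computed from the scores of its words:

$$ S(kw) = \frac{\prod_{w\:\in\:kw}S(w)}{TF(kw)*(1+\sum_{w\:\in\:kw}S(w))}$$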

In [19]:
yake = pke.unsupervised.YAKE()
yake.load_document(
    input='../data/datasets/clustering.txt',
    language="en",
    normalization=None,
)

# NLTK's English stopwords, passed to both candidate selection and weighting.
# NOTE: this rebinds `stoplist`, replacing the punctuation-extended list built
# in the TopicRank cell.
stoplist = stopwords.words('english')
yake.candidate_selection(n=1, stoplist=stoplist)
yake.candidate_weighting(window=5, stoplist=stoplist, use_stems=False)

# Value forwarded as `threshold` to get_n_best.
threshold = 0.8
yake_keyphrases = yake.get_n_best(n=20, threshold=threshold)

# Print the 20 best-ranked (keyphrase, score) tuples, one per line.
for keyphrase in yake_keyphrases:
    print(keyphrase)

('clustering', 5.2339059914055286e-05)
('data', 8.551312067507064e-05)
('cluster', 0.0002047752055714894)
('analysis', 0.0002816288755117967)
('objects', 0.0005342390488141243)
('set', 0.0007304004780160237)
('groups', 0.0008261452005597773)
('different', 0.0008509194617797423)
('similar', 0.0009085692575871795)
('introduction', 0.0009514261873288945)
('unsupervised', 0.0009619977156120488)
('one', 0.0009743913004332281)
('learning', 0.0010554209851798351)
('mining', 0.0010890941142609669)
('used', 0.0011428508306604828)
('technique', 0.0012973797216807483)
('important', 0.0013195233469502265)
('similarity', 0.0013518593699869592)
('method', 0.0015462505076103267)
('number', 0.0018750698777329211)


### SingleRank

In [10]:
single_rank = pke.unsupervised.SingleRank()
single_rank.load_document(
    input='../data/datasets/clustering.txt',
    language="en",
    normalization=None,
)

# NOTE: `pos` is not defined in this cell; it is the tag set created in the
# TopicRank cell and must already exist in the kernel.
single_rank.candidate_selection(pos=pos)
single_rank.candidate_weighting(window=4, pos=pos, normalized=True)

# Print the 20 best-ranked (keyphrase, score) tuples, one per line.
single_rank_keyphrases = single_rank.get_n_best(n=20)
for keyphrase in single_rank_keyphrases:
    print(keyphrase)

('clustering clustering', 0.0417691440080825)
('clustering', 0.041542364008082505)
('data clustering', 0.03848422204952038)
('data', 0.03541291009095827)
('analysis data clustering', 0.03187077387674159)
('clustering technique clustering', 0.029634409758496426)
('clustering algorithm clustering', 0.02957903880865547)
('cluster analysis data clustering', 0.02894413973752938)
('statistical clustering clustering', 0.02871830232981528)
('soft clustering clustering', 0.028481260829131248)
('clustering overview clustering', 0.02826530338314274)
('unsupervised data clustering', 0.028171486610169192)
('method data clustering', 0.027908073749125382)
('clustering paradigm clustering', 0.0278543916333915)
('introduction data clustering', 0.02742781223923391)
('abstract data clustering', 0.027181573275699374)
('algorithm data clustering', 0.027147170836280723)
('multidimensional data clustering', 0.02661355081439924)
('background data clustering', 0.026436633738617234)
('outlier data clustering', 

### TopicalPageRank

In [15]:
tpr = pke.unsupervised.TopicalPageRank()
tpr.load_document(input='../data/datasets/all.txt', language='en')
tpr.candidate_selection(grammar=None)

# NOTE(review): the recorded run of this cell ended with
# "TypeError: 'LatentDirichletAllocation' object is not iterable".
# Presumably the file passed as `lda_model` contains a bare scikit-learn
# LatentDirichletAllocation object instead of the structure pke unpacks when
# loading the model -- TODO confirm and regenerate the file in pke's format.
# `pos` is the tag set created in the TopicRank cell.
tpr.candidate_weighting(
    window=4,
    pos=pos,
    lda_model='model_trainings/LDA/LDA_for_key_extr.gz',
)

# Print the 20 best-ranked (keyphrase, score) tuples, one per line.
tpr_keyphrases = tpr.get_n_best(n=20)
for keyphrase in tpr_keyphrases:
    print(keyphrase)

TypeError: 'LatentDirichletAllocation' object is not iterable