# Keyword Extraction with YAKE (Yet Another Keyword Extractor)

In [11]:
import pandas as pd

from nltk.corpus import stopwords
import yake
from fuzzywuzzy import process
import string

## Dataset

In [12]:
# Load the paper table from CSV, decoding it as ISO-8859-1.
dataset_csv = "ICMLA_2014_2015_2016_2017.csv"
encoding = "ISO-8859-1"
data_df = pd.read_csv(dataset_csv, encoding=encoding)
# Key the rows by their paper_id column.
data_df = data_df.set_index("paper_id")
# Preview the first few rows.
data_df.head()

Unnamed: 0_level_0,title,keywords,abstract,session,year
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Ensemble Statistical and Heuristic Models for ...,"statistical word alignment, ensemble learning,...",Statistical word alignment models need large a...,Ensemble Methods,2014
2,Improving Spectral Learning by Using Multiple ...,"representation, spectral learning, discrete fo...",Spectral learning algorithms learn an unknown ...,Ensemble Methods,2014
3,Applying Swarm Ensemble Clustering Technique f...,"software defect prediction, particle swarm opt...",Number of defects remaining in a system provid...,Ensemble Methods,2014
4,Reducing the Effects of Detrimental Instances,"filtering, label noise, instance weighting",Not all instances in a data set are equally be...,Ensemble Methods,2014
5,Concept Drift Awareness in Twitter Streams,"twitter, adaptation models, time-frequency ana...",Learning in non-stationary environments is not...,Ensemble Methods,2014


In [13]:
# Join each paper's title and abstract (single space between) into a "text" column.
data_df["text"] = data_df["title"].add(" ").add(data_df["abstract"])
# Raw array view of the combined texts.
# NOTE(review): `corpus` is not referenced by any other visible cell - confirm it is still needed.
corpus = data_df["text"].values

In [20]:
# Build one sample document from the first paper (title followed by abstract)
# so the extractor can be tried on a single text before running it on every row.
title, abstract = data_df["title"].iloc[0], data_df["abstract"].iloc[0]
text = "{} {}".format(title, abstract)

In [57]:
def extract_keywords_yake(
    text,
    max_ngram_size=3,
    deduplication_threshold=0.7,
    deduplication_algo="seqm",
    num_keywords=20,
    fuzzy_threshold=70,
):
    """Extract keywords from ``text`` with YAKE and return them as one string.

    The YAKE results are reversed, passed through fuzzywuzzy's
    ``process.dedupe`` to drop near-duplicate phrases, and joined with ", ".
    With the default arguments the behavior matches the previous hard-coded
    configuration.

    Parameters
    ----------
    text : str
        Document to extract keywords from.
    max_ngram_size : int, default 3
        Maximum n-gram size, passed to YAKE as ``n``.
    deduplication_threshold : float, default 0.7
        Deduplication threshold, passed to YAKE as ``dedupLim``.
    deduplication_algo : str, default "seqm"
        Deduplication algorithm, passed to YAKE as ``dedupFunc``.
    num_keywords : int, default 20
        Number of keywords requested from YAKE (``top``).
    fuzzy_threshold : int, default 70
        ``threshold`` passed to ``process.dedupe`` for the second,
        fuzzy-matching deduplication pass.

    Returns
    -------
    str
        The remaining keyword phrases joined by ", ".
    """
    extractor = yake.KeywordExtractor(
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        top=num_keywords,
        features=None,
    )

    # Keep only the first element of each result (the keyword phrase).
    # NOTE(review): the list is reversed before the fuzzy pass, exactly as the
    # original code did. YAKE's documentation states that lower scores are more
    # relevant, so this presumably puts the weakest candidates first - confirm
    # that this ordering is intended.
    doc_keywords = [candidate[0] for candidate in extractor.extract_keywords(text)][::-1]

    # Second deduplication pass: collapse phrases that fuzzy-match each other.
    deduplicated_doc_keywords = list(
        process.dedupe(doc_keywords, threshold=fuzzy_threshold)
    )
    return ", ".join(deduplicated_doc_keywords)

In [58]:
# Try the extractor on the single sample document held in `text`.
extract_keywords_yake(text)

'Unsupervised Word Alignment, word alignment models, Statistical and Heuristic, small-size corpora, large amount, Alignment Statistical word'

In [59]:
# Run the extractor on every combined title+abstract and keep the result per paper.
data_df["extracted_keywords"] = data_df["text"].map(extract_keywords_yake)
# Display the new column.
data_df["extracted_keywords"]

paper_id
1      Unsupervised Word Alignment, word alignment mo...
2      Spectral learning algorithms, Improving Spectr...
3      Number of defects, Ensemble Clustering Techniq...
4      weighting detrimental instances, RDIL, learnin...
5      ability, Twitter Streams Learning, Twitter mes...
                             ...                        
444    Supporting Advanced Knowledge, Machine Learnin...
445    resource requests measurements, applications.W...
446    apply cluster analysis, challenging behaviors ...
447    high impact mHealth, cost and high, low cost, ...
448    Meaning Centered Psychotherapy, Bayesian Nonpa...
Name: extracted_keywords, Length: 448, dtype: object

In [60]:
# Show the full keyword string for the row at position 2 (the third paper).
data_df["extracted_keywords"].iloc[2]

'Number of defects, Ensemble Clustering Technique, defect prediction software, Software Metrics Number, data mining techniques, Technique for Fault, Particle Swarm Optimization, Applying Swarm Ensemble, Swarm Ensemble Clustering'

In [None]:
# Write the data frame (with the extracted_keywords column added earlier) to a CSV file.
data_df.to_csv("yake_keywords.csv")