# Rapid Automatic Keyword Extraction (RAKE)

In [94]:
import pandas as pd

from nltk.corpus import stopwords
from rake_nltk import Rake
from fuzzywuzzy import process
import string

## Dataset

In [74]:
# Load the paper metadata CSV and key the frame by its "paper_id" column.
dataset_csv = "ICMLA_2014_2015_2016_2017.csv"
encoding = "ISO-8859-1"
data_df = (
    pd.read_csv(dataset_csv, encoding=encoding)
    .set_index("paper_id")
)
# Preview the first rows as the cell output.
data_df.head()

Unnamed: 0_level_0,title,keywords,abstract,session,year
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Ensemble Statistical and Heuristic Models for ...,"statistical word alignment, ensemble learning,...",Statistical word alignment models need large a...,Ensemble Methods,2014
2,Improving Spectral Learning by Using Multiple ...,"representation, spectral learning, discrete fo...",Spectral learning algorithms learn an unknown ...,Ensemble Methods,2014
3,Applying Swarm Ensemble Clustering Technique f...,"software defect prediction, particle swarm opt...",Number of defects remaining in a system provid...,Ensemble Methods,2014
4,Reducing the Effects of Detrimental Instances,"filtering, label noise, instance weighting",Not all instances in a data set are equally be...,Ensemble Methods,2014
5,Concept Drift Awareness in Twitter Streams,"twitter, adaptation models, time-frequency ana...",Learning in non-stationary environments is not...,Ensemble Methods,2014


## RAKE Example

In [119]:
# Build one text field per paper from its title and abstract.
# Missing values are replaced with "" first: concatenating a string with NaN
# yields NaN, which would throw away the title of a paper that has no abstract
# (and vice versa) and hand a float to the keyword extractor.
data_df["text"] = data_df["title"].fillna("") + " " + data_df["abstract"].fillna("")
# Array view of the combined texts.
corpus = data_df["text"].values

In [116]:
# Tokens passed to Rake as punctuation: every single character in
# string.punctuation, followed by a few multi-character sequences
# (presumably ones the word tokenizer emits as single tokens -- confirm).
punctuations = list(string.punctuation) + ["*.", "--", ").", "),", "?,"]
print(punctuations)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '*.', '--', ').', '),', '?,']


In [106]:
def extract_keywords_rake(text, max_keywords=6, dedupe_threshold=70,
                          min_length=1, max_length=3):
    """Extract key phrases from ``text`` with RAKE and return them as one string.

    The phrases come from ``Rake.get_ranked_phrases()``, are passed through
    fuzzywuzzy's ``process.dedupe`` to reduce near-duplicate phrases, and the
    first ``max_keywords`` survivors are joined with ``", "``.

    Parameters
    ----------
    text : str
        Document text to extract phrases from.
    max_keywords : int, default 6
        Maximum number of phrases kept in the returned string.
    dedupe_threshold : int, default 70
        ``threshold`` forwarded to ``process.dedupe``.
    min_length, max_length : int, default 1 and 3
        Phrase length bounds forwarded to ``Rake``.

    Returns
    -------
    str
        Comma-separated key phrases; ``""`` when ``text`` is not a string
        (e.g. a NaN from a missing value) or contains only whitespace.
    """
    # A missing value or blank text has no phrases; return an empty result
    # instead of passing a non-string object into the Rake tokenizer.
    if not isinstance(text, str) or not text.strip():
        return ""

    r = Rake(
        stopwords=set(stopwords.words("english")),
        punctuations=punctuations,
        include_repeated_phrases=False,
        min_length=min_length,
        max_length=max_length)
    r.extract_keywords_from_text(text)
    doc_keywords = r.get_ranked_phrases()
    # Reducing duplication in keywords
    deduplicated_doc_keywords = list(
        process.dedupe(doc_keywords, threshold=dedupe_threshold))
    return ", ".join(deduplicated_doc_keywords[:max_keywords])

In [107]:
# Applying RAKE to whole dataset
data_df["extracted_keywords"] = data_df["text"].apply(extract_keywords_rake)
data_df["extracted_keywords"]

paper_id
1      experimental results show, heuristic method ba...
2      g ., fourier, empirical results suggest, compa...
3      using software metrics, empirical study shows,...
4      significant positive impact, results also sugg...
5      dynamic nature tends, social networks gained, ...
                             ...                        
444    great testing ground, great intellectual chall...
445    resource requests measurements, describe two a...
446    female samples separately, dominant behavior e...
447    support vector machines, statistical measures ...
448    report mild symptoms, reducing psychological d...
Name: extracted_keywords, Length: 448, dtype: object

In [108]:
# Spot-check: the keyword string at positional index 2 (the third row).
data_df["extracted_keywords"].iat[2]

'using software metrics, empirical study shows, data mining techniques, different clustering algorithms, single clustering solution, clustering solutions'

In [118]:
# Write the frame to a CSV file in the working directory.
data_df.to_csv(path_or_buf="rake_keywords.csv")