# Rapid Automatic Keyword Extraction (RAKE)

In [None]:
import pandas as pd

from nltk.corpus import stopwords
from rake_nltk import Rake
from fuzzywuzzy import process
import string

: 

## Dataset

In [None]:
# Load the paper records from CSV, then index the rows by paper id
dataset_csv = "ICMLA_2014_2015_2016_2017.csv"
encoding = "ISO-8859-1"
data_df = pd.read_csv(dataset_csv, encoding=encoding)
data_df = data_df.set_index("paper_id")
data_df.head()

: 

## RAKE Example

In [None]:
# Build one text field per paper from its title and abstract.
# pandas represents a missing cell as NaN, and NaN + str is NaN, so a paper
# lacking either field would otherwise end up with a non-string "text" value.
# Filling with "" keeps whichever part is present.
data_df["text"] = data_df["title"].fillna("") + " " + data_df["abstract"].fillna("")
corpus = data_df["text"].values

: 

In [None]:
# Every single-character mark from string.punctuation, followed by a few
# multi-character punctuation sequences.
punctuations = [*string.punctuation, '*.', '--', ').', '),', '?,']
print(punctuations)

: 

In [None]:
def extract_keywords_rake(text, *, max_keywords=6, dedupe_threshold=70):
    """Return the top RAKE keyphrases of ``text`` as a comma-separated string.

    RAKE is run with the English NLTK stopword list and the module-level
    ``punctuations`` list, keeping phrases of one to three words and ignoring
    repeated phrases. The ranked phrases are then fuzzy-deduplicated and the
    first ``max_keywords`` survivors are joined with ", ".

    Args:
        text: Document to extract keywords from. Anything that is not a
            non-blank string (e.g. ``None`` or a NaN from a missing
            DataFrame cell) yields an empty result instead of an error.
        max_keywords: Maximum number of keyphrases to return.
        dedupe_threshold: Similarity score (0-100) passed to
            ``fuzzywuzzy.process.dedupe``; phrases scoring above it against
            each other are treated as duplicates.

    Returns:
        A string of up to ``max_keywords`` keyphrases separated by ", ",
        or "" when there is nothing to extract.
    """
    # Non-string or blank input has no keywords; bail out before the
    # tokenizers are handed something they cannot process.
    if not isinstance(text, str) or not text.strip():
        return ""

    r = Rake(
        stopwords=set(stopwords.words("english")),
        punctuations=punctuations,
        include_repeated_phrases=False,
        min_length=1,
        max_length=3,
    )
    r.extract_keywords_from_text(text)
    ranked_phrases = r.get_ranked_phrases()
    # Reducing duplication in keywords
    unique_phrases = list(
        process.dedupe(ranked_phrases, threshold=dedupe_threshold)
    )
    return ", ".join(unique_phrases[:max_keywords])

: 

In [None]:
# Run the RAKE extractor over every paper's text and keep the result
# as a new column
data_df["extracted_keywords"] = data_df["text"].map(extract_keywords_rake)
data_df["extracted_keywords"]

: 

In [None]:
# Inspect the extracted keywords of the third paper (position 2)
data_df["extracted_keywords"].iat[2]

: 

In [None]:
# Write the DataFrame, including the extracted keywords, to disk
data_df.to_csv(path_or_buf="rake_keywords.csv")

: 