# Keyword Extraction using TF-IDF

In [236]:
import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import process

## Dataset

In [237]:
# Reading the data 
dataset_csv = "ICMLA_2014_2015_2016_2017.csv"
encoding = "ISO-8859-1"
data_df = pd.read_csv(dataset_csv, encoding=encoding).set_index("paper_id")
data_df.head()

Unnamed: 0_level_0,title,keywords,abstract,session,year
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Ensemble Statistical and Heuristic Models for ...,"statistical word alignment, ensemble learning,...",Statistical word alignment models need large a...,Ensemble Methods,2014
2,Improving Spectral Learning by Using Multiple ...,"representation, spectral learning, discrete fo...",Spectral learning algorithms learn an unknown ...,Ensemble Methods,2014
3,Applying Swarm Ensemble Clustering Technique f...,"software defect prediction, particle swarm opt...",Number of defects remaining in a system provid...,Ensemble Methods,2014
4,Reducing the Effects of Detrimental Instances,"filtering, label noise, instance weighting",Not all instances in a data set are equally be...,Ensemble Methods,2014
5,Concept Drift Awareness in Twitter Streams,"twitter, adaptation models, time-frequency ana...",Learning in non-stationary environments is not...,Ensemble Methods,2014


## Data Pre-processing (Data Cleaning)

In [238]:
def pre_process(text, stop_words=set(stopwords.words("english"))):
    # remove punctuations and digits 
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower()
    text = text.split()
    text = [word for word in text if not word in stop_words]
    return" ".join(text)

In [239]:
stop_words = set(stopwords.words("english"))
print(stopwords)

<WordListCorpusReader in 'C:\\Users\\Rohit Garud\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>


In [240]:
# Applying pre_process to single example text
title = data_df["title"].iloc[0]
abstract = data_df["abstract"].iloc[0]
text = f"{title} {abstract}"
print(text)
print("================================")
cleaned_text = pre_process(text, stop_words)
print(cleaned_text)

Ensemble Statistical and Heuristic Models for Unsupervised Word Alignment Statistical word alignment models need large amount of training data while they are weak in small-size corpora. This paper proposes a new approach of unsupervised hybrid word alignment technique using ensemble learning method. This algorithm uses three base alignment models in several rounds to generate alignments. The ensemble algorithm uses a weighed scheme for resampling training data and a voting score to consider aggregated alignments. The underlying alignment algorithms used in this study include IBM Model 1, 2 and a heuristic method based on Dice measurement. Our experimental results show that by this approach, the alignment error rate could be improved by at least %15 for the base alignment models.
ensemble statistical heuristic models unsupervised word alignment statistical word alignment models need large amount training data weak small size corpora paper proposes new approach unsupervised hybrid word a

In [241]:
# Applying preprocessing to entire dataset
data_df["text"] = data_df["title"] + " " + data_df["abstract"]
data_df["text"] = data_df["text"].apply(pre_process)
corpus = data_df["text"].values

## TF-IDF Keyword Extraction Example

In [242]:
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.8,
    min_df=1,
    ngram_range=(1,3)
)
# Fit and transform the text
tfidf = vectorizer.fit_transform(corpus)

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

tfidf.shape

(448, 83923)

In [244]:
author_keywords = data_df["keywords"].iloc[0]
author_keywords

'statistical word alignment, ensemble learning, heuristic word alignment'

In [245]:
keyword_list = feature_names[np.argsort(tfidf.toarray()[0])[-10:][::-1]]
keyword_list

array(['alignment', 'word alignment', 'alignment models', 'word',
       'base alignment', 'base alignment models', 'alignments',
       'ensemble', 'algorithm uses', 'heuristic'], dtype=object)

In [None]:
[", ".join(feature_names[np.argsort(doc_tfidf)[-10:][::-1]].tolist()) for doc_tfidf in tfidf.toarray()]

In [221]:
data_df["extracted_keywords"] = [", ".join(feature_names[np.argsort(doc_tfidf)[-10:][::-1]].tolist()) for doc_tfidf in tfidf.toarray()]

In [None]:
extracted_keywords = []
for doc_tfidf in tfidf.toarray() :
    sorted_ids_top10 = np.argsort(doc_tfidf)[-10:][::-1]
    doc_keywords = feature_names[sorted_ids_top10].tolist()
    # Reducing duplication in keywords
    deduplicated_doc_keywords = list(process.dedupe(doc_keywords, threshold=70))
    final_keywords = ", ".join(deduplicated_doc_keywords)
    extracted_keywords.append(final_keywords)
print(extracted_keywords)

In [233]:
data_df["extracted_keywords"] = extracted_keywords
data_df["extracted_keywords"]

paper_id
1      base alignment models, word alignment, alignme...
2      spectral representations, multiple representat...
3      clustering, defect prediction, using software ...
4      detrimental instances, weighting, detrimental ...
5      concept drift, twitter streams, types drift, l...
                             ...                        
444    chess, games, big data, game data, supporting ...
445    echmm trained, sequences, traces, running, ech...
446    challenging behaviors, autism spectrum disorde...
447    esm data, mental, experience sampling method, ...
448    bnp cluster analysis, anxiety depression, psyc...
Name: extracted_keywords, Length: 448, dtype: object