# Keyword Extraction

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed dataset, reading only the columns used in this notebook
fe_data = pd.read_csv(
    "data/preprocessed.csv",
    usecols=["Id", "Name", "Language", "Description", "bow"],
)
fe_data.head()

Unnamed: 0,Id,Name,Language,Description,bow
0,1020396,the gospel of john,eng,what sets this commentary on the fourth gospel...,francis_j._moloney michael_glazier
1,1073868,hanslick on the musically beautiful: sixteen l...,eng,the sixteen lectures by geoffrey payzant in th...,geoffrey_payzant 1-877275-49-2
2,1025976,microserfs,fre,génération x 1018 n° 2508 qui a connu un gros ...,douglas_coupland 10/18
3,1045943,courir avec des ciseaux,fre,roman autobiographique choc courir avec des ci...,augusten_burroughs 10/18
4,1027805,affinités,fre,pour tromper son ennui une demoiselle de la bo...,sarah_waters 10/18


In [3]:
# Per column: row count minus number of distinct values, i.e. how many entries
# repeat another one (nunique() ignores NaN by default, so missing values are
# counted in this difference as well)
len(fe_data) - fe_data.nunique()

Id                 0
Name              85
Language       34257
Description      435
bow             2879
dtype: int64

---
### Consider only English books

In [4]:
# For now, process only books whose language code is one of the English codes
fe_data = fe_data.loc[
    fe_data["Language"].isin(["eng", "en-US", "en-GB"])
].copy()

---
### Extract keywords from the description using KeyBERT

In [5]:
from keybert import KeyBERT
# Shared KeyBERT instance, constructed with no arguments (library defaults);
# get_keywords() below reads this module-level name on every call.
kw_model = KeyBERT()

def get_keywords(text):
    """Extract single-word keywords from ``text`` as one space-separated string.

    Parameters
    ----------
    text : str
        Free-text description to analyse. Any non-string value (for example
        the NaN pandas uses for a missing cell) or a blank string is treated
        as "no description".

    Returns
    -------
    str
        The keywords returned by the module-level ``kw_model`` joined with
        single spaces, in the order the model returned them; ``""`` when
        there is nothing to extract from.
    """
    # Guard first: a missing description would reach us as a float NaN rather
    # than a string, and handing that to the model can abort the whole
    # DataFrame.apply run.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Unigrams only, with English stop words removed before scoring.
    scored_keywords = kw_model.extract_keywords(
        text, keyphrase_ngram_range=(1, 1), stop_words="english"
    )
    # Each result item is indexed at position 0 to get the keyword itself;
    # the remaining elements (the score) are discarded.
    return " ".join(item[0] for item in scored_keywords)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import tqdm as notebook_tqdm
# Run keyword extraction on every description and store the result per book
fe_data["keywords"] = fe_data["Description"].apply(get_keywords)

In [7]:
# Preview the first extracted keyword strings
fe_data["keywords"].head()

0          gospel narrative moloneys fourth commentary
1    musikalischschnen hanslicks hanslick musically...
6                           nun memoir wars jewish war
8    nursery rockinghorse kentuckians anglosaxon ba...
9       railway father engineer perseverance machinery
Name: keywords, dtype: object

In [8]:
# Final keyword text = book information tokens (bow), a single space, then the
# keywords extracted from the description; a missing value on either side is
# replaced by an empty string before concatenating
fe_data["keywords"] = fe_data["bow"].fillna("") + " " + fe_data["keywords"].fillna("")
# The two source columns are dropped once their content is merged
fe_data.drop(columns=["bow", "Description"], inplace=True)

---
### Remove duplicated book names

In [9]:
# Rows whose Name already appeared earlier in the frame (first occurrence excluded)
fe_data.loc[fe_data.duplicated(subset="Name", keep="first")]

Unnamed: 0,Id,Name,Language,keywords
1492,1078605,rising to the occasion,eng,linda_taylor arrow_books_ltd novel cathy hero...
2941,1078370,ainsley harriott's low fat meals in minutes,eng,ainsley_harriott bbc_books ainsleys ainsley l...
3235,1061929,the moon is a harsh mistress,eng,robert_a._heinlein berkley_medallion lehrit r...
3310,1061015,winds of autumn,eng,janette_oke bethany_house_publishers josh sum...
3616,1061923,the moon is a harsh mistress,eng,robert_a._heinlein blackstone_publishing hein...
...,...,...,...,...
31859,1001528,natasha and other stories,eng,david_bezmozgis vintage bermans bella berman ...
32246,1083746,among the thugs,eng,bill_buford w._w._norton_&_company bufords bu...
33626,1041880,the red and the black,eng,stendhal wordsworth_editions priesthood ambit...
34093,1088848,the duke of flatbush,eng,duke_snider zebra baseballs dodgers baseball ...


In [10]:
# Keep only the first row for each book name
fe_data = fe_data.loc[~fe_data.duplicated(subset=["Name"], keep="first")]

---
### Save final dataset

In [11]:
# Write the final dataset as comma-separated text, without the index column
fe_data.to_csv(
    path_or_buf="data/keywords.csv",
    sep=",",
    index=False,
)