# Keyword Extraction with RAKE and NLTK

In [None]:
# Directory that holds the input Excel workbooks. The notebook loads a
# google.colab extension, so this is presumably the Colab working directory —
# adjust it when running elsewhere.
path = '/content'

In [None]:
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# tqdm's pandas hook — presumably enables progress_apply; no cell visible here uses it.
tqdm.pandas()
# IPython magics: autoreload (mode 2), Colab's data_table extension, and
# 'retina' as the inline figure format.
%reload_ext autoreload
%autoreload 2
%reload_ext google.colab.data_table
%config InlineBackend.figure_format = 'retina'

# Base look for all figures.
plt.style.use('fivethirtyeight')
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in
# matplotlib 3.6 and the old names were removed afterwards, so apply whichever
# spelling this matplotlib installation actually ships.
for _style_name in ('seaborn-notebook', 'seaborn-v0_8-notebook'):
  if _style_name in plt.style.available:
    plt.style.use(_style_name)
    break

In [None]:
# os.listdir() returns entries in arbitrary order, but a later cell picks the
# workbook by position (files[1]); sort so that position is stable across runs.
files = sorted(os.listdir(path)); files

['manual-all-domains_99done_Google_Title.xlsx',
 'manual-all-domains_full_data.xlsx']

In [None]:
# Short positional names for the five data columns of the workbook.
raw_column_names = ['domain', 'signup_url', 'other_lang', 'lang', 'title']

# Read the second listed workbook; its first column becomes the index.
df_raw = pd.read_excel(
    os.path.join(path, files[1]),
    index_col=[0],
)
df_raw.columns = raw_column_names
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2784 entries, 1 to 2784
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   domain      2784 non-null   object
 1   signup_url  2784 non-null   object
 2   other_lang  2784 non-null   object
 3   lang        1273 non-null   object
 4   title       1194 non-null   object
dtypes: object(5)
memory usage: 130.5+ KB


In [None]:
import nltk
# Fetch the WordNet corpus; presumably required by the lemmatizer created below.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
# Single shared instance, used by process() for every title.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
def process(text):
  """Normalize a page title for keyword counting.

  The value is stringified and lower-cased, reduced to ``a-z``, ``0-9`` and
  single spaces, and every token is lemmatized first as a verb and then as a
  noun using the module-level ``lemmatizer``.

  Args:
    text: Raw title; any object is accepted and passed through ``str()``.

  Returns:
    The cleaned, space-separated string. It is empty when the input held
    nothing but characters that get stripped.
  """
  text = str(text).lower()
  # Turn tabs, newlines and other whitespace into plain spaces *before* the
  # character filter runs; otherwise the filter deletes them and glues the
  # neighbouring words together ("register\nlogin" -> "registerlogin").
  text = re.sub(r'\s+', ' ', text)
  text = re.sub(r'[^a-z0-9 ]', '', text)
  # split() without arguments also collapses repeated spaces and trims the ends.
  tokens = [
      lemmatizer.lemmatize(lemmatizer.lemmatize(w, pos='v'), pos='n')
      for w in text.split()
  ]
  return ' '.join(tokens)

In [None]:
# xx = df_raw.sample(10, random_state=11)[['title']]
# xx = xx.dropna(subset=['title'])
# xx['title'] = xx['title'].apply(process)
# xx['title'] = xx['title'].replace(r'^\s*$', np.nan, regex=True)
# xx = xx.dropna(subset=['title'])
# xx

In [None]:
# Work on an independent copy of the rows that actually have a title.
df = df_raw.dropna(subset=['title']).copy()
# Normalize every title; titles that end up blank are marked as missing.
df['title'] = (
    df['title']
    .apply(process)
    .replace(r'^\s*$', np.nan, regex=True)
)
# Drop the rows whose title became missing in the step above.
df = df.dropna(subset=['title'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1086 entries, 1 to 2784
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   domain      1086 non-null   object
 1   signup_url  1086 non-null   object
 2   other_lang  1086 non-null   object
 3   lang        318 non-null    object
 4   title       1086 non-null   object
dtypes: object(5)
memory usage: 50.9+ KB


In [None]:
# Fixed seed so the preview shows the same rows on every run.
df.sample(5, random_state=11)

Unnamed: 0,domain,signup_url,other_lang,lang,title
2505,live.com,https://signup.live.com/signup?mkt=en-gb&lic=1...,N,,create account
2099,trainingzone.co.uk,https://www.trainingzone.co.uk/user/register?d...,N,,sign up trainingzone
1786,allthatmatters.com,https://allthatmatters.com/apps/mindbody/login,N,,please log in to your account all that matter ...
2746,cbd.int,https://accounts.cbd.int/signup,N,,account convention on biological diversity
1880,debateafrica.com,http://debateafrica.com/register/,N,,create an account


In [None]:
# !pip install langdetect
# from langdetect import detect
# langs = []
# df['lang'] = 'xx'
# for index, row in df.iterrows():
#   try:
#     df.loc[index,'lang'] = detect(row['title'])
#   except:
#     pass
# langs.append(detect(row['title']))
# pd.Series(langs).value_counts()
# df.to_csv('x.csv')

In [None]:
def top_feats(row, features, top_n=20):
    """Rank the entries of ``row`` and return the ``top_n`` largest.

    Args:
        row: Array of scores, indexable by position.
        features: Sequence of feature names, index-aligned with ``row``.
        top_n: Maximum number of rows in the result.

    Returns:
        DataFrame with columns ``features`` and ``score``, highest score first.
    """
    ranked_ids = np.argsort(row)[::-1][:top_n]
    records = []
    for idx in ranked_ids:
        records.append((features[idx], row[idx]))
    return pd.DataFrame(records, columns=['features', 'score'])
    
def top_mean_feats(X, features, grp_ids=None, mins=0.1, top_n=25):
  """Average each feature column over a set of rows and rank the result.

  Args:
    X: Sparse matrix supporting row indexing and ``toarray()``.
    features: Feature names, index-aligned with the columns of ``X``.
    grp_ids: Optional row selection to restrict the average to. ``None`` or
      an empty selection means "use every row".
    mins: Cell values below this threshold are zeroed before averaging.
    top_n: Number of features to return.

  Returns:
    Whatever ``top_feats`` returns for the per-column means.
  """
  # A plain `if grp_ids:` raises "truth value of an array is ambiguous" for
  # NumPy index arrays with more than one element, so test the length
  # explicitly and fall back to truthiness for objects without a length.
  try:
    use_group = len(grp_ids) > 0
  except TypeError:  # None, a scalar index, a slice, ...
    use_group = bool(grp_ids)

  if use_group:
    D = X[grp_ids].toarray()
  else:
    D = X.toarray()
  # toarray() produced a fresh dense array, so zeroing in place leaves X untouched.
  D[D < mins] = 0
  means = np.mean(D, axis=0)
  return top_feats(means, features, top_n)

In [None]:
# Settings common to both vectorizers; only the n-gram size and the
# stop-word handling differ between them.
shared_vectorizer_args = dict(analyzer='word', max_df=1.0, min_df=10)

model_word_1gram = CountVectorizer(
    stop_words='english', ngram_range=(1, 1), **shared_vectorizer_args
)
model_word_2gram = CountVectorizer(
    stop_words=None, ngram_range=(2, 2), **shared_vectorizer_args
)

In [None]:
# Fit each vectorizer on the cleaned titles and keep the resulting matrices.
titles = df['title'].tolist()
matrix_word_1gram = model_word_1gram.fit_transform(titles)
matrix_word_2gram = model_word_2gram.fit_transform(titles)

In [None]:
def _vocabulary_terms(vectorizer):
  """Return a fitted vectorizer's terms as a list, on old and new scikit-learn."""
  # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
  # in favour of get_feature_names_out(); prefer the new method when present.
  if hasattr(vectorizer, 'get_feature_names_out'):
    return list(vectorizer.get_feature_names_out())
  return vectorizer.get_feature_names()

features_word_1gram = _vocabulary_terms(model_word_1gram)
features_word_2gram = _vocabulary_terms(model_word_2gram)

In [None]:
# Twenty highest-scoring single words across all titles.
top_word_1gram = top_mean_feats(
    X=matrix_word_1gram,
    features=features_word_1gram,
    top_n=20,
)
print(top_word_1gram)

        features     score
0        account  0.226519
1         create  0.151934
2           sign  0.112339
3       register  0.109576
4   registration  0.058011
5            new  0.048803
6         online  0.038674
7           free  0.035912
8          login  0.032228
9         signup  0.030387
10      customer  0.025783
11          shop  0.018416
12          join  0.018416
13          news  0.015654
14         forum  0.014733
15         store  0.013812
16           log  0.013812
17          baby  0.012891
18          user  0.012891
19          site  0.012891


In [None]:
# Twenty highest-scoring word pairs across all titles.
top_word_2gram = top_mean_feats(
    X=matrix_word_2gram,
    features=features_word_2gram,
    top_n=20,
)
print(top_word_2gram)

            features     score
0            sign up  0.087477
1     create account  0.079190
2         create new  0.029466
3            sign in  0.024862
4         an account  0.022099
5             up for  0.019337
6   customer account  0.019337
7       new customer  0.018416
8          create an  0.016575
9        new account  0.015654
10            log in  0.012891
11        my account  0.011050
12       create your  0.010129


In [None]:
from nltk.util import ngrams
def create_chargrams(word, min_n=2, max_n=5):
  """List the distinct character n-grams of ``word``.

  Args:
    word: String to slice into n-grams.
    min_n: Shortest n-gram length, inclusive. Defaults to 2.
    max_n: Longest n-gram length, inclusive. Defaults to 5.

  Returns:
    The unique n-grams, grouped by increasing length and sorted
    alphabetically within each length, so the result is identical from run
    to run. Empty when ``word`` is shorter than ``min_n``.
  """
  chars = list(word)
  gramslist = []
  for n in range(min_n, max_n + 1):
    # Every window of n consecutive characters; the set removes repeats and
    # sorting makes the order independent of string hashing.
    grams = {''.join(chars[i:i + n]) for i in range(len(chars) - n + 1)}
    gramslist.extend(sorted(grams))
  return gramslist

def chargram_similarity(word1, word2):
  """Overlap score between the character n-grams of two strings.

  The score is the number of n-grams the two words share, divided by the
  size of the smaller of the two n-gram sets produced by
  ``create_chargrams``.

  Args:
    word1: First string.
    word2: Second string.

  Returns:
    A float between 0 and 1; 0.0 when either word yields no n-grams.
  """
  grams1 = set(create_chargrams(word1))
  grams2 = set(create_chargrams(word2))
  # A word too short to produce any n-gram would otherwise make the
  # denominator zero and raise ZeroDivisionError.
  if not grams1 or not grams2:
    return 0.0
  return len(grams1 & grams2) / min(len(grams1), len(grams2))

In [None]:
# Spot-check the similarity score on a few word pairs.
for left, right in [('signin', 'sign in'), ('signin', 'create'), ('signin', 'signup')]:
  print("('{}','{}'): ".format(left, right), chargram_similarity(left, right))

('signin','sign in'):  0.5
('signin','create'):  0.0
('signin','signup'):  0.42857142857142855


In [None]:
# !pip install pytextrank
# !python -m spacy download en_core_web_sm
# These imports were commented out, so the cell failed with NameError on a
# fresh kernel even with the packages installed.
import spacy
import pytextrank

# One long document: every cleaned title, separated by ' . '.
text = ' . '.join(df.title.tolist())
nlp = spacy.load("en_core_web_sm")
# NOTE(review): TextRank()/PipelineComponent looks like the pytextrank 2.x API — confirm the installed version.
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
doc = nlp(text)
# Show the first ten phrases with their rank.
for p in doc._.phrases[:10]:
    # print("{:.4f} {:5d}  {}".format(p.rank, p.count, p.text)); print(p.chunks)
    print("{:.4f} {}".format(p.rank, p.text))

0.0839 new account
0.0831 new customer account
0.0818 account registration
0.0793 account sign
0.0778 new account joccom
0.0776 new customer account zumiez
0.0776 new customer account generalimprintshopname
0.0776 new customer account playtech
0.0776 new customer account mobileciti
0.0770 account baby


In [None]:
# !pip install summa
# This import was commented out, leaving `keywords` undefined on a fresh kernel.
from summa import keywords
# `text` (all titles joined with ' . ') is already built in the TextRank cell above.
# text = ' . '.join(df.title.tolist())
xx = keywords.keywords(text, scores=True, deaccent=True)
xx[:10]

[('account', 0.5546846599427575),
 ('register', 0.42351724134353225),
 ('create', 0.3322145252944762),
 ('sign', 0.2557827345972672),
 ('registration', 0.24678282402332308),
 ('signup', 0.13756182354313182),
 ('free', 0.11879990793662887),
 ('group login', 0.08187352841189384),
 ('join', 0.0781287383478171),
 ('online setiap', 0.07308738841245678)]

In [None]:
# !pip install rake-nltk
# The import, the extractor and the extraction call were all commented out,
# so the last line only worked with a `rake` object left over in the kernel.
from rake_nltk import Metric, Rake
rake = Rake(min_length=1, max_length=2, ranking_metric=Metric.WORD_FREQUENCY)
# `text` (all titles joined with ' . ') is already built in the TextRank cell above.
# text = ' . '.join(df.title.tolist())
rake.extract_keywords_from_text(text)
rake.get_ranked_phrases_with_scores()[:10]

[(61.0, 'account registration'),
 (55.0, 'account login'),
 (45.0, 'free account'),
 (41.0, 'create account'),
 (40.0, 'online account'),
 (40.0, 'new account'),
 (40.0, 'google account'),
 (40.0, 'church account'),
 (40.0, 'account subscribe'),
 (39.0, 'twitchtv account')]

In [None]:
# Import here so this cell does not rely on an earlier cell having been run
# with its import enabled.
from rake_nltk import Metric, Rake

# Same extraction as before, restricted to single-word phrases.
rake = Rake(min_length=1, max_length=1, ranking_metric=Metric.WORD_FREQUENCY)
rake.extract_keywords_from_text(text)
rake.get_ranked_phrases_with_scores()[:10]

[(1.0, 'zoom'),
 (1.0, 'yts'),
 (1.0, 'yovoads'),
 (1.0, 'youmengregistrationyoumenglogin'),
 (1.0, 'yamap'),
 (1.0, 'wwwmlbcom'),
 (1.0, 'wwwmed1de'),
 (1.0, 'wwwkinkcom'),
 (1.0, 'wwwkankensbagscom'),
 (1.0, 'wwwgm25com')]

---

In [None]:
# df_tfidf = pd.DataFrame(x.toarray(), columns=tfidf.get_feature_names())
# print(df_tfidf)

# list(tfidf.vocabulary_.keys())[:10]

# df_tfidf = pd.DataFrame(x[0].T.todense(), index=tfidf.get_feature_names(), columns=["TF-IDF"])
# df_tfidf = df_tfidf.sort_values('TF-IDF', ascending=False)
# print(df_tfidf.head(25))