# Our custom implementation of KeyBERT

In [1]:
!pip install sentence-transformers transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import nltk
import re
import torch
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import MinMaxScaler

In [3]:
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def clean_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)
    # Remove retweets
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # Remove punctuation
    tweet = re.sub(r'[^\w\s]', '', tweet)
    # Remove tags
    tweet = re.sub(r'@\w+', '', tweet)
    # Remove special characters
    tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)
    # Remove digits
    tweet = re.sub(r'\d+', '', tweet)
    # Convert to lowercase
    tweet = tweet.lower()
    # Split the tweet into words
    words = tweet.split()
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    # Join the words back into a string
    clean_tweet = ' '.join(words)
    
    return clean_tweet

In [5]:
def padding(embeddings):
    if embeddings.shape[0] < 768:
        pad_width = ((0, 768 - embeddings.shape[0]))
        embeddings = np.pad(embeddings, pad_width=pad_width, mode='constant', constant_values=0)
    return embeddings

In [6]:
def normalized_data(dfSim):
  scaler = MinMaxScaler()
  dfSim['score_normalized'] = scaler.fit_transform(dfSim[['score']])
  dfSim = dfSim.sort_values(by='score_normalized')
  dfSim.drop('score', axis=1, inplace=True)
  return dfSim

In [8]:
df = pd.read_csv('Olympics_Tokyo_tweets.csv')
max_sample_size = 100
df.dropna(inplace=True)
df.drop(df.index[max_sample_size:], inplace=True)
df['text'] = df['text'].apply(clean_tweet)
df['text']= df['text'].astype(str)
combined_tweets = '. '.join(df['text'])

  df = pd.read_csv('Olympics_Tokyo_tweets.csv')


In [9]:
vectorizer = CountVectorizer(ngram_range=(3, 3))
X = vectorizer.fit_transform(df['text'])

In [10]:
xlm_model = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')
docEmbedding = xlm_model.encode(combined_tweets)

Downloading (…)9fcde/.gitattributes:   0%|          | 0.00/795 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)ca1b49fcde/README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading (…)1b49fcde/config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

Downloading (…)b49fcde/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [11]:
newDF = pd.DataFrame(columns=['feature', 'score'])

In [None]:
for i, feature in enumerate(vectorizer.get_feature_names_out()):
  featureEmbedding = xlm_model.encode(feature)
  cos_sim = util.cos_sim(docEmbedding, featureEmbedding)[0]
  cos_sim = cos_sim.tolist()
  cos_sim = cos_sim[0]
  entry = {'feature': vectorizer.get_feature_names_out()[i], 'score': cos_sim}
  newDF = newDF.append(entry, ignore_index=True)

In [13]:
scaler = MinMaxScaler()
# fit and transform the 'col1' column
newDF['score_normalized'] = scaler.fit_transform(newDF[['score']])
newDF = newDF.sort_values(by='score_normalized', ascending=False)

In [24]:
newDF.head(20)

Unnamed: 0,feature,score,score_normalized
337,mind olympics olympicgames,0.638953,1.0
200,garybolyerart olympics teamusa,0.638811,0.999789
306,love watching olympics,0.636496,0.996366
87,ceremonies years olympics,0.624323,0.97837
393,olympics olympicgames teamusa,0.622451,0.975602
586,things arent olympics,0.621957,0.974871
142,dont olympics people,0.616978,0.967511
660,watching love olympics,0.615736,0.965674
331,mens basketball olympics,0.614861,0.96438
270,jacobwhittle olympics teamgb,0.596324,0.936976


In [17]:
print(len(vectorizer.get_feature_names_out()))

715


# Actual KeyBERT

In [20]:
!pip install keybert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keybert
  Downloading keybert-0.7.0.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: keybert
  Building wheel for keybert (setup.py) ... [?25l[?25hdone
  Created wheel for keybert: filename=keybert-0.7.0-py3-none-any.whl size=23794 sha256=7e51947d6f69af4103896ffc328a54e197c5465b4a379d43dd4e5a062a78b36e
  Stored in directory: /root/.cache/pip/wheels/66/8d/e6/b0e2f8d883b0fd51819226f67ad9843e04913ce4a97241ff4b
Successfully built keybert
Installing collected packages: keybert
Successfully installed keybert-0.7.0


In [21]:
from keybert import KeyBERT

In [22]:
kw_model = KeyBERT('xlm-r-distilroberta-base-paraphrase-v1')

In [23]:
embed = kw_model.extract_embeddings(docs = combined_tweets, candidates=vectorizer.get_feature_names_out(), keyphrase_ngram_range = (3, 3))

In [25]:
doc_embed, word_embed = embed[0], embed[1]

In [26]:
keywords = kw_model.extract_keywords(docs = combined_tweets, candidates=vectorizer.get_feature_names_out(), keyphrase_ngram_range = (3, 3), top_n=20, word_embeddings=word_embed)

In [27]:
print(keywords)

[('mind olympics olympicgames', 0.639), ('garybolyerart olympics teamusa', 0.6388), ('love watching olympics', 0.6365), ('ceremonies years olympics', 0.6243), ('olympics olympicgames teamusa', 0.6225), ('things arent olympics', 0.622), ('dont olympics people', 0.617), ('watching love olympics', 0.6157), ('mens basketball olympics', 0.6149), ('jacobwhittle olympics teamgb', 0.5963), ('reylo olympics abo', 0.5942), ('djokersa supersporttv olympics', 0.5939), ('mesmerising watch olympics', 0.5933), ('olympian talha talib', 0.5886), ('watching mens olympic', 0.5883), ('olympics teamusa fiba', 0.5873), ('horrible olympics going', 0.5872), ('sports olympics include', 0.5841), ('djeuphoric garybolyerart olympics', 0.5826), ('olympics people compete', 0.5804)]


In [28]:
# create a dataframe from the list of tuples
keyBertDF = pd.DataFrame(keywords, columns=['keyphrase', 'score'])

In [29]:
keyBertDF.head(20)

Unnamed: 0,keyphrase,score
0,mind olympics olympicgames,0.639
1,garybolyerart olympics teamusa,0.6388
2,love watching olympics,0.6365
3,ceremonies years olympics,0.6243
4,olympics olympicgames teamusa,0.6225
5,things arent olympics,0.622
6,dont olympics people,0.617
7,watching love olympics,0.6157
8,mens basketball olympics,0.6149
9,jacobwhittle olympics teamgb,0.5963


## Result - On looking through the keyphrases(top 20) generated, we see that they are very similar to each other.