In [1]:
!pip install sentence-transformers transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0

In [2]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import nltk
import re
import torch
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import MinMaxScaler

In [3]:
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
def clean_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)
    # Remove retweets
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # Remove punctuation
    tweet = re.sub(r'[^\w\s]', '', tweet)
    # Remove tags
    tweet = re.sub(r'@\w+', '', tweet)
    # Remove special characters
    tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)
    # Remove digits
    tweet = re.sub(r'\d+', '', tweet)
    # Convert to lowercase
    tweet = tweet.lower()
    # Split the tweet into words
    words = tweet.split()
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    # Join the words back into a string
    clean_tweet = ' '.join(words)
    
    return clean_tweet

In [5]:
def padding(embeddings):
    if embeddings.shape[0] < 768:
        pad_width = ((0, 768 - embeddings.shape[0]))
        embeddings = np.pad(embeddings, pad_width=pad_width, mode='constant', constant_values=0)
    return embeddings

In [6]:
def normalized_data(dfSim):
  scaler = MinMaxScaler()
  dfSim['score_normalized'] = scaler.fit_transform(dfSim[['score']])
  dfSim = dfSim.sort_values(by='score_normalized')
  dfSim.drop('score', axis=1, inplace=True)
  return dfSim

In [7]:
def BERT_embeddings(vectorizer, tokenizer, model):
  embeddings = []
  for feature_name in vectorizer.get_feature_names_out():
      tokens = tokenizer(feature_name, return_tensors='pt', padding=True, truncation=True)
      with torch.no_grad():
          output = model(**tokens)
      # Obtain BERT embeddings for [CLS] token
      tensor = output.last_hidden_state[:, 0, :].squeeze()
      tensor = tensor.numpy()
      embeddings.append(tensor)
  return embeddings

In [11]:
df = pd.read_csv('Olympics_Tokyo_tweets.csv')
max_sample_size = 200
df.dropna(inplace=True)
df.drop(df.index[max_sample_size:], inplace=True)
df['text'] = df['text'].apply(clean_tweet)
df['text']= df['text'].astype(str)
combined_tweets = '. '.join(df['text'])

In [12]:
vectorizer = CountVectorizer(ngram_range=(3, 3))
X = vectorizer.fit_transform(df['text'])
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
bertEmbeddings = BERT_embeddings(vectorizer, tokenizer, model)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
distilbert_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

Downloading (…)7e0d5/.gitattributes:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)0e5ca7e0d5/README.md:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

Downloading (…)5ca7e0d5/config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)7e0d5/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Downloading (…)0e5ca7e0d5/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)ca7e0d5/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [14]:
docEmbedding = distilbert_model.encode(combined_tweets)

In [15]:
newDF = pd.DataFrame(columns=['keyphrase', 'score'])

In [None]:
for i, embedding in enumerate(bertEmbeddings):
  cos_sim = util.cos_sim(docEmbedding, embedding)[0]
  cos_sim = cos_sim.tolist()
  cos_sim = cos_sim[0]
  entry = {'keyphrase': vectorizer.get_feature_names_out()[i], 'score': cos_sim}
  newDF = newDF.append(entry, ignore_index=True)

In [17]:
scaler = MinMaxScaler()
# fit and transform the 'col1' column
newDF['score_normalized'] = scaler.fit_transform(newDF[['score']])
newDF = newDF.sort_values(by='score_normalized', ascending=False)

In [18]:
newDF.head(20)

Unnamed: 0,keyphrase,score,score_normalized
157,ceremonies years olympics,0.04742,1.0
310,everything usa olympics,0.036192,0.936313
467,hope india olympics,0.036025,0.935367
746,olympics events live,0.029259,0.89699
258,djeuphoric garybolyerart olympics,0.028824,0.894522
1174,today olympics paralympics,0.023401,0.863763
207,coverage bbc olympics,0.018856,0.837981
1147,thekkw possible olympics,0.01484,0.815202
1175,today tokyo olympics,0.014517,0.813373
727,olympicgames tokyo gazza_jenks,0.013885,0.809787


In [19]:
candidates = ['watching tiktok videos', 'tiktok videos athletes', 'videos athletes olympics', 'athletes olympics village', 'olympics village kinda', 'village kinda makes', 'kinda makes feel', 'makes feel nostalgic', 'feel nostalgic id', 'nostalgic id like', 'x parra kits', 'parra kits something', 'profound slap face', 'slap face corrupt', 'face corrupt world', 'corrupt world professional', 'world professional cycling', 'congrats dr kiesenhofer', 'dr kiesenhofer phd', 'kiesenhofer phd mathematics', 'phd mathematics first', 'mathematics first win', 'first win olympic', 'win olympic medal', 'olympic medal austria', 'tokyo olympics roommates', 'olympics roommates shushiladevi', 'roommates shushiladevi mirabai_chanu', 'shushiladevi mirabai_chanu send', 'mirabai_chanu send emotional', 'send emotional thank', 'emotional thank message', 'thank message manipur', 'rawstory unamerican root', 'unamerican root antivaxxers', 'japan possibly impacting', 'possibly impacting olympic', 'impacting olympic events', 'olympic events nbcdfwweather', 'events nbcdfwweather tokyo', 'anything see rn', 'see rn olympics', 'olympics style worthy', 'style worthy gold', 'worthy gold medal', 'gold medal via enews', 'situation stark sign', 'stark sign financial', 'sign financial difficulties', 'financial difficulties faced', 'difficulties faced bbc', 'faced bbc combine', 'bbc combine high', 'combine high expect', 'anyone watch things', 'watch things arent', 'things arent olympics', 'arent olympics time', 'olympics time year', 'seriously one paying', 'one paying attention', 'paying attention said', 'attention said team', 'said team usa', 'team usa mens', 'usa mens basketball', 'mens basketball going', 'basketball going get', 'going get washed', 'get washed olympics', 'lizziedeignan amazing effort', 'amazing effort earlier', 'effort earlier today', 'earlier today tokyo', 'today tokyo olympics', 'tokyo olympics weekend', 'olympics weekend first', 'weekend first time', 'first time ive', 'fifacom transfermarkt officially', 'transfermarkt officially declared', 'anime music volleyball', 'music volleyball matches', 'volleyball matches wont', 'matches wont impressed', 'wont impressed h', 'come india chances', 'india chances win', 'chances win medals', 'win medals archery', 'medals archery skeet', 'archery skeet tomorrow', 'setting benchmarks olympics', 'benchmarks olympics season', 'wow incredible gymnastic', 'incredible gymnastic career', 'gymnastic career champion', 'career champion olympics', 'champion olympics age', 'bad ass bad', 'markxdavies', 'presume discovered', 'discovered answer', 'answer wot', 'wot rowing', 'rowing coverage', 'coverage question', 'question bbc', 'bbc olympics', 'olympics cover', 'watching olympics finding', 'olympics finding new', 'finding new cars', 'new cars exist', 'cars exist every day', 'shirt skateboard olympics', 'skateboard olympics lol', 'brexit olympics', 'uploaded purple scaredy', 'purple scaredy cat', 'scaredy cat performance', 'cat performance video', 'performance video using', 'video using olympics', 'using olympics sportswear', 'olympics sportswear show', 'sportswear show support', 'tokyo cool good', 'cool good luck', 'ianfmartin alfiejapanorama', 'alfiejapanorama thomaskyhn', 'thomaskyhn olympics', 'olympics continue somehow', 'continue somehow ioc', 'somehow ioc build', 'ioc build maintain', 'like even canoe', 'even canoe slalom', 'canoe slalom stressful', 'slalom stressful watch', 'stressful watch olympics', 'gerekmeinhardt leetothekiefer', 'btw us womens', 'hours uncut sec', 'uncut sec replay', 'sec replay som', 'djeuphoric garybolyerart', 'garybolyerart olympics', 'olympics teamusa fiba', 'teamusa fiba equipefra', 'fiba equipefra hell', 'equipefra hell dont', 'hell dont support', 'dont support america', 'support america enough', 'america enough r', 'team usa way', 'usa way god', 'way god love', 'god love watching', 'love watching olympics', 'team_canada_fan olympics', 'olympics highcascade teamcanada', 'highcascade teamcanada cbcolympics', 'teamcanada cbcolympics full', 'cbcolympics full body', 'full body experience', 'body experience think', 'experience think muscle', 'get woke go', 'woke go broke', 'go broke us', 'broke us men', 'us men lose', 'men lose france', 'lose france mens', 'pride something th', 'ok time think', 'time think ive', 'think ive witnessed', 'ive witnessed road', 'witnessed road race', 'road race olympics', 'last weeks ceremony', 'weeks ceremony threatened', 'ceremony threatened another', 'threatened another virus', 'another virus woke', 'virus woke politics', 'woke politics faithful', 'politics faithful companion', 'faithful companion cancel', 'companion cancel culture', 'cancel culture th', 'proud olympian talha', 'olympian talha talib', 'talha talib power', 'talib power pakistan', 'power pakistan rooting', 'pakistan rooting weightlifting', 'rooting weightlifting olympics', 'hope india olympics', 'india olympics hockey', 'olympics hockey team', 'hockey team takes', 'team takes inspiration', 'takes inspiration mangtec', 'least take lauturo', 'take lauturo romero', 'lauturo romero may', 'romero may seem', 'may seem tired', 'seem tired rest', 'tired rest needs', 'bearish olympics one', 'olympics one usa', 'one usa wants', 'usa wants tune', 'wants tune watch', 'tune watch usa', 'watch usa lose', 'usa lose much', 'talk todays podcast', 'todays podcast carolyn', 'podcast carolyn murray', 'carolyn murray speaks', 'murray speaks hometown', 'speaks hometown athlete', 'hometown athlete raven', 'athlete raven saunders', 'raven saunders training', 'olympics wild tennis', 'wild tennis players', 'tennis players basically', 'players basically win', 'basically win wimbledon', 'win wimbledon win', 'compete make earth', 'make earth better', 'earth better place', 'better place instead', 'next olympics must', 'olympics must wear', 'must wear carvelas', 'ashersmith adam peaty', 'adam peaty katari', 'gregg popovich riding', 'popovich riding decades', 'riding decades ago', 'decades ago since', 'ago since losing', 'since losing tim', 'losing tim duncan', 'tim duncan pop', 'duncan pop pushed', 'pop pushed away', 'pushed away top', 'away top players', 'top players think', 'players think w', 'abortion tried keep', 'tried keep secret', 'abc sports team', 'sports team usa', 'team usa mens', 'usa mens basketball', 'mens basketball loses', 'basketball loses france', 'loses france first', 'france first loss', 'first loss olympics', 'loss olympics since', 'anna kiesenofer wins', 'kiesenofer wins womens', 'wins womens cycling', 'womens cycling road', 'cycling road race', 'road race building', 'race building minute', 'building minute lead', 'minute lead peloton', 'lead peloton extra', 'peloton extra coolness', 'extra coolness ra', 'american women stay', 'women stay carrying', 'stay carrying usa', 'carrying usa backs', 'usa backs olympics', 'gymnastics mesmerising watch', 'mesmerising watch olympics', 'watch olympics tokyo', 'olympics tokyo gbr', 'uttered three words', 'three words nonfan', 'words nonfan spouse', 'nonfan spouse wants', 'spouse wants hear', 'wants hear watching', 'hear watching baseball', 'watching baseball tonight', 'baseball tonight ill', 'tonight ill give', 'ill give lance', 'give lance lynn', 'way know helped', 'know helped family', 'helped family lot', 'family lot said', 'lot said joe', 'said joe schroeder', 'culturecentral wasnt good', 'wasnt good move', 'good move shouldnt', 'move shouldnt missed', 'shouldnt missed olympics', 'hello big google', 'big google doc', 'google doc michigan', 'doc michigan olympics', 'michigan olympics updated', 'olympics updated preview', 'updated preview relatively', 'preview relatively light', 'relatively light night', 'sportscenter ok whats', 'ok whats excuses', 'whats excuses olympics', 'statejmn music got', 'music got played', 'got played olympics', 'played olympics theres', 'olympics theres nothing', 'theres nothing embarrassing', 'paulafr chance happening', 'chance happening paulafr', 'happening paulafr imagine', 'paulafr imagine bbc', 'imagine bbc giving', 'bbc giving fraction', 'trying convince online', 'convince online friends', 'online friends start', 'friends start country', 'start country go', 'country go olympics', 'go olympics watch', 'djstari hey dj', 'hey dj stari', 'dj stari wed', 'stari wed love', 'wed love hear', 'mad bp songs', 'bp songs played', 'songs played olympics', 'diggysinghdeo achieve glory', 'achieve glory world', 'glory world championship', 'world championship asian', 'championship asian games', 'asian games common', 'games common wealth', 'common wealth games', 'wealth games fall', 'watching mens olympic', 'turns canoe slalom', 'canoe slalom apropos', 'slalom apropos metaphor', 'apropos metaphor pandemic', 'metaphor pandemic oh', 'olympics tokyo uci_cycling', 'uci_cycling congratulations anna', 'congratulations anna well', 'anna well done', 'well done girl', 'done girl well', 'girl well done', 'boycott olympics political', 'olympics political reasons', 'political reasons rather', 'reasons rather theyre', 'rather theyre boring', 'white bitches use', 'bitches use cbd', 'use cbd okay', 'cbd okay olympics', 'okay olympics lmfaooo', 'nwpinpdx mpinoe amongst', 'mpinoe amongst german', 'amongst german journalists', 'german journalists covered', 'journalists covered olympics', 'covered olympics claim', 'olympics claim photo', 'claim photo existstaken', 'okay maybe watch', 'clear less olympics', 'less olympics coverage', 'olympics coverage bbc', 'coverage bbc olympics', 'bbc olympics organisers', 'olympics organisers decided', 'organisers decided sell', 'decided sell european', 'media trash woke', 'trash woke olympics', 'woke olympics opening', 'olympics opening ceremony', 'opening ceremony depressing', 'ceremony depressing hell', 'tailorganggang staying away', 'staying away everything', 'away everything usa', 'everything usa olympics', 'usa olympics care', 'olympics care activism', 'care activism patriotism', 'olympics inspiring work', 'inspiring work like', 'work like wont', 'like wont start', 'wont start till', 'start till olympics', 'till olympics also', 'olympics also feel', 'also feel lik', 'nevslin peacock covering', 'peacock covering small', 'covering small olympics', 'small olympics events', 'olympics events live', 'events live early', 'live early mis', 'dad talks tv', 'talks tv olympics', 'tv olympics like', 'olympics like people', 'like people hear', 'people hear cancel', 'hear cancel canoe', 'cancel canoe racing', 'canoe racing something', 'dad yelled wanted', 'yelled wanted see', 'wanted see philippines', 'usa basketball defeated', 'basketball defeated france', 'defeated france first', 'france first olympic', 'first olympic loss', 'need develop strong', 'develop strong opinions', 'strong opinions olympics', 'opinions olympics canoeing', 'chinas yang takes', 'yang takes first', 'takes first gold', 'first gold uneasy', 'gold uneasy tokyo', 'im rooting ultramarins', 'rooting ultramarins olympics', 'raddrambo ayomideyeng dumbass', 'ayomideyeng dumbass would', 'dumbass would think', 'would think lebron', 'think lebron would', 'lebron would lose', 'would lose france', 'lose france olympics', 'bbcsport hours olympics', 'hours olympics coverage', 'olympics coverage seems', 'coverage seems comprise', 'seems comprise rabbiting', 'comprise rabbiting repeats', 'tennis saniaankita tokyo', 'saniaankita tokyo losing', 'tokyo losing winning', 'losing winning position', 'teamusa rocky start', 'rocky start today', 'spains jon rahm', 'jon rahm olympics', 'rahm olympics positive', 'olympics positive covid', 'positive covid test', 'covid test golf', 'bambam olympic bambam', 'bambam olympicstatman first', 'olympicstatman first olympics', 'first olympics watched', 'olympics watched live', 'watched live tv', 'live tv montreal', 'tv montreal del', 'montreal del mar', 'del mar calif', 'mar calif time', 'calif time uncle', 'time uncle home', 'niggahs trash', 'trying watch olympics', 'watch olympics tv', 'olympics tv love', 'tv love jesus', 'olympics issf_shooting', 'issf_shooting new rainbowsiege', 'new rainbowsiege operator', 'rainbowsiege operator reveal', 'ive strained arms', 'strained arms looking', 'arms looking weightlifting', 'looking weightlifting olympics', 'absphysio jackachew im', 'jackachew im sure', 'im sure many', 'sure many people', 'many people enjoy', 'people enjoy thats', 'enjoy thats beauty', 'thats beauty olympics', 'beauty olympics something', 'olympics something everyone', 'absolutely amazing annakiesenhofer', 'amazing annakiesenhofer gold', 'annakiesenhofer gold austria', 'gold austria olympics', 'tunisias ahmed hafnaoui', 'ahmed hafnaoui stuns', 'hafnaoui stuns field', 'stuns field win', 'field win mens', 'win mens freestyle', 'mens freestyle gold', 'freestyle gold via', 'gold via mailsport', 'nigeria hosted olympics', 'hosted olympics michael', 'olympics michael phelps', 'michael phelps comes', 'phelps comes pool', 'comes pool ready', 'pool ready swim', 'ready swim find', 'swim find people', 'everyone thinking ill', 'thinking ill say', 'ill say itbadminton', 'say itbadminton best', 'itbadminton best sport', 'best sport olympics', 'sport olympics badminton', 'olympics badminton olympics', 'smith falls ont', 'falls ont olympic', 'ont olympic games', 'olympic games tokyo', 'games tokyo childhood', 'tokyo childhood friends', 'childhood friends brooke', 'friends brooke henderson', 'brooke henderson bailey', 'henderson bailey andison', 'kysportsradio whats one', 'whats one sec', 'one sec olympics', 'sec olympics didnt', 'olympics didnt know', 'didnt know sec', 'know sec olympics', 'new top story', 'top story time', 'nothing makes feel', 'makes feel better', 'feel better watching', 'better watching yo', 'watching yo kids', 'yo kids compete', 'kids compete olympics', 'mad respect talha', 'respect talha talib', 'talha talib weightlifter', 'talib weightlifter represented', 'weightlifter represented pakistan', 'represented pakistan olympics', 'pakistan olympics one', 'fyp tiktok olympics', 'tiktok olympics stuff', 'slept hard tank', 'hard tank top', 'tank top titties', 'top titties competing', 'titties competing olympics', 'umeshgeeta udaysrana guess', 'udaysrana guess became', 'guess became complacent', 'became complacent argentina', 'complacent argentina lost', 'complacent argentina lost', 'argentina lost couple', 'lost couple olympics', 'rollkurldr way succeed', 'way succeed righ', 'maybe didnt hate', 'didnt hate country', 'hate country much', 'country much might', 'much might chance', 'read heres quick', 'heres quick wrap', 'quick wrap south', 'wrap south africas', 'south africas opening', 'africas opening weekend', 'opening weekend olympicgames', 'weekend olympicgames tokyo', 'olympicgames tokyo gazza_jenks', 'sorry hear many', 'hear many usa', 'many usa teams', 'usa teams losing', 'teams losing world', 'losing world stage', 'world stage knelt', 'stage knelt national', 'knelt national anthem', 'wait im seeing', 'im seeing black', 'seeing black women', 'black women dominating', 'women dominating swimming', 'dominating swimming usa', 'swimming usa forget', 'usa forget everything', 'forget everything ive', 'everything ive said', 'ive said thread', 'tomansmsby swear im', 'swear im actually', 'im actually living', 'actually living haikyuu', 'living haikyuu x', 'haikyuu x olympics', 'x olympics fanart', 'olympics fanart theyre', 'fanart theyre good', 'canadian women bringing', 'women bringing home', 'bringing home hardware', 'home hardware winning', 'hardware winning two', 'relay simone manuel', 'simone manuel e', 'flip olympics cjvosters', 'olympics cjvosters calling', 'cjvosters calling canoe', 'calling canoe slalom', 'canoe slalom actually', 'slalom actually spent', 'actually spent summer', 'spent summer isle', 'summer isle royale', 'isle royale preparing', 'win gold olympics', 'gold olympics backstroke', 'mann ki baat', 'ki baat pm', 'baat pm modi', 'pm modi talks', 'modi talks tokyo', 'talks tokyo olympics', 'tokyo olympics independence', 'olympics independence day', 'independence day celebrations', 'france fuck olympics', 'fuck olympics lost', 'olympics lost france', 'teamcanada swimmingcanada fina', 'swimmingcanada fina mags_swims', 'fina mags_swims oleksiakpenny', 'mags_swims oleksiakpenny taylor_ruck', 'oleksiakpenny taylor_ruck tokyo', 'taylor_ruck tokyo olympics', 'tokyo olympics really', 'olympics really impress', 'ononokomachi many sports', 'many sports cutoffs', 'sports cutoffs qualify', 'cutoffs qualify olympics', 'qualify olympics including', 'olympics including gymnastics', 'including gymnastics many', 'gymnastics many countries', 'many countries q', 'favourite image day', 'image day emotional', 'day emotional olympics', 'emotional olympics jeuxolympiques', 'olympics jeuxolympiques tokyo', 'jeuxolympiques tokyo skateboarding', 'tokyo skateboarding st', 'skateboarding st time', 'best luck great', 'luck great sailors', 'great sailors competing', 'sailors competing olympics', 'competing olympics japan', 'olympics japan next', 'japan next couple', 'next couple weeks', 'couple weeks follow', 'highranking', 'ashleigh barty eliminated', 'barty eliminated olympics', 'eliminated olympics firstround', 'sportpsysanika devastating watching', 'devastating watching especially', 'watching especially today', 'especially today olympics', 'today olympics paralympics', 'olympics paralympics definitely', 'paralympics definitely need', 'definitely need co', 'jschmukler bien por', 'bien por judefobo', 'por judefobo algerian', 'judefobo algerian suspended', 'algerian suspended world', 'suspended world judo', 'world judo refusal', 'judo refusal face', 'refusal face israeli', 'face israeli olympics', 'iamuwaomawisdom youll turn', 'youll turn relationship', 'turn relationship sex', 'relationship sex olympics', 'sport favorite olympics', 'favorite olympics mine', 'olympics reporting theeratee', 'reporting theeratee enit', 'theeratee enit njan', 'enit njan thanne', 'njan thanne orenam', 'than orenam host', 'orenam host cheyam', 'girl boss gold', 'would last seconds', 'last seconds canoeslalom', 'seconds canoeslalom course', 'canoeslalom course looks', 'course looks fun', 'looks fun olympics', 'fun olympics tokyo', 'seriously today never', 'today never noticed', 'never noticed song', 'noticed song played', 'song played olympics', 'iam_johnw fucking olympics', 'johnw fucking olympics', 'tburnsatlanta olympics tokyo happily', 'olympics tokyo happily surprised', 'gold silver bronze', 'silver bronze medalist', 'bronze medalist sprinting', 'medalist sprinting tokyo', 'sprinting tokyo olympics', 'watching events like', 'events like canoe', 'like canoe salom', 'canoe salom road', 'salom road cycling', 'road cycling air', 'cycling air rifle', 'air rifle etc', 'rifle etc love', 'etc love olympics', 'olympics pumped women', 'lets goooo tokyodoge', 'new national hero', 'national hero talhatalib', 'hero talhatalib olympics', 'talhatalib olympics cant', 'olympics cant proud', 'like olympics bc', 'olympics bc every', 'bc every time', 'every time turn', 'time turn get', 'turn get see', 'get see people', 'see people dedicated', 'people dedicated lives', 'dedicated lives sport', 'lives sport ive', 'sport ive n', 'woke see yuto', 'see yuto first', 'yuto first gold', 'first gold medal', 'gold medal skateboarding', 'medal skateboarding olympics', 'skateboarding olympics letsgoooo', 'brienrea cnbc one', 'cnbc one favorite', 'one favorite things', 'favorite things summer', 'things summer olympics', 'summer olympics water', 'olympics water polo', 'cavershamshep alstewartobe teamgb', 'alstewartobe teamgb olympics', 'teamgb olympics csjdujardin', 'olympics csjdujardin bbcsport', 'csjdujardin bbcsport good', 'bbcsport good job', 'good job gbnews', 'job gbnews dont', 'gbnews dont rights', 'paperhat comprehensiveschool amsterdam', 'comprehensiveschool amsterdam artandcraftsclass', 'amsterdam artandcraftsclass origami', 'artandcraftsclass origami still', 'origami still work', 'gooner_saad aese log', 'aese log reh', 'log reh jaty', 'reh jaty hein', 'jaty hein olympics', 'hein olympics se', 'watched yearold woman', 'yearold woman win', 'woman win gold', 'win gold medal', 'gold medal cycling', 'medal cycling olympics', 'cycling olympics ph', 'olympics ph cambridge', 'nothing lazy anna', 'lazy anna kiesenhofer', 'anna kiesenhofer amazing', 'kiesenhofer amazing race', 'kayaking olympic sport', 'olympic sport idea', 'sport idea kayaking', 'idea kayaking olympics', 'hear bar_built olympics', 'bar_built olympics supplies', 'olympics supplies last', 'supplies last cheer', 'last cheer team', 'cheer team usa', 'team usa mixed', 'usa mixed box', 'mixed box built', 'box built top', 'built top f', 'chipsandgist hoopshype im', 'hoopshype im sure', 'im sure hes', 'sure hes paid', 'hes paid something', 'paid something part', 'something part time', 'part time gig', 'time gig mean', 'gig mean well', 'mean well see', 'well see likes', 'years indigenous alaskans', 'indigenous alaskans hosted', 'alaskans hosted olympics', 'badminton seem easy', 'seem easy play', 'easy play back', 'play back yard', 'back yard vicious', 'yard vicious olympics', 'dominant performance annakiesenhofer', 'performance annakiesenhofer go', 'van vleuten mistakes', 'vleuten mistakes silve', 'nba heres robot', 'heres robot overlord', 'robot overlord hitting', 'overlord hitting free', 'hitting free throws', 'free throws olympics', 'slalom canoeing looks', 'canoeing looks like', 'looks like difficult', 'like difficult event', 'difficult event olympics', 'simone biles advances', 'biles advances events', 'advances events usa', 'events usa finishes', 'usa finishes behind', 'finishes behind roc', 'behind roc womens', 'roc womens gymnastics', 'womens gymnastics qualifiers', 'us loses france', 'loses france game', 'france game olympic', 'game olympic win', 'olympic win streak', 'win streak ends', 'actually olympics greece', 'olympics greece turkey', 'greece turkey compete', 'turkey compete see', 'compete see truly', 'see truly hellenic', 'olympics nd round', 'nd round djokovic', 'round djokovic vs', 'djokovic vs struff', 'vs struff july', 'struff july th', 'july th around', 'th around pm', 'around pm local', 'pm local time', 'local time los', 'time los angeles', 'los angeles nymiami', 'could possibly go', 'possibly go wrong', 'people wfh go', 'wfh go olympics', 'go olympics sa', 'olympics sa would', 'sa would bring', 'would bring home', 'bring home', 'bring home atleast', 'home atleast gold', 'atleast gold medals', 'child wanted swim', 'wanted swim olympics', 'economics geopolitics hosting', 'geopolitics hosting olympicsan', 'hosting olympicsan incisive', 'olympicsan incisive article', 'incisive article dealbook', 'article dealbook new', 'dealbook new york', 'new york times', 'got olympics games', 'olympics games every', 'games every fuckin', 'every fuckin thing', 'jeauxvaughn could watch', 'could watch rowing', 'watch rowing tennis', 'rowing tennis soccer', 'tennis soccer teams', 'soccer teams expensive', 'teams expensive cable', 'expensive cable hoops', 'lawyer', 'olympics game germany', 'game germany brazil', 'germany brazil saw', 'brazil saw goals', 'saw goals beautiful', 'goals beautiful thin', 'beautiful thin line', 'thin line small', 'line small mistakes', 'small mistakes cen', 'manipuris bringing olympics', 'bringing olympics laurels', 'olympics laurels call', 'laurels call upon', 'call upon pmoindia', 'upon pmoindia amp', 'pmoindia amp union', 'amp union minister', 'union minister law', 'minister law kirenrijiju', 'law kirenrijiju start', 'bad list olympics', 'list olympics boxing', 'olympics boxing eltrenramirez', 'said one guy', 'one guy knows', 'guy knows hard', 'knows hard work', 'hard work grow', 'work grow farm', 'grow farm like', 'farm like yo', 'like yo isnt', 'yo isnt allowed', 'half shit put', 'shit put olympics', 'put olympics even', 'olympics even come', 'rowlandrivals world caught', 'world caught rosters', 'caught rosters filled', 'rosters filled w', 'filled w nba', 'w nba players', 'nba players amp', 'players amp dont', 'amp dont send', 'dont send best', 'send best cycl', 'first time since', 'time since us', 'since us mens', 'us mens basketball', 'mens basketball team', 'basketball team lost', 'team lost olympics', 'lost olympics americans', 'olympics americans quest', 'americans quest fo', 'tokyo olympics france', 'olympics france beat', 'france beat usa', 'end unbeaten run', 'unbeaten run via', 'via djstari hii', 'hii dj stari', 'dj stari wed', 'stari wed love', 'wed love hear', 'love hear dontgoyet', 'hear dontgoyet camila_cabello', 'dontgoyet camila_cabello playing', 'camila_cabello playing olympics', 'playing olympics jamming', 'olympics jamming wi', 'jamming wi hoopfeed', 'wi hoopfeed maybe', 'hoopfeed maybe learn', 'maybe learn shoot', 'learn shoot next', 'shoot next olympics', 'next olympics certainly', 'olympics certainly gave', 'certainly gave great', 'gave great effort', 'great effort us', 'effort us lawmakers', 'us lawmakers accuse', 'lawmakers accuse ioc', 'accuse ioc setting', 'ioc setting dark', 'setting dark precedent', 'dark precedent china', 'precedent china winter', 'china winter olympics', 'winter olympics defending', 'olympics defending champion', 'defending champion andy', 'champion andy murray', 'andy murray withdraws', 'murray withdraws tennis', 'withdraws tennis singles', 'tennis singles event', 'singles event tokyo', 'event tokyo olympics', 'tokyo olympics already', 'olympics already beat', 'vluten admitted didnt', 'admitted didnt know', 'didnt know kiesenhofer', 'know kiesenhofer still', 'kiesenhofer still crazy', 'still crazy mcc', 'crazy mcc views', 'mcc views fucking', 'views fucking olympics', 'fucking olympics hero', 'olympics hero nation', 'hero nation u', 'nation u made', 'u made us', 'made us proud', 'us proud never', 'proud never though', 'never though pakistan', 'though pakistan would', 'pakistan would ever', 'would ever even', 'would ever even', 'even participate olympics', 'advil like swimmer', 'swimmer syria olympics', 'olympics team tend', 'tend root us', 'win judo golds', 'golds olympics tokyo_doge', 'tokyo_doge elonmusk mayemusk', 'mayemusk olympics tokyo', 'tokyo looking foward', 'looking foward one', 'foward one thread', 'one thread ever', 'thread ever steeped', 'ever steeped skateboard', 'steeped skateboard many', 'skateboard many bad', 'many bad takes', 'bad takes favorite', 'takes favorite people', 'favorite people think', 'people think sk sport', 'sport sportnews bradly', 'sportnews bradly sinden', 'sinden team gb', 'gb taekwondo star', 'taekwondo star vows', 'star vows learn', 'vows learn gold', 'learn gold nearmiss', 'gold nearmiss perky_shadows', 'perky_shadows charliepeagle worldwidewob', 'charliepeagle worldwidewob lol', 'worldwidewob lol theres', 'lol theres difference', 'theres difference tough', 'difference tough defense', 'tough defense mma', 'defense mma call', 'mma call whitewater', 'call whitewater rafting', 'whitewater rafting olympics', 'olympics new favorite', 'new favorite event', 'favorite event best', 'event best events', 'best events summer', 'events summer olympics', 'olympics track gymnastics', 'gymnastics swimmingdiving olympics', 'swimmingdiving olympics teamusa', 'olympics teamusa fiba', 'teamusa fiba equipefra', 'fiba equipefra fact', 'equipefra fact jrue', 'fact jrue put', 'jrue put numbers', 'put numbers less', 'numbers less days', 'less days finals', 'days finals ru omg', 'finals ru omg dad', 'omg dad watched', 'dad watched olympics', 'watched olympics morning', 'olympics morning yelling', 'morning yelling tv', 'yelling tv clapping', 'tv clapping hard', 'clapping hard woke', 'hard woke ok', 'woke ok shotaro', 'ok shotaro probably', 'shotaro probably busy', 'probably busy watching', 'busy watching olympics', 'watching olympics sungchan', 'olympics sungchan yall', 'sungchan yall annoying', 'yall annoying making', 'annoying making fuss', 'making fuss song', 'fuss song playing', 'song playing olympics', 'playing olympics yall', 'playing olympics yall', 'withering heat watch', 'heat watch katie', 'watch katie ledecky', 'katie ledecky get', 'ledecky get first', 'get first chance', 'first chance medal', 'olympics cameronvdburgh dude', 'cameronvdburgh dude definitely', 'dude definitely shape', 'definitely shape man', 'shape man keep', 'man keep getting', 'keep getting goals', 'keep getting goals', 'tropical storm nepartak', 'storm nepartak heads', 'nepartak heads toward', 'heads toward japan', 'toward japan possibly', 'japan possibly impacting', 'possibly impacting tokyoolympic', 'impacting tokyoolympic events', 'gabbylogan piersmorgan yes', 'piersmorgan yes enforced', 'yes enforced shielding', 'enforced shielding govt', 'shielding govt letting', 'govt letting covid', 'letting covid rip', 'covid rip someth', 'think id enjoying', 'rogershelps hahahahahahaha yes', 'hahahahahahaha yes happening', 'yes happening since', 'happening since switched', 'since switched ignite', 'switched ignite ago', 'rediscovering highs lows', 'highs lows competitive', 'lows competitive athletics', 'competitive athletics spending', 'athletics spending free', 'spending free time', 'free time watching', 'basketball fans unfavorably', 'fans unfavorably comparing', 'unfavorably comparing kevin', 'comparing kevin durant', 'kevin durant lebron', 'durant lebron james', 'lebron james compared', 'james compared alternative', 'compared alternative claim', 'alternative claim k', 'nothing makes feel', 'makes feel less', 'feel less accomplished', 'less accomplished hangoverwatching', 'accomplished hangoverwatching olympics', 'hangoverwatching olympics sunday', 'olympics sunday afternoon', 'sunday afternoon tokyo', 'olympics day currently', 'day currently trending', 'currently trending ireland', 'trending ireland leothesewerrat', 'ireland leothesewerrat galvmay', 'leothesewerrat galvmay shane', 'galvmay shane walsh', 'shane walsh olympics', 'walsh olympics k', 'olympics k kerryvcork', 'k kerryvcork olympics', 'kerryvcork olympics thats', 'olympics thats really', 'thats really impressive', 'really impressive yungsnaku', 'impressive yungsnaku olympics', 'yungsnaku olympics su', 'tokyo olympics usa', 'olympics usa shocked', 'usa shocked france', 'shocked france mens', 'france mens olympic', 'mens olympic basketball', 'olympic basketball tokyo', 'basketball tokyo olympics', 'tokyo olympics usa', 'olympics usa shocked', 'usa shocked france', 'shocked france mens', 'france mens olympic', 'mens olympic basketball', 'olympic basketball tokyo', 'basketball tokyo olympics', 'tokyo olympics news', 'olympics news times', 'news times indiaviva', 'times indiaviva franc', 'indiaviva franc watching', 'franc watching olympic', 'watching olympic cycling', 'olympic cycling like', 'cycling like seeing', 'like seeing long', 'seeing long lost', 'long lost friend', 'lost friend olympics', 'friend olympics atari', 'olympics atari okay', 'atari okay id', 'okay id forgotten', 'id forgotten much', 'forgotten much like', 'much like olympics', 'like olympics giladerdan', 'mayemusk olympics tokyo', 'olympics tokyo looking', 'tokyo looking forward', 'looking forward presale', 'forward presale cant', 'presale cant wait']

In [20]:
keyphrases = []
for idx, k in newDF.iterrows():
  keyphrases.append(k['keyphrase'])

In [23]:
s1 = set(candidates)
s2 = set(keyphrases)
print(len(s1), len(s2), len(s1 & s2))

1279 1376 837


In [24]:
print(len(s1 & s2) / len(s1))

0.6544175136825645
