Hi there! In this notebook you will be able to:
1. Get an embedding representation for every tweet in our dataset.
2. Search over all tweets with your own query.

Enjoy!

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install sentence_transformers

In [None]:
import csv

import numpy as np
import pandas as pd
import scipy
import scipy.spatial.distance  # explicit: `import scipy` alone does not guarantee the submodule is loaded
from sentence_transformers import SentenceTransformer


In [None]:
# Location of the vaccination tweets CSV inside the Kaggle input directory.
tweets_csv_path = "/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv"
tweets = pd.read_csv(tweets_csv_path)

# Preview the first rows as the cell output.
tweets.head()

In [None]:
# The corpus to embed and search is the `text` column of the tweets table.
corpus = tweets.loc[:, "text"]

# Display the corpus as the cell output.
corpus

In [None]:
# Name of the pretrained checkpoint used to embed tweets and queries.
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)


In [None]:
# Encode every tweet into an embedding (one row per tweet, same order as `corpus`).
# The Series is converted to a plain list of strings first: indexing a Series by
# integer is label-based, so passing the Series itself only behaves like a list
# while it carries a default 0..n-1 index.
corpus_embeddings = model.encode(corpus.tolist())


In [None]:
# Sanity-check the embeddings before using them.
# Per the original note, each tweet is embedded into 768 values, so the expected
# shape is (number of tweets, 768): one vector per tweet.
# NOTE(review): the original note also linked this size to a ~500 word input limit.
# The embedding width does not by itself show how much text the model accepts;
# confirm the limit against the model's maximum sequence length.

corpus_embeddings.shape

In [None]:
# Look inside a single tweet embedding: the entry at index 0 of corpus_embeddings.
corpus_embeddings[0]

# Semantic Similarity Search:

Once we have an embedding representation for every tweet in our corpus we can perform similarity tasks.

Here we apply three types of search:
1. A custom query to find relevant tweets
2. Connecting a specific tweet to other relevant tweets
3. Clustering all tweets into groups by their distance

# 1. Custom query to find relevant tweets
In the example below, I'm interested in finding tweets that talk about vaccinating elderly people.

In [None]:
# Embed the free-text query (one embedding per query string).
queries = ["vaccination elderly patients"]
query_embeddings = model.encode(queries)

# Number of best matches printed for each query.
closest_n = 10

for query, query_embedding in zip(queries, query_embeddings):
    # Cosine distance between this query and every tweet embedding.
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # (tweet position, distance) pairs ordered from nearest to farthest.
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 10 most relevant tweets for your query:")

    for idx, distance in ranked[:closest_n]:
        similarity = 1 - distance  # the printed score is 1 minus the cosine distance
        print("(Score: %.4f)" % similarity, corpus[idx].strip(), "|| tweet index:", idx )

# 2. Connecting a specific tweet to other relevant tweets


In [None]:
# Use one tweet from the corpus as the query and list the tweets closest to it.
tweet_idx = 200  # index of the specific tweet chosen as the query
queries = [corpus[tweet_idx]]
query_embeddings = model.encode(queries)

closest_n = 5  # number of similar tweets to print
for query, query_embedding in zip(queries, query_embeddings):
    # Cosine distance between the query tweet and every tweet embedding.
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # Drop the query tweet by its index rather than by skipping the first sorted
    # result: a duplicate tweet or float noise can put another tweet in first
    # place, and slicing [1:closest_n] returned only closest_n - 1 results.
    # Assumes the position in corpus_embeddings equals the corpus label
    # (default 0..n-1 index), as the corpus[idx] lookups below already do.
    results = sorted(
        ((idx, distance) for idx, distance in enumerate(distances) if idx != tweet_idx),
        key=lambda pair: pair[1],
    )

    print("\n\n======================\n\n")
    print("Tweet:", query)
    # Header is derived from closest_n so it cannot disagree with the number of rows printed.
    print("\nTop %d most similar tweets in corpus:" % closest_n)

    for idx, distance in results[:closest_n]:
        print("(Score: %.4f)" % (1-distance), corpus[idx].strip(), "|| tweet index:", idx )

# 3. Mapping all tweets by their distance

This grouping process can be useful for later search and for static inference on this data.

By the way, I'd be happy to hear suggestions for visualizing this in a meaningful way.

In [None]:

# Every tweet in the corpus acts as a query against the whole corpus.
queries = corpus
# The queries are the corpus itself, which is already encoded in
# corpus_embeddings, so reuse it instead of running the model a second time.
query_embeddings = corpus_embeddings

# Defined here so this cell does not depend on a value left over from an earlier cell.
closest_n = 5

for query_idx, (query, query_embedding) in enumerate(zip(queries, query_embeddings)):
    # One row of distances at a time keeps memory at O(n) instead of a full n x n matrix.
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # Drop the query tweet by its own position rather than by skipping the first
    # sorted result (duplicates can displace it), then keep the closest_n nearest.
    # Slicing [1:closest_n] previously returned only closest_n - 1 results.
    results = sorted(
        ((idx, distance) for idx, distance in enumerate(distances) if idx != query_idx),
        key=lambda pair: pair[1],
    )

    print("\n\n======================\n\n")
    print("Query:", query)
    # Header is derived from closest_n so it cannot disagree with the number of rows printed.
    print("\nTop %d most similar sentences in corpus:" % closest_n)

    for idx, distance in results[:closest_n]:
        print("(Score: %.4f)" % (1-distance), corpus[idx].strip(), "|| tweet index:", idx )