In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the encoder from the same pretrained checkpoint so that
# the vocabulary used for tokenization matches the model weights.
MODEL_NAME = "DeepPavlov/bert-base-multilingual-cased-sentence"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [None]:
from sklearn.manifold import TSNE
import plotly.express as px

import torch
from torch.utils.data import DataLoader

import pandas as pd

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('Using device:', device)

In [None]:
# Move the model's weights to the selected device. Re-binding `model` (Module.to
# returns the module itself) keeps the cell output quiet without assigning to `_`,
# which IPython uses to hold the previous cell's output.
model = model.to(device)

In [None]:
# Read the tweets dataset from the Kaggle input directory, then display its
# (rows, columns) shape as the cell output.
TWEETS_CSV = '/kaggle/input/jair-bolsonaro-twitter-data/bolsonaro_tweets.csv'

tweets = pd.read_csv(TWEETS_CSV)
tweets.shape

In [None]:
# Peek at the first row to see which columns the CSV provides.
tweets.head(1)

# Calculating Sentence Embeddings

You can also use the [sentence-transformers](https://github.com/UKPLab/sentence-transformers) package to compute sentence embeddings.

In [None]:
# Mean Pooling - Take attention mask into account for correct averaging
# ref: https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens

def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, counting only unmasked positions.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings (assumed to be (batch, seq_len, hidden) -- TODO confirm).
        attention_mask: Tensor with 1 for real tokens and 0 for padding; it is
            given a trailing axis and expanded to the token-embedding shape.

    Returns:
        Tensor with dim 1 reduced away: the mask-weighted sum of the token
        embeddings divided by the number of unmasked positions.
    """
    hidden_states = model_output[0]  # first element contains all token embeddings

    # Broadcast the mask over the trailing axis so masked positions contribute zero.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    masked_total = (hidden_states * mask).sum(1)
    # Clamp the divisor so a fully masked row yields zeros rather than NaN.
    token_counts = mask.sum(1).clamp(min=1e-9)

    return masked_total / token_counts

In [None]:
# Batch the raw tweet texts ten at a time. Shuffling stays off so that the
# embeddings come out in the same row order as `tweets`, which the later
# cluster-label assignment relies on.
texts = tweets['text'].tolist()
dl = DataLoader(texts, shuffle=False, batch_size=10)

In [None]:
# Sanity check: number of batches next to the DataFrame's (rows, columns) shape.
len(dl), tweets.shape

In [None]:
embeddings = []

for batch in dl:
    # Tokenize the batch (padded, truncated at 512 tokens) and move the
    # resulting tensors to the same device as the model.
    encoded_input = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    ).to(device)

    # Inference only: no gradients are needed for the forward pass or the pooling.
    with torch.no_grad():
        model_output = model(**encoded_input)
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Collect each batch on the CPU as a NumPy array; a later cell concatenates them.
    embeddings.append(sentence_embeddings.cpu().numpy())

In [None]:
import numpy as np

In [None]:
# Stack the per-batch arrays along the first axis into a single embedding matrix.
X = np.concatenate(embeddings, axis=0)

In [None]:
# (rows, columns) of the stacked embedding matrix.
X.shape

Each BERT sentence embedding has 768 dimensions.

# Clustering 

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit k-means with 5 clusters on the sentence embeddings; the seed is fixed so
# the clustering is repeatable.
kmeans = (
    KMeans(n_clusters=5, random_state=42)
    .fit(X)
)

In [None]:
# Show the fitted centroids (one row per cluster, one column per embedding dimension).
kmeans.cluster_centers_

In [None]:
# Attach each tweet's k-means cluster id. labels_ follows the row order of X,
# which matches `tweets` because the DataLoader was not shuffled.
tweets = tweets.assign(cluster=kmeans.labels_)

Use t-SNE to perform dimensionality reduction.

In [None]:
# Project the embeddings down to 2-D with t-SNE for plotting. t-SNE is
# stochastic, so the seed is fixed (same value as the KMeans cell) to make the
# layout, and therefore the final figure, reproducible across runs.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(X)

In [None]:
# Wrap the 2-D t-SNE coordinates in a DataFrame with named axes for plotting.
df = pd.DataFrame(X_embedded, columns=['X', 'Y'])

In [None]:
# Confirm the coordinate frame's (rows, columns) shape before joining it onto tweets.
df.shape

In [None]:
# Append the t-SNE coordinates to the tweets column-wise (rows are matched by index).
# Any X/Y columns left over from a previous run of this cell are dropped first,
# so re-running it does not create duplicate "X"/"Y" columns that would be
# ambiguous for the scatter plot.
tweets = pd.concat([tweets.drop(columns=['X', 'Y'], errors='ignore'), df], axis=1)

In [None]:
# Inspect the first row again, now that the cluster and X/Y columns have been added.
tweets.head(1)

# Visualization

In [None]:
# Cluster ids are category labels, not quantities. Plotly Express maps a numeric
# colour column to a continuous colour scale, so the ids are cast to strings on
# a plotting copy to get one discrete colour and one legend entry per cluster.
plot_df = tweets.assign(cluster=tweets['cluster'].astype(str))

fig = px.scatter(
    plot_df,
    x="X",
    y="Y",
    color="cluster",
    hover_data=['text'],
    # Keep the legend in cluster-id order rather than order of first appearance.
    category_orders={"cluster": sorted(plot_df["cluster"].unique())},
)
fig.show()