In [1]:
with open("klej_ar/dev.tsv", "r") as f:
    raw_dev = f.readlines()

In [2]:
def prepare_data(raw_data):
    corpus = []
    labels = []
    for doc in raw_data:
        record = doc.strip().split("\t")
        if len(record) != 2:
            continue
        text, target = record
        label = int(float(target))
        corpus.append(text)
        labels.append(label)
    return corpus, labels

In [3]:
test_corpus, test_labels = prepare_data(raw_dev[1:])

# K-means

```
!pip install kmeans-pytorch
```

In [4]:
from collections import Counter
import torch
from sklearn.metrics import homogeneity_score
from transformers import RobertaModel, HerbertTokenizer
from kmeans_pytorch import kmeans

In [5]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
herbert = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")

In [7]:
herbert = herbert.to(DEVICE)

In [8]:
PAD_TOKEN_ID = tokenizer.pad_token_id

def transform_data(docs, max_len, model=herbert):
    tokenized = tokenizer(docs)
    input_ids = tokenized["input_ids"]
    for i, inp in enumerate(input_ids):
        inp_len = len(inp)
        inp = inp[:max_len] + [PAD_TOKEN_ID] * (max_len - inp_len)
        input_ids[i] = inp
    X_raw = torch.LongTensor(input_ids).to(DEVICE)
    with torch.no_grad():
        X = model(X_raw)[0]
    return X

In [9]:
X = transform_data(test_corpus, 120)

In [10]:
X.shape

torch.Size([977, 120, 768])

In [11]:
X = torch.mean(X, dim=1)

In [12]:
X.shape

torch.Size([977, 768])

In [14]:
K = 5
DISTANCE = "euclidean"

In [15]:
cluster_ids, cluster_centers = kmeans(X=X, num_clusters=K, distance=DISTANCE, device=DEVICE)

[running kmeans]: 5it [00:00, 34.24it/s, center_shift=0.932255, iteration=6, tol=0.000100]  

running k-means on cpu..


[running kmeans]: 21it [00:00, 37.23it/s, center_shift=0.000000, iteration=21, tol=0.000100]

In [16]:
cluster_centers

tensor([[-0.7501,  0.2049, -0.1205,  ..., -0.0583,  0.0391, -0.0982],
        [-0.9566, -0.0715,  0.0968,  ..., -0.0259,  0.1986, -0.0875],
        [-1.0970,  0.1626, -0.0283,  ...,  0.0154, -0.1323,  0.0220],
        [-0.5467,  0.3209, -0.1815,  ..., -0.1395,  0.1057, -0.1917],
        [ 0.0822, -0.4587, -1.0303,  ..., -0.8803, -0.4848,  0.6816]])

In [18]:
cluster_ids[:10]

tensor([2, 2, 3, 2, 2, 3, 2, 3, 3, 0])

In [19]:
cluster_ids_list = cluster_ids.tolist()

In [20]:
counter = Counter(zip(test_labels, cluster_ids_list))

In [22]:
len(counter)

24

In [24]:
counter

Counter({(3, 2): 45,
         (4, 2): 90,
         (1, 3): 66,
         (5, 2): 107,
         (5, 3): 93,
         (1, 0): 57,
         (2, 2): 45,
         (3, 3): 44,
         (5, 0): 91,
         (1, 2): 79,
         (4, 3): 53,
         (2, 0): 26,
         (2, 3): 40,
         (4, 0): 51,
         (5, 1): 7,
         (1, 1): 7,
         (3, 0): 41,
         (3, 1): 6,
         (4, 1): 1,
         (2, 1): 5,
         (5, 4): 17,
         (4, 4): 3,
         (2, 4): 2,
         (3, 4): 1})

In [25]:
homogeneity_score(test_labels, cluster_ids_list)

0.012783818729679933

## pykeops

```
!pip install pykeops[full]
```

https://www.kernel-operations.io/keops/_auto_tutorials/kmeans/plot_kmeans_torch.html