In [76]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from tqdm import tqdm

In [77]:
# Mean pooling - average token embeddings while ignoring padded positions
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence over its attended tokens.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over the
            embedding dimension and used as the averaging weight.

    Returns:
        One pooled vector per sequence: the mask-weighted sum of token
        embeddings divided by the mask total along dimension 1.
    """
    token_embeddings = model_output[0]
    # Give the mask a trailing axis and stretch it to the embeddings' shape.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_total = (token_embeddings * mask).sum(dim=1)
    # Clamp the denominator so a fully masked row cannot divide by zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

In [78]:
# Load the tokenizer and encoder from a single checkpoint name so the two
# cannot drift apart.
MODEL_NAME = "microsoft/mdeberta-v3-base"
# Prefer the first GPU, but fall back to CPU so this cell does not crash on a
# host without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()  # evaluation mode: the model is only used for inference here
model = model.to(device)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.word_embeddings._weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSeque

In [79]:
# Read the train and test splits into DataFrames in one pass.
train, test = map(
    pd.read_csv,
    ("/work/hack/train_dataset.csv", "/work/hack/test_dataset.csv"),
)

In [80]:
# Number of texts tokenized and pushed through the model per iteration of the
# embedding loops.
batch_size = 128

In [81]:
# Embed every training text in mini-batches: tokenize, run the encoder without
# gradient tracking, mean-pool over the attended tokens, and collect one CPU
# tensor per text (in the same order as the rows of `train`).
train_vectors = []
for left in tqdm(range(0, len(train), batch_size)):
    # iloc slicing clamps at the end of the frame, so the last batch may be short.
    batch_texts = train['text'].iloc[left:left + batch_size].tolist()
    encoded_input = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
    # Send the inputs to wherever the model lives instead of a hard-coded
    # device string that can silently disagree with it.
    encoded_input = encoded_input.to(model.device)
    with torch.no_grad():
        model_output = model(**encoded_input)
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    # unbind(0) splits the batch into one 1-D tensor per text.
    train_vectors.extend(sentence_embeddings.cpu().unbind(0))

  0%|          | 0/141 [00:00<?, ?it/s]

100%|██████████| 141/141 [03:47<00:00,  1.61s/it]


In [82]:
# Sanity check: display the pooled embedding of the first training text.
train_vectors[0]

tensor([-4.4176e-02, -5.4999e-02, -2.0382e-02,  4.7547e-02, -2.1778e-01,
        -2.1337e-01, -1.1325e-01, -6.4509e-02, -9.6143e-02, -2.4505e-01,
        -1.1256e-01,  1.8584e-02, -2.8720e+00, -6.9643e-02,  6.2274e-02,
         1.6515e+00,  1.7017e+00, -1.5462e-01, -1.4297e-02, -1.4868e+00,
        -2.0909e-01,  6.4854e-03, -2.3828e+00,  1.1335e-03,  1.0966e-01,
        -6.3492e-02, -1.8410e-01, -1.3188e-01, -1.8072e-01, -3.4738e-01,
        -2.2863e-01, -5.4337e-02, -3.8867e-01,  1.9981e+00, -2.0197e-01,
        -6.4367e-03,  1.7863e-01,  1.5049e-01, -9.4470e-02, -7.8394e-02,
        -1.9021e-01, -9.7756e-02,  7.9620e-02, -7.4524e-02,  2.0910e-01,
         1.0323e-01, -9.1274e-02,  2.3454e+00,  1.5543e-01, -2.5014e-01,
         2.2304e-02,  1.8366e-01,  9.6988e-02, -1.2164e-01, -3.8810e-02,
        -1.4663e+00, -5.5926e-02, -3.3733e-01,  5.3179e-02,  5.2299e-01,
         1.2981e-02, -1.6276e-02,  3.9341e-02, -8.6249e-02, -2.0718e-01,
         7.3246e-01, -1.1070e-01,  2.9733e-02, -3.0

In [83]:
# Embed every test text in mini-batches: tokenize, run the encoder without
# gradient tracking, mean-pool over the attended tokens, and collect one CPU
# tensor per text (in the same order as the rows of `test`).
test_vectors = []
for left in tqdm(range(0, len(test), batch_size)):
    # iloc slicing clamps at the end of the frame, so the last batch may be short.
    batch_texts = test['text'].iloc[left:left + batch_size].tolist()
    encoded_input = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
    # Send the inputs to wherever the model lives instead of a hard-coded
    # device string that can silently disagree with it.
    encoded_input = encoded_input.to(model.device)
    with torch.no_grad():
        model_output = model(**encoded_input)
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    # unbind(0) splits the batch into one 1-D tensor per text.
    test_vectors.extend(sentence_embeddings.cpu().unbind(0))

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [00:56<00:00,  1.58s/it]


In [84]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [85]:
# Collapse the per-text embeddings into one dense 2-D float matrix.
# Each element is converted explicitly (torch tensor -> NumPy view, anything
# else -> np.asarray) instead of calling np.array() on a list of tensors, which
# builds an object array, triggers NumPy deprecation warnings, and breaks when
# this cell is re-run because the elements are no longer tensors.
train_vectors = np.stack([
    vec.numpy() if isinstance(vec, torch.Tensor) else np.asarray(vec)
    for vec in train_vectors
])
# Nested Python lists: the feature format handed to the classifier.
train_res = train_vectors.tolist()

  train_vectors = np.array(train_vectors)
  train_vectors = np.array(train_vectors)


In [86]:
# Collapse the per-text embeddings into one dense 2-D float matrix.
# Each element is converted explicitly (torch tensor -> NumPy view, anything
# else -> np.asarray) instead of calling np.array() on a list of tensors, which
# builds an object array, triggers NumPy deprecation warnings, and breaks when
# this cell is re-run because the elements are no longer tensors.
test_vectors = np.stack([
    vec.numpy() if isinstance(vec, torch.Tensor) else np.asarray(vec)
    for vec in test_vectors
])
# Nested Python lists: the feature format handed to the classifier.
test_res = test_vectors.tolist()

  test_vectors = np.array(test_vectors)
  test_vectors = np.array(test_vectors)


In [94]:
# k-nearest-neighbours classifier voting over the 5 closest training points.
neigh = KNeighborsClassifier(n_neighbors=5)

# Stash the original labels once. Encoding always reads from this stash, so
# re-running the cell refits the encoder on the real label values rather than
# on integers produced by a previous run (which would wipe the original names
# from le.classes_ and make inverse_transform useless).
for frame in (train, test):
    if 'subject_raw' not in frame.columns:
        frame['subject_raw'] = frame['subject']

le = LabelEncoder()
# `subject` still ends up holding the integer class ids, as before.
train['subject'] = le.fit_transform(train['subject_raw'])
# Raises ValueError if the test split contains a label never seen in training.
test['subject'] = le.transform(test['subject_raw'])

In [95]:
# Fit the k-NN classifier on the training embeddings and their subject labels.
neigh.fit(train_res, train['subject'].tolist())

KNeighborsClassifier()

In [96]:
# Predict a subject label for every test embedding.
y_pred = neigh.predict(test_res)    

In [97]:
from sklearn.metrics import f1_score

In [98]:
# Weighted-average F1 of the k-NN predictions against the test subject labels.
# NOTE(review): the recorded score (~0.07) is very low. Presumably the raw
# mean-pooled embeddings separate subjects poorly under the classifier's
# default distance metric -- consider feature scaling or a cosine metric;
# TODO confirm before drawing conclusions from this number.
f1_score(test['subject'].tolist(), y_pred, average='weighted')

0.06777060263111757