In [16]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import hashlib

from sklearn.metrics import f1_score, classification_report

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch

In [None]:
# Seed every RNG the notebook draws from. NumPy alone is not enough: the
# embedding repair step falls back to `torch.rand`, which has its own generator.
SEED = 21
np.random.seed(SEED)
torch.manual_seed(SEED)
# score: 0.42617 (first place 0.78451)
root_path = "E:\\IOAI\\kits\\neoai-2025\\broken-bert"

# Data & model setup

In [18]:
# Locations of the validation and test CSV files under `root_path`.
val_data_path = f"{root_path}\\val_dataset.csv"
test_data_path = f"{root_path}\\test.csv"

val_df = pd.read_csv(val_data_path)

test_df = pd.read_csv(test_data_path)

# Fail fast if a column that later cells index is missing, instead of
# surfacing a KeyError deep inside tokenization or submission writing.
missing_val_columns = {"text", "labels"} - set(val_df.columns)
missing_test_columns = {"id", "text"} - set(test_df.columns)
assert not missing_val_columns, (
    f"val_dataset.csv is missing columns: {sorted(missing_val_columns)}"
)
assert not missing_test_columns, (
    f"test.csv is missing columns: {sorted(missing_test_columns)}"
)

In [19]:
# Tokenizer published alongside the checkpoint; the same hub id is used when
# the model itself is loaded further down.
tokenizer = AutoTokenizer.from_pretrained("Ilseyar-kfu/broken_bert")

In [20]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to an indexable collection
    holding one entry per example; `labels` is an indexable collection of the
    same length. Indexing yields a dict of tensors containing every encoding
    field plus a `"labels"` entry.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Tokenize the validation texts once: padded to a common length and cut at
# 256 tokens.
val_texts = val_df["text"].to_list()
val_encodings = tokenizer(val_texts, max_length=256, padding=True, truncation=True)
val_dataset = Dataset(val_encodings, val_df["labels"].to_list())

# Every text the notebook scores: validation rows first, then test rows.
texts_2_score = val_texts + test_df["text"].to_list()

# Model changes

In [23]:
# Load the sequence classifier whose word-embedding table is inspected and
# patched in the following cells.
model = AutoModelForSequenceClassification.from_pretrained("Ilseyar-kfu/broken_bert")

# Word-embedding matrix as a plain tensor outside the autograd graph.
# NOTE(review): `.detach()` returns a tensor sharing storage with the parameter
# and `.cpu()` does not copy a tensor that already lives on the CPU, so this is
# presumably a view of the live weights rather than a snapshot — confirm before
# writing to it in place.
old_embedings = model.bert.embeddings.word_embeddings.weight.detach().cpu()

In [24]:
# Width of one embedding vector (number of columns in the table).
n_embd = old_embedings.shape[1]
# A row counts as broken when every one of its components is exactly zero.
zero_rows = (old_embedings == 0).all(dim=1)
# `torch.nonzero` on a 1-D mask yields shape (k, 1); flatten it to (k,).
# A bare `.squeeze()` would also drop the first axis when k == 1, producing a
# 0-d array that the repair loop cannot iterate over.
zero_indices = torch.nonzero(zero_rows).flatten().numpy()

In [25]:
# token string -> vocabulary id, as reported by the tokenizer.
token_to_ids = tokenizer.get_vocab()
# Inverse lookup: vocabulary id -> token string.
ids_to_token = {v: k for k, v in token_to_ids.items()}
# Use a set for membership tests: `v not in <ndarray>` rescans the whole array
# for every vocabulary entry. `np.ravel` keeps this valid for any array rank.
zero_index_set = set(np.ravel(zero_indices).tolist())
# Same inverse lookup, restricted to ids whose embedding row is not all-zero.
non_zero_ids_to_token = {
    v: k for k, v in token_to_ids.items() if v not in zero_index_set
}

In [26]:
def get_sub_tokens(token, ids_to_token):
    """Return the ids of all vocabulary entries that occur inside `token`.

    Args:
        token: String to search in.
        ids_to_token: Mapping from vocabulary id to token string.

    Returns:
        List of ids, in the mapping's iteration order, whose token string is a
        plain substring of `token` (Python `in` semantics; no special handling
        of prefixes or markers).
    """
    return [
        vocab_id
        for vocab_id, candidate in ids_to_token.items()
        if candidate in token
    ]

In [27]:
# Patch a private copy. Plain assignment would only alias `old_embedings`, so
# the loop below would overwrite the source tensor in place and the cell would
# not give the same result when re-run.
new_embeddings = old_embedings.clone()

# `np.atleast_1d` keeps the loop valid even if only a single index was found.
for zero_index in tqdm(np.atleast_1d(zero_indices)):
    zero_token = ids_to_token[zero_index]
    # Ids of intact vocabulary entries whose text occurs inside the broken token.
    sub_tokens = get_sub_tokens(zero_token, non_zero_ids_to_token)
    if sub_tokens:
        # Rebuild the row as the average of the matching intact rows, always
        # read from the untouched source table.
        new_embeddings[zero_index] = old_embedings[sub_tokens].mean(dim=0)
    else:
        # Nothing to average: fall back to a random row of the right width.
        # NOTE(review): `torch.rand` draws from U[0, 1); confirm this scale
        # suits the rest of the embedding table.
        new_embeddings[zero_index] = torch.rand(n_embd)

100%|██████████| 12208/12208 [00:14<00:00, 825.85it/s]


In [28]:
# Install the repaired table as the model's word-embedding weight.
# An explicit copy replaces the legacy `torch.Tensor(...)` constructor; the
# copy is matched to the dtype and device of the weight it replaces, so the
# swap also works when the model is not float32 on the CPU.
word_embeddings = model.bert.embeddings.word_embeddings
word_embeddings.weight = torch.nn.Parameter(
    new_embeddings.detach().clone().to(
        device=word_embeddings.weight.device,
        dtype=word_embeddings.weight.dtype,
    )
)

# Submission

In [29]:
def evaluate_on_validation(model, tokenizer, df_val):
    """Print a per-class classification report for `model` on a labelled frame.

    Args:
        model: Sequence-classification model handed to the `transformers`
            text-classification pipeline.
        tokenizer: Tokenizer to pair with `model` in the pipeline.
        df_val: DataFrame with a `text` column (inputs) and a `labels` column
            (gold class names compared against the predictions).

    Returns:
        None. The report is printed to stdout.

    Raises:
        KeyError: If the pipeline emits a label other than LABEL_0..LABEL_2.
    """
    # The pipeline reports generic LABEL_<i> names; translate them into the
    # class names that appear in the `labels` column.
    label_2_dict = {"LABEL_0": "neutral", "LABEL_1": "positive", "LABEL_2": "negative"}
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
    # truncation=True is forwarded to the tokenizer so that an over-long text is
    # cut to the tokenizer's maximum length instead of failing inside the model.
    answ = classifier(list(df_val["text"]), truncation=True)
    answ = [label_2_dict[el["label"]] for el in answ]

    print(classification_report(df_val["labels"], answ))

In [30]:
# Report per-class precision/recall/F1 of the patched model on the validation frame.
evaluate_on_validation(model, tokenizer, val_df)

Device set to use cuda:0


              precision    recall  f1-score   support

    negative       0.68      0.21      0.32       935
     neutral       0.34      0.84      0.48       759
    positive       0.59      0.25      0.35       806

    accuracy                           0.41      2500
   macro avg       0.54      0.43      0.39      2500
weighted avg       0.55      0.41      0.38      2500



In [31]:
def create_submission(model, tokenizer, df_test):
    """Predict a class for every test row and write a submission CSV.

    The file is written to the current working directory as
    `submission_<hash>.csv`, where `<hash>` is the first 8 hex digits of the
    SHA-256 of the CSV content, so identical predictions map to the same name.

    Args:
        model: Sequence-classification model handed to the `transformers`
            text-classification pipeline.
        tokenizer: Tokenizer to pair with `model` in the pipeline.
        df_test: DataFrame with an `id` column and a `text` column.

    Returns:
        None. The file name and the first ten rows are printed to stdout.

    Raises:
        KeyError: If the pipeline emits a label other than LABEL_0..LABEL_2.
    """
    # The pipeline reports generic LABEL_<i> names; translate them into the
    # class names expected in the submission.
    label_2_dict = {"LABEL_0": "neutral", "LABEL_1": "positive", "LABEL_2": "negative"}
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
    # truncation=True is forwarded to the tokenizer so that an over-long text is
    # cut to the tokenizer's maximum length instead of failing inside the model.
    answ = classifier(list(df_test["text"]), truncation=True)
    answ = [label_2_dict[el["label"]] for el in answ]

    df = pd.DataFrame({"id": df_test["id"], "labels": answ})
    # Content-derived suffix: distinct prediction sets never overwrite each other.
    hsh = hashlib.sha256(df.to_csv(index=False).encode("utf-8")).hexdigest()[:8]
    submit_path = f"submission_{hsh}.csv"
    print(f"SUBMIT_NAME: {submit_path}")
    print(df.head(10))
    df.to_csv(submit_path, index=False)

In [32]:
# Score the test frame and write `submission_<hash>.csv` to the working directory.
create_submission(model, tokenizer, test_df)

Device set to use cuda:0


SUBMIT_NAME: submission_5d7417fc.csv
     id    labels
0  5000  positive
1  5001   neutral
2  5002   neutral
3  5003   neutral
4  5004   neutral
5  5005   neutral
6  5006   neutral
7  5007   neutral
8  5008  negative
9  5009   neutral
