In [49]:
from transformers import BertModel, BertTokenizer
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, classification_report
from tqdm.auto import tqdm
tqdm.pandas()
import pandas as pd
import numpy as np
import torch

In [50]:
print("Loading models...", end="")
model_name = "DeepPavlov/rubert-base-cased-sentence"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).cuda().eval()

# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer("BAAI/bge-m3")


# # Load model directly
# from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel

# tokenizer = AutoTokenizer.from_pretrained("Tochka-AI/ruRoPEBert-e5-base-512", trust_remote_code=True)
# model = AutoModel.from_pretrained("Tochka-AI/ruRoPEBert-e5-base-512", trust_remote_code=True, attn_implementation='eager').cuda()

def get_sentence_embedding(sentence: str) -> torch.Tensor:
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=256)
    
    inputs['input_ids'] = inputs['input_ids'].cuda()
    inputs['token_type_ids'] = inputs['token_type_ids'].cuda()
    inputs['attention_mask'] = inputs['attention_mask'].cuda()

    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
        
    return embedding


# def get_sentence_embedding(sentence: str) -> torch.Tensor:
#     with torch.no_grad():
#         embedding = model.encode(sentence)
        
#     return embedding

# def get_sentence_embedding(sentence: str) -> torch.Tensor:
#     inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
#     inputs['input_ids'] = inputs['input_ids'].cuda()
#     inputs['attention_mask'] = inputs['attention_mask'].cuda()

#     with torch.inference_mode():
#         pooled_output = model(**inputs).pooler_output[0].cpu().numpy()
        
#     return pooled_output

Loading models...

In [51]:
train = pd.read_excel("../data/train/merged_executed.xlsx")
generated = pd.read_excel("../data/train/merged_executed_generated.xlsx")

In [52]:
generated["author_comment"][0]

'Ошибка в открытых тестах. \n\nОбратите внимание, что цвет должен соответствовать условиям задания.'

In [53]:
real_train_answer_embeddings = np.stack(train.author_comment.progress_apply(get_sentence_embedding).to_numpy())
real_y = np.ones(real_train_answer_embeddings.shape[0])

real_train_answer_embeddings.shape, real_y.shape

100%|██████████| 347/347 [00:02<00:00, 163.83it/s]


((347, 768), (347,))

In [54]:
fake_train_answer_embeddings = np.stack(generated.author_comment.progress_apply(get_sentence_embedding).to_numpy())
fake_y = np.zeros(fake_train_answer_embeddings.shape[0])

fake_train_answer_embeddings.shape, fake_y.shape

100%|██████████| 676/676 [00:04<00:00, 164.75it/s]


((676, 768), (676,))

In [55]:
real_train_answer_embeddings.shape, fake_train_answer_embeddings.shape

((347, 768), (676, 768))

In [56]:
real_y.shape, fake_y.shape

((347,), (676,))

In [57]:
X, y = np.vstack([real_train_answer_embeddings, fake_train_answer_embeddings]), np.vstack([real_y.reshape(-1, 1), fake_y.reshape(-1, 1)])
X.shape, y.shape

((1023, 768), (1023, 1))

In [58]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [72]:
y.mean()

0.3391984359726295

# Feature wise

In [80]:
reranker = CatBoostClassifier(
    random_state=42,
    verbose=10,
    eval_metric="AUC",
    class_weights=[0.3, 0.7]
)

In [81]:
reranker.fit(
    X_train,
    y_train,
    use_best_model=True,
    eval_set=(X_test, y_test),
)

Learning rate set to 0.030155
0:	test: 0.7047619	best: 0.7047619 (0)	total: 37.6ms	remaining: 37.6s
10:	test: 0.8073545	best: 0.8126455 (9)	total: 186ms	remaining: 16.7s
20:	test: 0.8127513	best: 0.8153968 (13)	total: 346ms	remaining: 16.1s
30:	test: 0.8258730	best: 0.8268254 (29)	total: 510ms	remaining: 15.9s
40:	test: 0.8292593	best: 0.8329630 (38)	total: 678ms	remaining: 15.9s
50:	test: 0.8373016	best: 0.8397354 (49)	total: 852ms	remaining: 15.9s
60:	test: 0.8456614	best: 0.8462963 (59)	total: 1.05s	remaining: 16.2s
70:	test: 0.8493651	best: 0.8493651 (70)	total: 1.24s	remaining: 16.2s
80:	test: 0.8502116	best: 0.8513757 (72)	total: 1.42s	remaining: 16.2s
90:	test: 0.8542328	best: 0.8549735 (89)	total: 1.62s	remaining: 16.2s
100:	test: 0.8528571	best: 0.8555026 (91)	total: 1.81s	remaining: 16.2s
110:	test: 0.8586772	best: 0.8586772 (110)	total: 2.01s	remaining: 16.1s
120:	test: 0.8565608	best: 0.8586772 (110)	total: 2.2s	remaining: 16s
130:	test: 0.8614286	best: 0.8614286 (130)	tota

<catboost.core.CatBoostClassifier at 0x7fe99cad1720>

In [82]:
y_pred = reranker.predict(X_test)
print(classification_report(y_test, y_pred))
y_pred = reranker.predict_proba(X_test)[:, 1]
print(f'roc_auc_score: {roc_auc_score(y_test, y_pred)}')

              precision    recall  f1-score   support

         0.0       0.87      0.81      0.84       135
         1.0       0.67      0.76      0.71        70

    accuracy                           0.79       205
   macro avg       0.77      0.78      0.77       205
weighted avg       0.80      0.79      0.79       205

roc_auc_score: 0.8675661375661377


In [83]:
reranker.save_model("ranker_weighted.bin")

In [84]:
from catboost import CatBoostClassifier


class Ranker:
    def __init__(self, model_path: str = "ranker", device: str = "cpu"):
        self.device = device
        
        self.model_name = "DeepPavlov/rubert-base-cased-sentence"
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.model = BertModel.from_pretrained(self.model_name).to(self.device).eval()

        self.ranker = CatBoostClassifier(
            random_state=42,
            verbose=10,
        )
        self.ranker.load_model(model_path)


    def get_sentence_embedding(self, sentence: str) -> torch.Tensor:
        inputs = self.tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=256)
        
        inputs['input_ids'] = inputs['input_ids'].to(self.device)
        inputs['token_type_ids'] = inputs['token_type_ids'].to(self.device)
        inputs['attention_mask'] = inputs['attention_mask'].to(self.device)
    
        with torch.no_grad():
            outputs = self.model(**inputs)
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            
        return embedding

    def predict_proba(self, sentence):
        embedding = self.get_sentence_embedding(sentence)
        return self.ranker.predict_proba(embedding)[:, 1][0]

In [85]:
!ls /home/jovyan/novitskiy/HSE-AI-Assistant-Hack/notebooks

catboost_info			 rag.ipynb
dataset.ipynb			 rag_submission.ipynb
dataset_sandbox.ipynb		 ranker
gigachat.ipynb			 ranker.bin
gigachat_baseline.ipynb		 ranker_weighted.bin
gigachat_lite_baseline.ipynb	 reranker.ipynb
gigachat_pro_best_yandex.ipynb	 yandexgpt.ipynb
injection_data_generation.ipynb  yandexgpt_pro_baseline.ipynb
ood_catboost.ipynb


In [89]:
ranker_api = Ranker(model_path="ranker_weighted.bin", device="cpu")

In [90]:
ranker_api.predict_proba('Ошибка в открытых тестах. \n\nОбратите внимание на неверный оператор сравнения — необходимо проверить, что цвет не находится в списке cite_project.')

0.5981571171036825

In [91]:
ranker_api.predict_proba('Ошибка в открытых тестах. \n\nОбратите внимание, что цвет должен соответствовать условиям задания.')

0.03837912536922591

# Embedding features

In [30]:
reranker = CatBoostClassifier(
    random_state=42,
    verbose=10
)

In [32]:
X_train_df = pd.DataFrame({"embeddings": list(X_train)})
X_test_df = pd.DataFrame({"embeddings": list(X_test)})

In [33]:
reranker.fit(
    X_train_df,
    y_train,
    eval_set=(X_test_df, y_test),
    embedding_features=["embeddings"]
)

Learning rate set to 0.030155
0:	learn: 0.6865625	test: 0.6858569	best: 0.6858569 (0)	total: 1.09ms	remaining: 1.09s
10:	learn: 0.6233370	test: 0.6110305	best: 0.6110305 (10)	total: 8.75ms	remaining: 787ms
20:	learn: 0.5896891	test: 0.5680563	best: 0.5680563 (20)	total: 14.6ms	remaining: 682ms
30:	learn: 0.5706738	test: 0.5427326	best: 0.5427326 (30)	total: 20.5ms	remaining: 641ms
40:	learn: 0.5593314	test: 0.5266380	best: 0.5266380 (40)	total: 26.2ms	remaining: 614ms
50:	learn: 0.5500816	test: 0.5179956	best: 0.5179956 (50)	total: 32.3ms	remaining: 601ms
60:	learn: 0.5439177	test: 0.5115155	best: 0.5115155 (60)	total: 38.6ms	remaining: 594ms
70:	learn: 0.5393781	test: 0.5097120	best: 0.5097120 (70)	total: 44.6ms	remaining: 583ms
80:	learn: 0.5354669	test: 0.5078121	best: 0.5070907 (78)	total: 50.7ms	remaining: 575ms
90:	learn: 0.5325943	test: 0.5060508	best: 0.5060508 (90)	total: 56.6ms	remaining: 566ms
100:	learn: 0.5302065	test: 0.5067973	best: 0.5055646 (94)	total: 62.6ms	remaining

<catboost.core.CatBoostClassifier at 0x7f8241b5df30>

In [34]:
y_pred = reranker.predict(X_test_df)
print(classification_report(y_test, y_pred))
y_pred = reranker.predict_proba(X_test_df)[:, 1]
print(f'roc_auc_score: {roc_auc_score(y_test, y_pred)}')

              precision    recall  f1-score   support

         0.0       0.78      0.81      0.80       135
         1.0       0.61      0.56      0.58        70

    accuracy                           0.73       205
   macro avg       0.69      0.69      0.69       205
weighted avg       0.72      0.73      0.72       205

roc_auc_score: 0.8229100529100529


# Text features

In [35]:
train = pd.read_excel("../data/train/merged_executed.xlsx")[["author_comment"]]
generated = pd.read_excel("../data/train/merged_executed_generated.xlsx")[["author_comment"]]

In [36]:
real_y = np.ones(train.shape[0])
fake_y = np.zeros(generated.shape[0])

In [38]:
X = train.append(generated)
y = np.vstack([real_y.reshape(-1, 1), fake_y.reshape(-1, 1)])

  X = train.append(generated)


In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [40]:
reranker = CatBoostClassifier(
    random_state=42,
    verbose=10
)

In [41]:
reranker.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    text_features=["author_comment"]
)

Learning rate set to 0.030155
0:	learn: 0.6848833	test: 0.6872912	best: 0.6872912 (0)	total: 11.9ms	remaining: 11.9s
10:	learn: 0.6180512	test: 0.6350173	best: 0.6350173 (10)	total: 129ms	remaining: 11.6s
20:	learn: 0.5819445	test: 0.6085166	best: 0.6085166 (20)	total: 240ms	remaining: 11.2s
30:	learn: 0.5578892	test: 0.5939844	best: 0.5939844 (30)	total: 351ms	remaining: 11s
40:	learn: 0.5339282	test: 0.5780731	best: 0.5780731 (40)	total: 460ms	remaining: 10.8s
50:	learn: 0.5171074	test: 0.5676783	best: 0.5676783 (50)	total: 574ms	remaining: 10.7s
60:	learn: 0.5046735	test: 0.5607111	best: 0.5607111 (60)	total: 687ms	remaining: 10.6s
70:	learn: 0.4905787	test: 0.5519496	best: 0.5519496 (70)	total: 800ms	remaining: 10.5s
80:	learn: 0.4802720	test: 0.5452678	best: 0.5452678 (80)	total: 908ms	remaining: 10.3s
90:	learn: 0.4698093	test: 0.5380500	best: 0.5380500 (90)	total: 1.02s	remaining: 10.2s
100:	learn: 0.4608845	test: 0.5314064	best: 0.5314064 (100)	total: 1.14s	remaining: 10.1s
110

<catboost.core.CatBoostClassifier at 0x7f82419c7640>

In [42]:
y_pred = reranker.predict(X_test)
print(classification_report(y_test, y_pred))
y_pred = reranker.predict_proba(X_test)[:, 1]
print(f'roc_auc_score: {roc_auc_score(y_test, y_pred)}')

              precision    recall  f1-score   support

         0.0       0.82      0.87      0.84       135
         1.0       0.71      0.63      0.67        70

    accuracy                           0.79       205
   macro avg       0.76      0.75      0.75       205
weighted avg       0.78      0.79      0.78       205

roc_auc_score: 0.8457142857142859


# Feature-wise + text features

In [43]:
train = pd.read_excel("../data/train/merged_executed.xlsx")
generated = pd.read_excel("../data/train/merged_executed_generated.xlsx")

In [44]:
real_y = np.ones(train.shape[0])
fake_y = np.zeros(generated.shape[0])

In [45]:
real_train_answer_embeddings = np.stack(train.author_comment.progress_apply(get_sentence_embedding).to_numpy())

real_train_answer_embeddings.shape, real_y.shape

100%|██████████| 347/347 [00:02<00:00, 159.73it/s]


((347, 768), (347,))

In [46]:
fake_train_answer_embeddings = np.stack(generated.author_comment.progress_apply(get_sentence_embedding).to_numpy())

fake_train_answer_embeddings.shape, fake_y.shape

100%|██████████| 676/676 [00:04<00:00, 161.77it/s]


((676, 768), (676,))

In [47]:
real_train_answer_texts = train["author_comment"].tolist()
fake_train_answer_texts = generated["author_comment"].tolist()

In [50]:
real_features = pd.DataFrame({i:real_train_answer_embeddings[:, i] for i in range(real_train_answer_embeddings.shape[1])})
fake_features = pd.DataFrame({i:fake_train_answer_embeddings[:, i] for i in range(fake_train_answer_embeddings.shape[1])})

real_features["author_comment"] = real_train_answer_texts
fake_features["author_comment"] = fake_train_answer_texts

In [51]:
X, y = real_features.append(fake_features), np.vstack([real_y.reshape(-1, 1), fake_y.reshape(-1, 1)])

  X, y = real_features.append(fake_features), np.vstack([real_y.reshape(-1, 1), fake_y.reshape(-1, 1)])


In [53]:
X.shape, y.shape

((1023, 769), (1023, 1))

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [55]:
reranker = CatBoostClassifier(
    random_state=42,
    verbose=10
)

In [56]:
reranker.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    text_features=["author_comment"]
)

Learning rate set to 0.030155
0:	learn: 0.6810169	test: 0.6833745	best: 0.6833745 (0)	total: 31.6ms	remaining: 31.6s
10:	learn: 0.5897919	test: 0.6229006	best: 0.6229006 (10)	total: 292ms	remaining: 26.2s
20:	learn: 0.5355024	test: 0.5908931	best: 0.5908931 (20)	total: 554ms	remaining: 25.8s
30:	learn: 0.4820936	test: 0.5611558	best: 0.5611558 (30)	total: 831ms	remaining: 26s
40:	learn: 0.4430988	test: 0.5397745	best: 0.5397745 (40)	total: 1.1s	remaining: 25.7s
50:	learn: 0.4178252	test: 0.5295141	best: 0.5295141 (50)	total: 1.35s	remaining: 25.2s
60:	learn: 0.3846432	test: 0.5153070	best: 0.5153070 (60)	total: 1.66s	remaining: 25.5s
70:	learn: 0.3614731	test: 0.5074104	best: 0.5074104 (70)	total: 1.99s	remaining: 26.1s
80:	learn: 0.3438507	test: 0.5018554	best: 0.5018116 (79)	total: 2.33s	remaining: 26.4s
90:	learn: 0.3250992	test: 0.4946055	best: 0.4946055 (90)	total: 2.66s	remaining: 26.6s
100:	learn: 0.3115149	test: 0.4912510	best: 0.4912510 (100)	total: 3.05s	remaining: 27.2s
110:

<catboost.core.CatBoostClassifier at 0x7f824172fee0>

In [57]:
y_pred = reranker.predict(X_test)
print(classification_report(y_test, y_pred))
y_pred = reranker.predict_proba(X_test)[:, 1]
print(f'roc_auc_score: {roc_auc_score(y_test, y_pred)}')

              precision    recall  f1-score   support

         0.0       0.81      0.86      0.83       135
         1.0       0.69      0.60      0.64        70

    accuracy                           0.77       205
   macro avg       0.75      0.73      0.74       205
weighted avg       0.77      0.77      0.77       205

roc_auc_score: 0.8455555555555556
