In [1]:
import gc
import os
import pickle
import random
import re
from collections import defaultdict
from glob import glob

import lightning as L
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.readers import InputExample
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

In [2]:
# Select PyTorch's "high" float32 matmul precision setting for the whole session.
torch.set_float32_matmul_precision("high")

In [3]:
from sentence_transformers.evaluation import BinaryClassificationEvaluator
import csv
class CustomEvaluator(BinaryClassificationEvaluator):
    """Binary-classification evaluator whose returned score is the best accuracy.

    The score returned by ``__call__`` is the largest ``"accuracy"`` entry
    among the per-similarity-function results of ``compute_metrices``.
    """

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        """Evaluate ``model`` and optionally append one row to the results CSV.

        Args:
            model: Model handed to ``self.compute_metrices``.
            output_path: Directory holding the CSV; nothing is written when ``None``.
            epoch: Value stored in the first column of the CSV row.
            steps: Value stored in the second column of the CSV row.

        Returns:
            The highest ``"accuracy"`` value across all entries of the score mapping.
        """
        scores = self.compute_metrices(model)
        main_score = max(fct_scores["accuracy"] for fct_scores in scores.values())

        # Build the CSV row: epoch, steps, then one value per "<sim_fct>_<metric>" header.
        row = [epoch, steps]
        for header in self.csv_headers:
            if "_" not in header:
                continue
            sim_fct, metric = header.split("_", maxsplit=1)
            row.append(scores[sim_fct][metric])

        if output_path is None or not self.write_csv:
            return main_score

        csv_path = os.path.join(output_path, self.csv_file)
        is_new_file = not os.path.isfile(csv_path)
        # A new file gets the header row first; an existing file is only appended to.
        with open(csv_path, newline="", mode="w" if is_new_file else "a", encoding="utf-8") as f:
            writer = csv.writer(f)
            if is_new_file:
                writer.writerow(self.csv_headers)
            writer.writerow(row)
        return main_score

In [4]:
# Run configuration shared by the training and inference cells.
SEED: int = 42        # seed for seed_everything and the KFold shuffle
N_SPLIT: int = 5      # number of cross-validation folds
BATCH_SIZE: int = 48  # batch size for training, evaluation and encoding
EPOCHS: int = 5       # training epochs per fold

In [5]:
# Seed through Lightning's helper (presumably the Python, NumPy and PyTorch RNGs - confirm
# against the Lightning docs). The call returns the seed, which the notebook echoes below.
L.seed_everything(SEED)

Seed set to 42


42

In [6]:
# Labelled code-pair table. Later cells read its `code1_path`, `code2_path`,
# `code1`, `code2` and `similar` columns.
train_df = pd.read_csv('sample_train.csv')

In [7]:
def _problem_number(code_path):
    """Return the integer following ``problem`` in the part of the file name before the first ``_``."""
    prefix = os.path.basename(code_path).split('_')[0]
    return int(prefix.split('problem')[1])


# Tag each side of every pair with the problem number parsed from its file name.
train_df['code1_problem'] = train_df['code1_path'].apply(_problem_number)
train_df['code2_problem'] = train_df['code2_path'].apply(_problem_number)

In [8]:
def _load_label_texts(cache_path='./preproc/label_texts.pkl', code_glob='train_code/*/*.cpp'):
    """Return ``{problem number: [source text, ...]}``, reusing the pickle cache when it exists.

    Args:
        cache_path: Pickle file holding the previously built mapping.
        code_glob: Glob pattern of the source files used to rebuild the mapping
            when the cache is missing.

    Returns:
        A dict-like mapping from the integer problem number (parsed from the file
        name) to the list of file contents for that problem.

    Raises:
        FileNotFoundError: If the cache is missing and no file matches ``code_glob``.
    """
    if os.path.isfile(cache_path):
        # NOTE: pickle.load can execute arbitrary code. Only load a cache that
        # this notebook produced itself.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    # Sorted so a rebuilt cache has a filesystem-independent key order.
    code_paths = sorted(glob(code_glob))
    if not code_paths:
        raise FileNotFoundError(
            f'{cache_path} does not exist and no source files match {code_glob!r}'
        )

    texts_by_label = defaultdict(list)
    for code_path in tqdm(code_paths):
        code_basename = os.path.basename(code_path)
        label = int(code_basename.split('_')[0].split('problem')[1])
        with open(code_path, 'r', encoding='utf-8') as f:
            texts_by_label[label].append(f.read())

    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, 'wb') as f:
        pickle.dump(texts_by_label, f)
    return texts_by_label


label_texts = _load_label_texts()

In [9]:
# Array of the problem numbers (the keys of label_texts); the KFold split operates on it.
labels = np.array([problem for problem in label_texts])

In [10]:
# Folds to train in this session. Training every fold in one kernel has run out of
# memory before; narrow this set (e.g. {2}) to resume a single fold after a restart.
FOLDS_TO_RUN = set(range(N_SPLIT))

# Split on problem numbers, not on individual files, so validation problems are unseen in training.
kf = KFold(n_splits=N_SPLIT, shuffle=True, random_state=SEED)
for fold_idx, (train_index, val_index) in enumerate(kf.split(labels)):
    if fold_idx not in FOLDS_TO_RUN:
        continue
    labels_train_fold = labels[train_index]
    labels_val_fold = labels[val_index]

    # Validation pairs: both sides must belong to held-out problems.
    in_val_fold = (
        train_df['code1_problem'].isin(labels_val_fold)
        & train_df['code2_problem'].isin(labels_val_fold)
    )
    val_df = train_df[in_val_fold]

    # Balance the classes by down-sampling both to the size of the smaller one.
    # random_state is fixed so the sample does not depend on which folds ran
    # earlier in this kernel.
    label_min = min((val_df['similar'] == 0).sum(), (val_df['similar'] == 1).sum())
    if label_min == 0:
        raise ValueError(f'fold {fold_idx}: validation split lacks one of the two classes')
    val_df = pd.concat(
        [
            val_df[val_df['similar'] == 0].sample(label_min, random_state=SEED),
            val_df[val_df['similar'] == 1].sample(label_min, random_state=SEED),
        ],
        axis=0,
    )

    # One example per source file, labelled with its problem number.
    train_examples = [
        InputExample(texts=[code_text], label=label_train)
        for label_train in labels_train_fold
        for code_text in label_texts[label_train]
    ]

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)

    val_evaluator = CustomEvaluator(
        sentences1=val_df['code1'].values.tolist(),
        sentences2=val_df['code2'].values.tolist(),
        labels=val_df['similar'].values.tolist(),
        batch_size=BATCH_SIZE,
        show_progress_bar=True,
        write_csv=True,
    )

    model = SentenceTransformer('microsoft/codereviewer')
    model.forward = torch.compile(model.forward, mode="reduce-overhead")
    train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)
    model.fit(
        use_amp=True,
        train_objectives=[(train_dataloader, train_loss)],
        epochs=EPOCHS,
        warmup_steps=len(train_examples)//BATCH_SIZE,
        save_best_model=True,
        evaluator=val_evaluator,
        output_path=f'./checkpoints/codereviewer-{fold_idx=}',
    )

    # Best-effort memory release before the next fold: drop the references that
    # keep the model alive, then return cached CUDA blocks to the driver.
    del model, train_loss
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


No sentence-transformers model found with name microsoft/codereviewer. Creating a new one with MEAN pooling.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4167 [00:00<?, ?it/s]

  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4167 [00:00<?, ?it/s]

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4167 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [10]:
test_df = pd.read_csv('test.csv')
sentences1, sentences2 = test_df['code1'].values.tolist(), test_df['code2'].values.tolist()

# Encode every distinct snippet once; dict.fromkeys de-duplicates in first-seen order.
sentences = list(dict.fromkeys(sentences1 + sentences2))
sentence_index = {sent: i for i, sent in enumerate(sentences)}
rows1 = np.fromiter((sentence_index[s] for s in sentences1), dtype=np.intp, count=len(sentences1))
rows2 = np.fromiter((sentence_index[s] for s in sentences2), dtype=np.intp, count=len(sentences2))

score_names = ['cossim_accuracy','manhattan_accuracy','euclidean_accuracy','dot_accuracy']

preds = []
for fold_idx in range(N_SPLIT):
    model = SentenceTransformer(f'./checkpoints/codereviewer-{fold_idx=}')
    embeddings = model.encode(
        sentences, batch_size=BATCH_SIZE, show_progress_bar=True, convert_to_numpy=True
    )
    embeddings1 = embeddings[rows1]
    embeddings2 = embeddings[rows2]

    eval_df = pd.read_csv(f'./checkpoints/codereviewer-{fold_idx=}/eval/binary_classification_evaluation_results.csv')
    accuracies = eval_df[score_names]
    # Pick the evaluation row first, then the best metric within that row, so the
    # metric and its threshold always come from the same evaluation. The first row
    # with the top accuracy is presumably the checkpoint kept by save_best_model -
    # TODO confirm against the sentence-transformers fit implementation.
    best_row = accuracies.max(axis=1).to_numpy().argmax()
    max_score_name = accuracies.iloc[best_row].idxmax()
    max_score_threshold = eval_df.iloc[best_row][f"{max_score_name}_threshold"]

    # Similarities predict 1 above the threshold; distances predict 1 below it.
    if max_score_name == 'cossim_accuracy':
        cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
        pred = (cosine_scores>max_score_threshold) * 1
    elif max_score_name == 'manhattan_accuracy':
        manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
        pred = (manhattan_distances<max_score_threshold) * 1
    elif max_score_name == 'euclidean_accuracy':
        euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)
        pred = (euclidean_distances<max_score_threshold) * 1
    elif max_score_name == 'dot_accuracy':
        # Row-wise dot product computed in one vectorised call.
        dot_scores = np.einsum('ij,ij->i', embeddings1, embeddings2)
        pred = (dot_scores>max_score_threshold) * 1
    else:
        raise ValueError(f'unexpected metric column: {max_score_name}')
    preds.append(pred)

# One row of 0/1 predictions per fold.
preds = np.array(preds)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Batches:   0%|          | 0/4758 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Batches:   0%|          | 0/4758 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Batches:   0%|          | 0/4758 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Batches:   0%|          | 0/4758 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Batches:   0%|          | 0/4758 [00:00<?, ?it/s]

In [11]:
# Submission template; its `similar` column is overwritten with the ensemble vote.
submission = pd.read_csv('sample_submission.csv')

In [12]:
# Majority vote: a pair is labelled 1 when more than half of the fold models predicted 1.
fold_vote_share = np.mean(preds, axis=0)
submission['similar'] = (fold_vote_share > 0.5) * 1

In [13]:
# Sanity check: fraction of test pairs predicted as similar.
submission['similar'].mean()

0.4120050420168067

In [14]:
# Write the final predictions, leaving out the DataFrame index column.
submission.to_csv('./test_submission.csv', index=False)