## Import

In [23]:
import pandas as pd
import numpy as np
import glob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup, AutoModelForMaskedLM
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from itertools import combinations
from rank_bm25 import BM25L

import torch
import torch.nn as nn
import random
import time
import datetime
import os, re
import argparse

## Functions

In [24]:
def get_rid_of_empty(c):
    """Return ``c`` with every empty or whitespace-only line removed.

    The text is split on newline characters; a line is kept, unchanged, only
    when something is left after ``str.strip()``. The surviving lines are
    joined back together with single newline characters.
    """
    return '\n'.join(row for row in c.split('\n') if row.strip())

In [25]:
def _strip_line_comments(lines):
    """Drop ``//`` comments and empty lines and turn 4-space runs into tabs."""
    kept = []
    for line in lines:
        # A line that holds nothing but a ``//`` comment is dropped entirely.
        if line.lstrip().startswith('//'):
            continue
        line = line.rstrip()
        # Cut a trailing ``//`` comment; whitespace left in front of it stays.
        if '//' in line:
            line = line[:line.index('//')]
        line = line.replace('    ', '\t')
        # Skip lines that ended up empty.
        if line == '':
            continue
        kept.append(line)
    return kept


def clean_data(script, data_type="dir"):
    """Clean one source-code sample and return it as a single string.

    Args:
        script: Path of the file to read when ``data_type == "dir"``; the
            source text itself when ``data_type == "file"``.
        data_type: ``"dir"`` (read ``script`` from disk as UTF-8) or
            ``"file"`` (``script`` already is the text).

    Returns:
        The cleaned text.

    Raises:
        ValueError: If ``data_type`` is neither ``"dir"`` nor ``"file"``
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if data_type == "dir":
        with open(script, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    elif data_type == "file":
        lines = script.split('\n')
    else:
        raise ValueError(
            'data_type must be "dir" or "file", got {!r}'.format(data_type))

    # Join the preprocessed lines into a single string.
    preprocessed_script = '\n'.join(_strip_line_comments(lines))

    # Remove complete /* ... */ comments, including multi-line ones.
    preprocessed_script = re.sub(r'/\*.*?\*/', "", preprocessed_script, flags=re.DOTALL)

    # Remove an apostrophe together with the word characters right after it
    # (a closing quote, if any, is left in place).
    preprocessed_script = re.sub(r'\'\w+', '', preprocessed_script)

    # Remove every word-character run that contains at least one digit.
    preprocessed_script = re.sub(r'\w*\d+\w*', '', preprocessed_script)

    # Collapse runs of two or more whitespace characters (newlines and tabs
    # included) into a single space.
    preprocessed_script = re.sub(r'\s{2,}', ' ', preprocessed_script)

    # Remove a single non-word character that has whitespace on both sides,
    # together with that whitespace.
    preprocessed_script = re.sub(r'\s[^\w\s]\s', '', preprocessed_script)

    # A very small number of samples contain block comments that were never
    # completed, so the regex above cannot match them.
    splitted = preprocessed_script.split('\n')

    # start_idx: first line containing '/*'.
    # end_idx: last later line containing '*/'.
    start_idx, end_idx = -1, -1
    for i, text in enumerate(splitted):
        if start_idx == -1 and '/*' in text:
            start_idx = i
        elif start_idx != -1 and '*/' in text:
            end_idx = i
    if start_idx != -1 and end_idx != -1:
        # Drop the lines from the opener through the last closer.
        splitted = splitted[:start_idx] + splitted[end_idx + 1:]
    elif start_idx != -1 and end_idx == -1:
        # Opener without any closer: only the lines after the opener are kept.
        # NOTE(review): this discards everything before the opener as well;
        # kept as in the original, confirm it is intended.
        splitted = splitted[start_idx + 1:]

    preprocessed_script = '\n'.join(splitted)
    preprocessed_script = get_rid_of_empty(preprocessed_script)

    return preprocessed_script



In [26]:
def get_pairs(input_df, tokenizer, random_state=None):
    """Build shuffled positive and hard-negative code pairs.

    Positive pairs (label 1) are all 2-combinations of the solutions of one
    problem. Negative pairs (label 0) match each solution with codes from
    other problems, taken in descending BM25L score order, where the query
    is the first solution of the problem.

    Args:
        input_df: DataFrame with a ``code`` and a ``problem_num`` column.
        tokenizer: Object with a ``tokenize(text)`` method; its output is
            used both as the BM25L corpus and as the query.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for the
            final shuffle. ``None`` (default) keeps the unseeded shuffle.

    Returns:
        DataFrame with columns ``code1``, ``code2`` and ``similar``, shuffled
        and with a fresh 0..n-1 index.

    NOTE(review): positives grow quadratically with the solutions per problem.
    The recorded run later hit a MemoryError on 100,988,696 training rows;
    consider capping the pairs per problem.
    """
    codes = input_df['code'].to_list()

    # Group row *positions* by problem once. Positions, not index labels, are
    # what the BM25L ranking refers to, so this is correct for any index.
    positions_by_problem = {}
    for position, problem in enumerate(input_df['problem_num'].to_list()):
        positions_by_problem.setdefault(problem, []).append(position)
    problems = sorted(positions_by_problem)

    tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
    bm25 = BM25L(tokenized_corpus)

    total_positive_pairs = []
    total_negative_pairs = []

    for problem in tqdm(problems):
        own_positions = positions_by_problem[problem]
        # A problem with a single solution yields no pairs at all (the
        # original raised IndexError on it).
        if len(own_positions) < 2:
            continue

        solution_codes = [codes[position] for position in own_positions]
        positive_pairs = list(combinations(solution_codes, 2))

        # Reuse the already tokenized first solution as the BM25L query.
        negative_code_scores = bm25.get_scores(tokenized_corpus[own_positions[0]])
        negative_code_ranking = negative_code_scores.argsort()[::-1]  # descending

        own_position_set = set(own_positions)
        negatives_per_solution = len(positive_pairs) // len(solution_codes)
        ranking_size = len(negative_code_ranking)
        ranking_idx = 0  # shared, so each solution receives different negatives
        negative_pairs = []

        for solution_code in solution_codes:
            taken = 0
            # Stop quietly when the ranking is exhausted instead of indexing
            # past its end.
            while taken < negatives_per_solution and ranking_idx < ranking_size:
                candidate = int(negative_code_ranking[ranking_idx])
                ranking_idx += 1
                if candidate not in own_position_set:
                    negative_pairs.append((solution_code, codes[candidate]))
                    taken += 1

        total_positive_pairs.extend(positive_pairs)
        total_negative_pairs.extend(negative_pairs)

    # Positives first, then negatives, as before; the shuffle mixes them.
    all_pairs = total_positive_pairs + total_negative_pairs
    pair_data = pd.DataFrame(data={
        'code1': [first for first, _ in all_pairs],
        'code2': [second for _, second in all_pairs],
        'similar': [1] * len(total_positive_pairs) + [0] * len(total_negative_pairs),
    })
    pair_data = pair_data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return pair_data


In [27]:
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed; defaults to 42, the value that used to be
            hard-coded.

    ``PYTHONHASHSEED`` is also written to ``os.environ``.
    NOTE(review): that variable is read at interpreter start-up, so setting
    it here presumably only affects child processes — confirm.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one.
    torch.cuda.manual_seed_all(seed)

In [28]:
# Build the training DataFrame from the code files provided by Dacon.
code_folder = "./train_code"  # path of the training data provided by Dacon
# os.listdir returns entries in arbitrary order; sort them so the row order,
# and with it the seeded train/validation split below, is reproducible.
problem_folders = sorted(os.listdir(code_folder))
preproc_scripts = []
problem_nums = []

for problem_folder in tqdm(problem_folders):
    problem_path = os.path.join(code_folder, problem_folder)
    # Ignore stray files next to the problem folders and empty folders
    # (either one used to crash the loop).
    if not os.path.isdir(problem_path):
        continue
    scripts = sorted(os.listdir(problem_path))
    if not scripts:
        continue
    # The problem id is the part of the file name before the first '_'.
    problem_num = scripts[0].split('_')[0]
    for script in scripts:
        script_file = os.path.join(problem_path, script)
        preproc_scripts.append(clean_data(script_file, data_type="dir"))
    problem_nums.extend([problem_num] * len(scripts))
train_df = pd.DataFrame(data={'code': preproc_scripts, 'problem_num': problem_nums})

# Build the test DataFrame from the test pairs provided by Dacon.
test_df = pd.read_csv("test.csv")
code1 = test_df['code1'].values
code2 = test_df['code2'].values
processed_code1 = []
processed_code2 = []
for raw_c1, raw_c2 in tqdm(zip(code1, code2), total=len(code1)):
    processed_code1.append(clean_data(raw_c1, data_type="file"))
    processed_code2.append(clean_data(raw_c2, data_type="file"))
processed_test = pd.DataFrame(list(zip(processed_code1, processed_code2)), columns=["code1", "code2"])

# With the DataFrames ready, do the train/validation split and generate the
# positive and negative pairs.
# Hard negative pairs follow the code shared by the Dacon user "청소", using
# BM25L instead of BM25 (both were tested; BM25L gave the better score).
# The tokenizer truncates from the left, so when truncation is needed the
# tail of the code is what gets used.

dacon_train_df, dacon_valid_df, dacon_train_label, dacon_valid_label = train_test_split(
    train_df,
    train_df['problem_num'],
    random_state=42,
    test_size=0.1
)

# get_pairs relies on positional row numbers, so reset both indexes.
dacon_train_df = dacon_train_df.reset_index(drop=True)
dacon_valid_df = dacon_valid_df.reset_index(drop=True)

tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = 'left'

dacon_train_bm25L = get_pairs(dacon_train_df, tokenizer)
dacon_valid_bm25L = get_pairs(dacon_valid_df, tokenizer)

100%|██████████| 500/500 [00:49<00:00, 10.06it/s]
100%|██████████| 595000/595000 [01:43<00:00, 5736.94it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (984 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 500/500 [2:25:11<00:00, 17.42s/it]  
100%|██████████| 500/500 [12:23<00:00,  1.49s/it]


In [29]:
# Save the generated data; everything up to this point takes quite a long
# time to produce.
for frame, csv_name in (
    (dacon_train_bm25L, "train_bm25L.csv"),
    (dacon_valid_bm25L, "valid_bm25L.csv"),
    (processed_test, "processed_test.csv"),
):
    frame.to_csv(csv_name, index=False)

In [32]:
set_seed()

dacon_train_bm25L = pd.read_csv("train_bm25L.csv")
dacon_valid_bm25L = pd.read_csv("valid_bm25L.csv")

train_data = dacon_train_bm25L
valid_data = dacon_valid_bm25L

MAX_LEN = 512

tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
# Truncate from the left, like the tokenizers used for pair generation and
# for the test set in this file; otherwise training and inference would see
# different parts of over-long code.
tokenizer.truncation_side = 'left'


def _encode_pairs(pair_df, pair_tokenizer, max_len):
    """Tokenize every (code1, code2) row of ``pair_df`` to fixed-length arrays.

    Returns ``(input_ids, attention_masks, labels)`` as int64 NumPy arrays of
    shape ``(n, max_len)``, ``(n, max_len)`` and ``(n,)``. Rows that fail to
    encode are reported and dropped; they used to remain as all-zero rows
    with label 0.

    NOTE(review): the two dense ``(n, max_len)`` int64 arrays are what raised
    the recorded MemoryError (385 GiB for 100,988,696 rows). Reducing the
    number of pairs or tokenizing lazily per batch is needed at that size.
    """
    first_codes = pair_df['code1'].values
    second_codes = pair_df['code2'].values
    similar = pair_df['similar'].values
    n_rows = pair_df.shape[0]

    ids = np.zeros((n_rows, max_len), dtype=np.int64)
    masks = np.zeros((n_rows, max_len), dtype=np.int64)
    targets = np.zeros(n_rows, dtype=np.int64)
    encoded_ok = np.ones(n_rows, dtype=bool)

    for row in tqdm(range(n_rows), position=0, leave=True):
        try:
            # str() also covers cells that pandas read back as NaN.
            encoded = pair_tokenizer(str(first_codes[row]), str(second_codes[row]),
                                     max_length=max_len, padding='max_length',
                                     truncation=True)
            ids[row] = encoded['input_ids']
            masks[row] = encoded['attention_mask']
            targets[row] = similar[row]
        except Exception as e:
            # Best effort as before, but remember the row so it can be removed.
            print("row {}: {}".format(row, e))
            encoded_ok[row] = False

    if not encoded_ok.all():
        print("dropped {} rows that could not be encoded".format(int((~encoded_ok).sum())))
        ids, masks, targets = ids[encoded_ok], masks[encoded_ok], targets[encoded_ok]
    return ids, masks, targets


# training
input_ids, attention_masks, labels = _encode_pairs(train_data, tokenizer, MAX_LEN)

# validating
valid_input_ids, valid_attention_masks, valid_labels = _encode_pairs(valid_data, tokenizer, MAX_LEN)

# The original guard was inverted: it created the directory only when it
# already existed.
os.makedirs("graphcodebert", exist_ok=True)

print("\n\nMake tensor\n\n")
# torch.from_numpy shares memory with the arrays instead of copying them.
input_ids = torch.from_numpy(input_ids)
attention_masks = torch.from_numpy(attention_masks)
labels = torch.from_numpy(labels)

valid_input_ids = torch.from_numpy(valid_input_ids)
valid_attention_masks = torch.from_numpy(valid_attention_masks)
valid_labels = torch.from_numpy(valid_labels)

save_tensor = True
if save_tensor:
    torch.save(input_ids, 'train_input_ids_BM25L.pt')
    torch.save(attention_masks, 'train_attention_masks_BM25L.pt')
    torch.save(labels, 'train_labels_BM25L.pt')

    torch.save(valid_input_ids, "valid_input_ids_BM25L.pt")
    torch.save(valid_attention_masks, "valid_attention_masks_BM25L.pt")
    torch.save(valid_labels, "valid_labels_BM25L.pt")


# Setup training
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    ``preds`` holds one row of class scores per sample; ``labels`` holds the
    expected class ids and is flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = (predicted == expected).sum()
    return hits / expected.size

def format_time(elapsed):
    """Format a duration in seconds as ``h:mm:ss``, rounded to whole seconds."""
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

NUM_EPOCHS = 3   # used for the scheduler length and for the loop below
BATCH_SIZE = 32

train_data = TensorDataset(input_ids, attention_masks, labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

validation_data = TensorDataset(valid_input_ids, valid_attention_masks, valid_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

# Fall back to the CPU instead of failing outright when CUDA is missing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-5)

# One scheduler step per optimizer step, decaying linearly over all epochs.
total_steps = len(train_dataloader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss_f = nn.CrossEntropyLoss()

# Train
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []
model.zero_grad()
for epoch in range(NUM_EPOCHS):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))
    print('Training...')
    t0 = time.time()
    train_loss, train_accuracy = 0, 0
    model.train()
    # Wrapping the dataloader (not enumerate) lets tqdm show the total.
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", smoothing=0.05)):
        if step % 10000 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print('  current average loss = {}'.format(train_loss / step))

        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        # Passing labels makes the model return (loss, logits, ...).
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
        train_loss += loss.item()
        train_accuracy += flat_accuracy(logits.detach().cpu().numpy(),
                                        b_labels.detach().cpu().numpy())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()
    avg_train_loss = train_loss / len(train_dataloader)
    avg_train_accuracy = train_accuracy / len(train_dataloader)
    train_losses.append(avg_train_loss)
    train_accuracies.append(avg_train_accuracy)
    print("  Average training loss: {0:.8f}".format(avg_train_loss))
    print("  Average training accuracy: {0:.8f}".format(avg_train_accuracy))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    print("")
    print("Validating...")
    t0 = time.time()
    model.eval()
    val_loss, val_accuracy = 0, 0
    for step, batch in enumerate(tqdm(validation_dataloader, desc="Iteration", smoothing=0.05)):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        with torch.no_grad():
            # Without labels the first output is the logits.
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        logits = outputs[0].detach().cpu()
        label_ids = b_labels.detach().cpu()
        # .item() keeps the running loss a plain float instead of a tensor.
        val_loss += loss_f(logits, label_ids).item()
        val_accuracy += flat_accuracy(logits.numpy(), label_ids.numpy())

    avg_val_accuracy = val_accuracy / len(validation_dataloader)
    avg_val_loss = val_loss / len(validation_dataloader)
    val_accuracies.append(avg_val_accuracy)
    val_losses.append(avg_val_loss)
    print("  Average validation loss: {0:.8f}".format(avg_val_loss))
    print("  Average validation accuracy: {0:.8f}".format(avg_val_accuracy))
    # This used to be mislabelled as "Training epoch took".
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

    # A checkpoint is written after every epoch (not only for the best one),
    # so the message now says so.
    print("saving checkpoint for epoch {}".format(epoch + 1))
    torch.save(model.state_dict(), str(epoch + 1) + "_BM25L.pt")


MemoryError: Unable to allocate 385. GiB for an array with shape (100988696, 512) and data type int64

In [None]:

test_data = pd.read_csv("processed_test.csv")

c1 = test_data['code1'].values
c2 = test_data['code2'].values

N = test_data.shape[0]
# The tokenizer reports 512 as the maximum sequence length for this model and
# warns about indexing errors beyond it; 512 is also the training length.
# The previous value of 1024 cannot be fed to the model.
MAX_LEN = 512

test_input_ids = np.zeros((N, MAX_LEN), dtype=np.int64)
test_attention_masks = np.zeros((N, MAX_LEN), dtype=np.int64)

tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = "left"

failed_rows = []
for i in tqdm(range(N), position=0, leave=True):
    try:
        # str() also covers cells that pandas read back as NaN.
        encoded_input = tokenizer(str(c1[i]), str(c2[i]), max_length=MAX_LEN,
                                  padding='max_length', truncation=True)
        test_input_ids[i] = encoded_input['input_ids']
        test_attention_masks[i] = encoded_input['attention_mask']
    except Exception as e:
        # Rows cannot be dropped here (the submission needs one prediction
        # per row), so keep the zero row but report it.
        print("row {}: {}".format(i, e))
        failed_rows.append(i)
if failed_rows:
    print("WARNING: {} test rows could not be encoded; their predictions are unreliable".format(len(failed_rows)))

# torch.from_numpy shares memory with the arrays instead of copying them.
test_input_ids = torch.from_numpy(test_input_ids)
test_attention_masks = torch.from_numpy(test_attention_masks)

# save_tensor is the flag set by the tensor-saving step earlier in the notebook.
if save_tensor:
    torch.save(test_input_ids, "test_input_ids.pt")
    torch.save(test_attention_masks, "test_attention_masks.pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
# The training loop writes "<epoch>_BM25L.pt" for epochs 1-3; "4+_BM25L.pt"
# is never produced, so load the checkpoint of the last epoch.
PATH = "3_BM25L.pt"

model.load_state_dict(torch.load(PATH, map_location=device))
model.to(device)
# Inference mode: turns dropout off (it was left active before).
model.eval()

# NOTE(review): 1048 sequences of 512 tokens per batch is large; lower this
# if the GPU runs out of memory. It does not affect the predictions.
TEST_BATCH_SIZE = 1048

test_tensor = TensorDataset(test_input_ids, test_attention_masks)
test_sampler = SequentialSampler(test_tensor)
test_dataloader = DataLoader(test_tensor, sampler=test_sampler, batch_size=TEST_BATCH_SIZE)

submission = pd.read_csv('sample_submission.csv')

batch_preds = []
for batch in tqdm(test_dataloader, desc="Iteration", smoothing=0.05):
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    # Without labels the first output is the logits.
    logits = outputs[0].detach().cpu().numpy()
    batch_preds.append(np.argmax(logits, axis=1).flatten())

# Integer class ids (np.append onto a float array produced 0.0 / 1.0).
preds = np.concatenate(batch_preds) if batch_preds else np.array([], dtype=np.int64)

submission['similar'] = preds.astype(int)
# Write to a separate file so the sample_submission.csv template is not
# overwritten; the name is the one the commented-out ensemble step below reads.
submission.to_csv('submission_graphcodebert_BM25L.csv', index=False)

In [None]:

# def model_ensemble():
#     submission = pd.read_csv('sample_submission.csv')

#     submission_1 = pd.read_csv('submission_graphcodebert_BM25L.csv')
#     submission_2 = pd.read_csv('submission_CodeBERTaPy_BM25L.csv')
#     submission_3 = pd.read_csv('submission_codebert_mlm_BM25L.csv')

#     sub_1 = submission_1['similar']
#     sub_2 = submission_2['similar']
#     sub_3 = submission_3['similar']

#     ensemble_preds = (sub_1 + sub_2 + sub_3) / 3

#     preds = np.where(ensemble_preds > 0.5, 1, 0)

#     submission['similar'] = preds

#     submission.to_csv('submission_ensemble_v2.csv', index=False)
