# Train, Inference

## 학습에 사용된 아이디어
1. [GraphCodeBert](https://github.com/microsoft/CodeBERT) - pretrained model 사용
2. [Label smoothing](https://arxiv.org/pdf/1906.02629.pdf)
3. tokenize시 truncation 방향을 right에서 left로 변경

In [1]:
# !pip install transformers

In [1]:
import os
import logging
import random
import sys

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
import torch.nn.functional as f
from torch.utils.data import TensorDataset

from tqdm import tqdm, trange

from torch.nn import CrossEntropyLoss

import numpy as np
import pandas as pd
from matplotlib import rc
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef, f1_score, recall_score, precision_score

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

import transformers
transformers.logging.set_verbosity_error()

In [4]:
print("Load Tokenizer")
tokenizer = RobertaTokenizer.from_pretrained('microsoft/graphcodebert-base')
tokenizer.truncation_side = 'left'

Load Tokenizer


In [5]:
from transformers import RobertaForSequenceClassification, AdamW, RobertaConfig
import torch.nn as nn

config = RobertaConfig.from_pretrained('microsoft/graphcodebert-base')

pretrained = RobertaForSequenceClassification.from_pretrained('microsoft/graphcodebert-base')
pretrained

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

# 데이터 처리

In [6]:
def text_to_wordlist(text):
    text = text.split('\n')

    new_text = []
    for line in text:
        new_text.append(line)

    text = '\n'.join(new_text)
    return text

In [7]:
def create_encoding(df):
    test_encoded_dict = tokenizer(
        df['code1'].tolist(),
        df['code2'].tolist(),

        max_length = 512,
        truncation = True,
        return_token_type_ids = True,
        padding = "max_length",
        return_tensors = 'pt'
      )
    
    return test_encoded_dict

# 데이터셋 만들기

In [8]:
import json

url_to_code={}
jsonl_file = './clonedetection/dataset/data.jsonl'
with open(jsonl_file) as f:
    for line in f:
        lin = line.strip()
        js = json.loads(line)
        url_to_code[js['idx']]=js['func']

In [9]:
d = []
with open('./clonedetection/dataset/train-0.txt') as f:
    for line in f:
        line = line.strip()
        url1, url2, label = line.split('\t')
        d.append({
            'code1': url_to_code[url1],
            'code2': url_to_code[url2],
            'similar': int(label)
        })
train_data = pd.DataFrame.from_records(d)
train_data

Unnamed: 0,code1,code2,similar
0,import sys read = sys.stdin.read readline = sy...,"N, K = map(int, input().split()) A = list(map(...",1
1,"r,c = map(int,input().split()) a = [list(map(i...",''' ITP-1_7-C ??¨?¨???? ??¨?¨??????????????°?...,1
2,import sys input = lambda: sys.stdin.readline(...,"n, k = map(int, input().split()) a = list(map(...",1
3,"n = int(input()) a=list(map(int,input().split(...","n = int(input()) a = list(map(int, input().spl...",1
4,n = int(input()) print(10 - n // 200),x = int(input()) x //= 200 print(10 - x),1
...,...,...,...
680866,i = int(input()) x = 100000 for _ in range(i):...,"def mlt(): return map(int, input().split()) x,...",0
680867,import sys import itertools import collections...,import sys import math import heapq import bis...,1
680868,"def add(x,y): return x+y def sub(x,y): ...","ope = {""+"": lambda a, b: b + a, \t ""-"": lamb...",1
680869,"cards = { 'S':[r for r in range(1,13+1)], ...","import sys SUITS = ['S', 'H', 'C', 'D'] exist_...",1


In [10]:
d = []
with open('./clonedetection/dataset/valid-0.txt') as f:
    for line in f:
        line = line.strip()
        url1, url2, label = line.split('\t')
        d.append({
            'code1': url_to_code[url1],
            'code2': url_to_code[url2],
            'similar': int(label)
        })
valid_data = pd.DataFrame.from_records(d)
valid_data

Unnamed: 0,code1,code2,similar
0,"def selectionSort(arr, N): counter = 0 ...","def insertion_sort(A, n, g): global cnt ...",0
1,"a,b,c = [int(i) for i in input().split()] res ...","a, b, c = map(int, raw_input().split()) print ...",0
2,"N = int(input()) A = list(map(int,input().spli...",i=0 while True: i+=1 x = int(input()) ...,0
3,def f(n): cnt = 0 while n: ...,import sys N = int(sys.stdin.readline().strip(...,1
4,"n = int(input()) a = list(map(int, input().spl...","N = int(input()) A_list = list(map(int, input(...",1
...,...,...,...
47143,"N,K = map(int,input().split()) import numpy as...","from sys import stdin n, k = [int(x) for x in ...",1
47144,"n,m,k=map(int,input().split()) mod1,mod2=10**9...","import numpy as np N, M, K = [int(_) for _ in ...",1
47145,global graph global seen token=0 def dfs(i): ...,import collections d=collections.deque() for _...,0
47146,"a, b = [int(tem) for tem in input().split()] d...","x = int(input("""")) if (x >= 1) or (x <= 100): ...",0


In [11]:
train_data['similar'] = train_data['similar'].astype('category')
train_data['code1'] = train_data['code1'].apply(lambda x: text_to_wordlist(x))
train_data['code2'] = train_data['code2'].apply(lambda x: text_to_wordlist(x))

valid_data['similar'] = valid_data['similar'].astype('category')
valid_data['code1'] = valid_data['code1'].apply(lambda x: text_to_wordlist(x))
valid_data['code2'] = valid_data['code2'].apply(lambda x: text_to_wordlist(x))

In [None]:
encoded_dict_train = create_encoding(train_data)
encoded_dict_valid = create_encoding(valid_data)

In [None]:
vals_train = train_data['similar'].tolist()
vals_train = torch.Tensor(vals_train)

vals_valid = valid_data['similar'].tolist()
vals_valid = torch.Tensor(vals_valid)

train_dataset = TensorDataset(encoded_dict_train['input_ids'], encoded_dict_train['token_type_ids'], vals_train)
valid_dataset = TensorDataset(encoded_dict_valid['input_ids'], encoded_dict_valid['token_type_ids'], vals_valid)

In [None]:
print('{:>5,} training samples'.format(len(train_dataset)))
print('{:>5,} validation samples'.format(len(valid_dataset)))

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 4

train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size,
            pin_memory=True
        )

validation_dataloader = DataLoader(
            valid_dataset, 
            sampler = SequentialSampler(valid_dataset), 
            batch_size = batch_size,
            pin_memory=True
        )

# 모델 설정

In [None]:
weight=torch.tensor([0.64,0.36]).to(device)
loss_fct = CrossEntropyLoss(weight=weight, label_smoothing=0.05)

In [None]:
class BertModelModified(nn.Module):
    def __init__(self, my_pretrained_model):
        super(BertModelModified, self).__init__()
        self.pretrained = my_pretrained_model

    def forward(self, input_ids=None, token_type_ids=None, labels=None): 
        outputs = self.pretrained( 
            input_ids,
            token_type_ids=token_type_ids,
            labels = labels
        )
        out = outputs[1]
        loss = loss_fct(outputs[1].view(-1, 2), labels.view(-1))
        return out, loss

In [None]:
model = BertModelModified(my_pretrained_model=pretrained)
model.to(device)

# 학습

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr = 1.04e-5, eps=1e-8)

In [None]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
import random
import numpy as np
import torch.nn as nn

seed_val = 12345
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 예시를 위해서 1로 설정
epochs = 4

logging_steps = len(train_dataloader) // 100
save_steps = len(train_dataloader) // 4

for epoch_i in range(epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_train_loss = 0
    model.train()
    bar = tqdm(train_dataloader)
    for step, batch in enumerate(bar):
        # batch의 구성이 input_id, token_type_id, label로 구성이 되어있으므로
        # 순서대로 받아온 다음에 모델에 입력
        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_labels = batch[2].long().to(device)

        model.zero_grad()

        logits, loss = model(
            input_ids=b_input_ids,
            token_type_ids=b_token_type_ids,
            labels=b_labels
        )

        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        avg_train_loss = total_train_loss / len(train_dataloader)
        bar.set_description("epoch {} loss {:.5f}".format(epoch_i, avg_train_loss))
        if logging_steps % len(train_dataloader) == 0 and not step == 0:
            print('step : {:>5,} of {:>5,} loss: {:.5f}'.format(step, len(train_dataloader), loss.item()))
        if step % save_steps == 0 and not step == 0:
            print("")
            print("  Average training loss: {0:.5f}".format(avg_train_loss))

            # Validation
            print("")
            print("Running Validation...")

            model.eval()
            total_eval_accuracy = 0
            total_eval_loss = 0
            nb_eval_steps = 0

            y_logits = []
            y_trues = []
            for step, batch in enumerate(validation_dataloader):
                b_input_ids = batch[0].to(device)
                b_token_type_ids = batch[1].to(device)
                b_labels = batch[2].long().to(device)

                with torch.no_grad():
                    logits, loss = model(
                        input_ids=b_input_ids,
                        token_type_ids=b_token_type_ids,
                        labels=b_labels
                    )
                    total_eval_loss += loss.item()
                    logits = logits.detach().cpu().numpy()
                    label_ids = b_labels.to('cpu').numpy()
                    total_eval_accuracy += flat_accuracy(logits, label_ids)
                    y_logits.append(logits)
                    y_trues.append(label_ids)
            y_logits = np.concatenate(y_logits, 0)
            y_trues = np.concatenate(y_trues, 0)
            y_preds = y_logits[:, 1] > 0.5
            f1 = f1_score(y_trues, y_preds)
            recall = recall_score(y_trues, y_preds)
            precision = precision_score(y_trues, y_preds)
            avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)

            print("Accuracy: {0:.2f}".format(avg_val_accuracy))
            print('f1: {:.3f}'.format(f1))
            print('precision: {:.3f}'.format(precision))
            print('recall: {:.3f}'.format(recall))

            path = './graphcodebert/model-{}-{:.4f}-{:.3f}.pt'.format(str(epoch_i + 1), avg_train_loss, f1)
            torch.save(model.state_dict(), path)

# Submission

In [22]:
testData = pd.read_csv('./data/test.csv')
testData['code1'] = testData['code1'].apply(lambda x: text_to_wordlist(x))
testData['code2'] = testData['code2'].apply(lambda x: text_to_wordlist(x))

In [23]:
encoded_dict_test = create_encoding(testData)

In [24]:
vals = [1]*len(testData)
vals = torch.Tensor(vals)

test_dataset = TensorDataset(encoded_dict_test['input_ids'], encoded_dict_test['token_type_ids'], vals)

In [31]:
batch_size = 10

test_dataloader = DataLoader(
            test_dataset, 
            sampler = SequentialSampler(test_dataset), 
            batch_size = batch_size
        )

In [32]:
model.load_state_dict(torch.load('./graphcodebert/model-4-0.1329-0.968.pt'))

<All keys matched successfully>

In [33]:
print("Running test...")

results = []    

model.eval()
total_eval_loss = 0

for step, batch in enumerate(tqdm(test_dataloader)):
    b_input_ids = batch[0].to(device)
    b_token_type_ids=batch[1].to(device)
    b_labels = batch[2].long().to(device)

    with torch.no_grad():
        logits, loss = model(
            input_ids=b_input_ids, token_type_ids=b_token_type_ids, labels=b_labels
          )
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        results.extend(logits)
print(len(results))

Running test...


100%|█████████████████████████████████████| 17970/17970 [37:09<00:00,  8.06it/s]

179700





In [34]:
testData['similar'] = np.argmax(results, axis=1)

In [35]:
testData

Unnamed: 0,pair_id,code1,code2,similar
0,1,def main():\n s = input()\n if s.count('a') ...,"N,K = map(int,input().split())\nA = list(map(i...",0
1,2,"N,K,Q = map(int,input().split())\npoints = [0]...","N, K, Q = map(int,input().split())\n\nif K > Q...",1
2,3,from itertools import combinations\nn = int(in...,s = input()\nt = input()\nlength_s = len(s)\nl...,0
3,4,"a,b=map(int,input().split())\n\nans1=a+b\nans2...","a, b, c, d = map(int,input().split())\n\nif a ...",0
4,5,S = input()\nK = int(input())\n\nind = -1\nfor...,"H, W = map(int, input().split())\ngrid = []\nf...",0
...,...,...,...,...
179695,179696,N = int(input())\nS = input()\nS_rep = S.repla...,N = int(input())\nS = list(input())\n\nrow = [...,1
179696,179697,import sys\ndef input(): return sys.stdin.read...,"h,w,a,b = (int(i) for i in input().split())\nf...",1
179697,179698,"a = list(map(int, input().split()))\n\na1 = ab...","print('YNEOS'[sorted(input().split())!=['1','4...",0
179698,179699,"\n\nH,W,A,B = map(int,input().split())\n\ns = ...","h, w, a, b = map(int, input().split())\nmat = ...",1


In [36]:
testData.to_csv('./graphcodebert/submission-e4.csv', columns=['pair_id', 'similar'], index=False)