In [2]:
import json

In [3]:
import os
# Name of the per-language working directory; it also selects the source/target
# file extensions when the unscored pairs are built further down.
language = 'java'

# with_score.json supplies the scored problem ids ('train_problems',
# 'val_problems', 'test_problems') and examples ('train_data', 'val_data',
# 'test_data') read by the following cells.
data_file = os.path.join(language, 'with_score.json')
# Use a context manager so the file handle is closed as soon as parsing is done.
with open(data_file) as f:
    data = json.load(f)

In [4]:
# Map every "<contest>_<problem>" key to the directory of that problem under
# ../raw_data (one sub-directory per contest, one entry per problem inside it).
data_path = os.path.abspath('../raw_data')
contests = os.listdir(data_path)
all_problems = {}
for contest_name in contests:
    contest_dir = os.path.join(data_path, contest_name)
    all_problems.update({
        f"{contest_name}_{problem_name}": os.path.join(contest_dir, problem_name)
        for problem_name in os.listdir(contest_dir)
    })

In [5]:
# Problem ids per split. 'score' comes straight from with_score.json;
# 'no_score' starts empty and is filled once the unscored problems are divided.
train_problems = {'score': data['train_problems'], 'no_score': []}
valid_problems = {'score': data['val_problems'], 'no_score': []}
test_problems = {'score': data['test_problems'], 'no_score': []}

In [6]:
# Number of scored problems in the train, valid and test splits.
print(*(len(split['score']) for split in (train_problems, valid_problems, test_problems)))

68 9 9


In [7]:
# Ids of all problems that already have scores, across the three splits.
with_score_problem_ids = data['train_problems'] + data['val_problems'] + data['test_problems']
all_problem_ids = list(all_problems.keys())
# Everything on disk that has no score. The set difference is sorted because
# iteration order of a set of strings changes between interpreter runs (string
# hash randomization); without a fixed order the seeded shuffle that divides
# these ids would produce a different split on every kernel restart.
wo_score_problem_ids = sorted(set(all_problem_ids).difference(with_score_problem_ids))

print(len(all_problem_ids), len(with_score_problem_ids), len(wo_score_problem_ids))

361 86 275


In [8]:
import numpy as np
# Seed NumPy's global RNG; the np.random.shuffle and np.random.choice calls
# further down draw from it.
np.random.seed(4000)

def divide_problems(data, holdout_fraction=0.1):
    """Randomly split ``data`` into train, validation and test portions.

    The validation and test portions each hold
    ``ceil(len(data) * holdout_fraction)`` items; whatever is left is the
    train portion. Shuffling uses NumPy's global RNG, so seed it beforehand
    if the split has to be repeatable.

    Args:
        data: Sequence of items to split. It is copied first, so the caller's
            object is not reordered.
        holdout_fraction: Share of the items given to the validation portion
            and, separately, to the test portion. Must lie in ``[0, 0.5]``.

    Returns:
        A ``(train, valid, test)`` tuple of lists.

    Raises:
        ValueError: If ``holdout_fraction`` is outside ``[0, 0.5]``.
    """
    if not 0 <= holdout_fraction <= 0.5:
        raise ValueError(
            f"holdout_fraction must be between 0 and 0.5, got {holdout_fraction}"
        )
    # Shuffle a copy: np.random.shuffle works in place and would otherwise
    # silently reorder the list owned by the caller.
    shuffled = list(data)
    np.random.shuffle(shuffled)
    holdout = int(np.ceil(len(shuffled) * holdout_fraction))
    # Layout of the shuffled list: [test | valid | train].
    return shuffled[2 * holdout:], shuffled[holdout:2 * holdout], shuffled[:holdout]

# Divide the unscored problem ids and store each part under the 'no_score'
# key of the matching split.
trwo, vwo, two = divide_problems(wo_score_problem_ids)

print(len(trwo), len(vwo), len(two))
for split_problems, unscored_ids in (
    (train_problems, trwo),
    (valid_problems, vwo),
    (test_problems, two),
):
    split_problems['no_score'] = unscored_ids

# Final problem-id assignment per split, each with its 'score' and 'no_score' lists.
problem_ids = {
    'train': train_problems,
    'valid': valid_problems,
    'test': test_problems
}

# Persist the assignment next to the other per-language files. The `with`
# block closes the file on exit, so no explicit close() is needed.
with open(os.path.join(language, 'problem_ids.json'), 'w') as f:
    json.dump(problem_ids, f, indent=4)

219 28 28


In [9]:
# Output directories below <language>/ for the three dataset flavours.
without_score_folder = os.path.join(language, "wo_score")
with_score_folder = os.path.join(language, 'with_score')
full_score_folder = os.path.join(language, 'full_score')
for out_dir in (without_score_folder, with_score_folder, full_score_folder):
    os.makedirs(out_dir, exist_ok=True)

# Scored examples for each split, as loaded from with_score.json.
train_with = data['train_data']
valid_with = data['val_data']
test_with = data['test_data']

In [10]:
# Write the scored examples of each split to <with_score_folder>/<split>.jsonl,
# one JSON object per line. The loop deliberately does not reuse the names
# `data` (the loaded JSON) or `data_file` (its path), so this loop leaves them intact.
for name, examples in [
    ('train', train_with),
    ('valid', valid_with),
    ('test', test_with)
]:
    print(name)
    # `with` guarantees the file is flushed and closed even if a record fails
    # to serialize.
    with open(os.path.join(with_score_folder, name + ".jsonl"), "w") as out_file:
        for d in examples:
            # Rename 'base_sample_code' to 'code' in place: the same record
            # objects are written again when the full_score files are built.
            # The guard makes the rename a no-op if this cell is run twice.
            if 'base_sample_code' in d:
                d['code'] = d.pop('base_sample_code')
            out_file.write(json.dumps(d) + "\n")

train
valid
test


In [11]:
from typing import List, Dict, Union, Tuple
import copy


def read_files(
    directory: str,
    ext: str
) -> Dict[str, str]:
    """Read every file in ``directory`` whose name ends with ``ext``.

    Args:
        directory: Directory to scan (not recursive).
        ext: Suffix a file name must end with. This is a plain
            ``str.endswith`` match; no dot is inserted before ``ext``.

    Returns:
        Dict mapping file name (not full path) to the file's text content,
        with keys in sorted file-name order.
    """
    contents = {}
    # os.listdir returns names in arbitrary order; sorting keeps the key order
    # stable so that seeded sampling over these keys is repeatable.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(ext):
            continue
        # Close each handle right after reading instead of leaving one open
        # file per source until garbage collection.
        with open(os.path.join(directory, file_name)) as source_file:
            contents[file_name] = source_file.read()
    return contents

def sample_code(
    pid: str,
    samples: Union[Dict[str, str], List[Tuple[str, Dict[str, str]]]],
    n: int,
    score: float = 0.
):
    """Pick up to ``n`` distinct code samples and wrap them as comparison records.

    Args:
        pid: Problem id used to prefix sample names when ``samples`` is a dict.
            It is not used when ``samples`` is a list.
        samples: Either a ``{file_name: code}`` dict belonging to ``pid``, or a
            list of ``(problem_id, {file_name: code})`` pairs, in which case
            each name is prefixed with its own problem id.
        n: Maximum number of samples to return.
        score: Value stored under ``'score'`` in every returned record.

    Returns:
        List of dicts with keys ``'code'``, ``'comparison_sample_name'``
        (``"<problem id>_<file name>"``) and ``'score'``. If at most ``n``
        candidates exist, all are returned in their original order; otherwise
        ``n`` of them are drawn with NumPy's global RNG. The list is empty
        when there are no candidates.
    """
    modified_samples = {}
    if isinstance(samples, list):
        for nid, nsamples in samples:
            for file_name, code in nsamples.items():
                modified_samples[nid + "_" + file_name] = code
    else:
        for file_name, code in samples.items():
            modified_samples[pid + "_" + file_name] = code
    samples_keys = list(modified_samples)
    if len(samples_keys) == 0:
        return []
    if len(samples_keys) <= n:
        chosen_keys = samples_keys
    else:
        # replace=False: np.random.choice samples WITH replacement by default,
        # which would return the same sample several times. tolist() turns the
        # NumPy string scalars back into plain Python strings.
        chosen_keys = np.random.choice(samples_keys, size=n, replace=False).tolist()
    return [
        {
            'code': modified_samples[k],
            'comparison_sample_name': k,
            'score': score
        } for k in chosen_keys
    ]

def _print_length_summary(label: str, lengths: List[int]) -> None:
    """Print min, mean and max of ``lengths`` (rounded to 2 places) after ``label``."""
    if not lengths:
        # np.min / np.max raise ValueError on an empty sequence.
        print(f"{label}: n/a\tn/a\tn/a")
        return
    low, mean, high = (
        round(stat(lengths).item(), 2) for stat in (np.min, np.mean, np.max)
    )
    print(f"{label}: {low}\t{mean}\t{high}")


def prepare_without_score(
    problem_to_path: Dict[str, str], 
    problem_ids: List[str], 
    src_ext: str, 
    tgt_ext:str,
    n: int
):
    """Build unscored examples that pair each source file with sampled targets.

    For every problem in ``problem_ids`` and every file of that problem ending
    in ``src_ext``, one record is produced. Its positives are sampled from the
    same problem's ``tgt_ext`` files, its negatives from the ``tgt_ext`` files
    of all other problems in ``problem_ids``. A min/mean/max summary of the
    positive and negative counts is printed at the end.

    Args:
        problem_to_path: Maps a problem id to the directory with its files.
        problem_ids: Problems to build examples for; also the pool negatives
            are drawn from.
        src_ext: File-name suffix of the source-side files.
        tgt_ext: File-name suffix of the target-side files.
        n: Maximum number of positives and of negatives per record.

    Returns:
        List of dicts with keys ``'base_sample_name'``
        (``"<problem id>_<file name>"``), ``'code'``, ``'positives'`` and
        ``'negatives'``; the last two are lists produced by ``sample_code``.
        Empty when ``problem_ids`` is empty or no source files are found.
    """
    data = []
    # Read every problem's source and target files once, up front.
    pid_to_code = {
        p: {
            src_ext: read_files(
                directory=problem_to_path[p], 
                ext=src_ext
            ),
            tgt_ext: read_files(
                directory=problem_to_path[p], 
                ext=tgt_ext
            )
        } for p in problem_ids
    }
    pos_lengths = []
    neg_lengths = []
    for pid in problem_ids:
        source_codes = pid_to_code[pid][src_ext]
        positive_samples = pid_to_code[pid][tgt_ext]
        # Negatives come from every other problem. Filtering by inequality
        # (rather than removing one list element) also keeps a problem out of
        # its own negatives if its id appears more than once in problem_ids.
        negative_samples = [
            (nid, pid_to_code[nid][tgt_ext]) for nid in problem_ids if nid != pid
        ]
        for file_name, source_code in source_codes.items():
            # Positives are drawn before negatives for each source file, so
            # the order of RNG draws stays fixed.
            positive_codes = sample_code(pid, positive_samples, n=n)
            pos_lengths.append(len(positive_codes))
            negative_codes = sample_code(pid, negative_samples, n=n)
            neg_lengths.append(len(negative_codes))
            data.append({
                'base_sample_name': pid + '_' + file_name,
                'code': source_code,
                'positives': positive_codes,
                'negatives': negative_codes
            })
    _print_length_summary("Positive Codes", pos_lengths)
    _print_length_summary("Negative Codes", neg_lengths)
    print("=" * 100)
    return data
    
# Arguments shared by all three splits: the source side is the notebook's
# `language`, the target side is the other one of python/java.
shared_args = {
    'problem_to_path': all_problems,
    'src_ext': 'py' if language=='python' else 'java',
    'tgt_ext': 'py' if language=='java' else 'java',
}

# Train and valid keep at most 5 positives and 5 negatives per source file.
train_wo = prepare_without_score(
    problem_ids=train_problems['no_score'],
    n=5,
    **shared_args
)

valid_wo = prepare_without_score(
    problem_ids=valid_problems['no_score'],
    n=5,
    **shared_args
)

# The test set covers scored and unscored test problems; the very large n
# makes sample_code return every candidate instead of a sample.
test_wo = prepare_without_score(
    problem_ids=test_problems['no_score'] + test_problems['score'],
    n=1000000,
    **shared_args
)
print(len(train_wo), len(valid_wo), len(test_wo))

Positive Codes: 0	4.83	5
Negative Codes: 5	5.0	5
Positive Codes: 0	4.91	5
Negative Codes: 5	5.0	5
Positive Codes: 0	110.79	202
Negative Codes: 1950	2041.21	2152
7455 1448 1698


In [12]:
# Write the unscored examples of each split to <without_score_folder>/<split>.jsonl,
# one JSON object per line. The loop variable is not called `data`, so the
# JSON loaded at the top of the notebook is not overwritten by this loop.
for name, examples in [
    ('train', train_wo), 
    ('valid', valid_wo), 
    ('test', test_wo)
]:
    print(name)
    # `with` closes the file even if serializing a record raises.
    with open(os.path.join(without_score_folder, name + ".jsonl"), "w") as out_file:
        for d in examples:
            out_file.write(json.dumps(d) + "\n")

train
valid
test


In [13]:
# Combine unscored and scored examples for train and valid, then shuffle each
# list in place with NumPy's global RNG. The test file contains test_wo only.
train = train_wo + train_with
valid = valid_wo + valid_with
np.random.shuffle(train)
np.random.shuffle(valid)
# One JSON object per line in <full_score_folder>/<split>.jsonl. The loop
# variable is not called `data`, so the JSON loaded at the top of the notebook
# is not overwritten by this loop.
for name, examples in [
    ('train', train), 
    ('valid', valid), 
    ('test', test_wo)
]:
    print(name)
    # `with` closes the file even if serializing a record raises.
    with open(os.path.join(full_score_folder, name + ".jsonl"), "w") as out_file:
        for d in examples:
            out_file.write(json.dumps(d) + "\n")

train
valid
test
