In [1]:
import pandas as pd 
import os 
import shutil
import random 
import glob 
from bs4 import BeautifulSoup
from tqdm import tqdm 

In [2]:
PATH_TO_CODENET = "/data1/shypula/Project_CodeNet"
PATH_TO_METADATA = "/data1/shypula/Project_CodeNet/metadata"
PATH_TO_PROBLEM_DESC = "/data1/shypula/Project_CodeNet/problem_descriptions_translated"
PATH_TO_DATA = "/data1/shypula/Project_CodeNet/data"
PATH_TO_TESTCASES = "/data1/shypula/codenet_testcases"
PATH_TO_PROBLEM_DESC_ORIG = "/data1/shypula/Project_CodeNet/problem_descriptions"


# problem desc is /{problem_id}.html
# metadata is /{problem_id}.csv
# data is /{problem_id}/LANG/{submission_id}.{ext}
# testcases is /{problem_id}/input.*.txt and /{problem_id}/output.*.txt

def html_string_to_text(html_string):
    soup = BeautifulSoup(html_string, 'html.parser')
    return soup.get_text()


def print_n_from_dict(d, n=5, max_len=100):
    for i, (k, v) in enumerate(d.items()):
        if type(v) == str and len(v) > max_len:
            print(f"{k}: {v[:max_len]}...")
        else: 
            print(f"{k}: {v}")
        if i > n:
            break



In [3]:
problem_id_2_html = {}
problem_id_2_text = {}
import traceback

all_html_files = glob.glob(os.path.join(PATH_TO_PROBLEM_DESC, "*.html"))
for html_file in tqdm(all_html_files):
    problem_id = os.path.basename(html_file).replace(".html", "")
    with open(html_file, "r") as f:
        html_string = f.read()
        problem_id_2_html[problem_id] = html_string
        try: 
            problem_id_2_text[problem_id] = html_string_to_text(html_string)
        except Exception as e:
            problem_id_2_text[problem_id] = html_string
            print(f"Error processing {problem_id}")
            traceback.print_exc()
        

        
print_n_from_dict(problem_id_2_text, 2)
print_n_from_dict(problem_id_2_html, 2)

 84%|████████▍ | 3375/3999 [00:07<00:01, 536.09it/s]Traceback (most recent call last):
  File "/tmp/ipykernel_1921952/1050089094.py", line 12, in <cell line: 6>
    problem_id_2_text[problem_id] = html_string_to_text(html_string)
  File "/tmp/ipykernel_1921952/2456984889.py", line 15, in html_string_to_text
    soup = BeautifulSoup(html_string, 'html.parser')
  File "/home/shypula/anaconda3/envs/py39/lib/python3.9/site-packages/bs4/__init__.py", line 344, in __init__
    raise ParserRejectedMarkup(
bs4.builder.ParserRejectedMarkup: The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.

Original exception(s) from parser:
 unknown status keyword 'C' in marked section
 87%|████████▋ | 3488/3999 [00:07<00:00, 543.27it/s]

Error processing p00194


100%|██████████| 3999/3999 [00:08<00:00, 450.01it/s]

p03466: 
Score : 1100 points


Problem StatementLet f(A, B), where A and B are positive integers, be the str...
p03036: 
Score : 200 points


Problem StatementThe development of algae in a pond is as follows.
Let the tot...
p02277: 
Quick Sort


Let's arrange a deck of cards. Your task is to sort totally n cards. A card consists o...
p03935: 

				Max Score: $1400$ Points 

Problem Statement
					E869120 defined a sequence $a$ like this: 

...
p03466: <span class="lang-en">
<p>Score : <var>1100</var> points</p>
<div class="part">
<section>
<h3>Proble...
p03036: <span class="lang-en">
<p>Score : <var>200</var> points</p>
<div class="part">
<section>
<h3>Problem...
p02277: 
<H1>Quick Sort</H1>
<!--
<p style="color:#f00">
Please note that problem description and judge data...
p03935: <span class="lang-en lang-child hidden-lang">
<div class="part">
				Max Score: $1400$ Points <br/>
...





In [4]:
problem_id_2_html_orig = {}
problem_id_2_text_orig = {}

all_html_files = glob.glob(os.path.join(PATH_TO_PROBLEM_DESC_ORIG, "*.html"))
for html_file in tqdm(all_html_files):
    problem_id = os.path.basename(html_file).replace(".html", "")
    with open(html_file, "r") as f:
        html_string = f.read()
        problem_id_2_html_orig[problem_id] = html_string
        problem_id_2_text_orig[problem_id] = html_string_to_text(html_string)
        

100%|██████████| 3999/3999 [00:08<00:00, 491.75it/s]


In [5]:
print(pd.Series(problem_id_2_text).apply(len).describe())
print(pd.Series(problem_id_2_html).apply(len).describe())

count     3999.000000
mean      1847.297324
std       1090.382192
min          0.000000
25%       1139.500000
50%       1643.000000
75%       2323.500000
max      20297.000000
dtype: float64
count     3999.000000
mean      2672.850963
std       1293.106508
min          1.000000
25%       1880.000000
50%       2529.000000
75%       3250.500000
max      27218.000000
dtype: float64


In [6]:
print(pd.Series(problem_id_2_text_orig).apply(len).describe())
print(pd.Series(problem_id_2_html_orig).apply(len).describe())

count     3999.000000
mean      1591.359840
std       1061.352577
min          1.000000
25%        947.000000
50%       1345.000000
75%       1915.000000
max      20297.000000
dtype: float64
count     3999.000000
mean      2506.493623
std       1319.754792
min          1.000000
25%       1699.000000
50%       2315.000000
75%       3050.000000
max      27218.000000
dtype: float64


In [7]:
import re

problem_id_to_inputs = {}
problem_id_to_outputs = {}


for problem_id in tqdm(problem_id_2_text):
    tc2input = {}
    tc2output = {}
    input_files = glob.glob(os.path.join(PATH_TO_TESTCASES, problem_id, "input.*.txt"))
    # output_files = glob.glob(os.path.join(PATH_TO_TESTCASES, problem_id, "output.*.txt"))
    if len(input_files) > 0:
        input_tc_nos = [re.search(r"input\.(\d+)\.txt", f).group(1) for f in input_files]
        for input_tc_no in input_tc_nos:
            input_tc_path = os.path.join(PATH_TO_TESTCASES, problem_id, f"input.{input_tc_no}.txt")
            output_tc_path = os.path.join(PATH_TO_TESTCASES, problem_id, f"output.{input_tc_no}.txt")
            assert os.path.exists(input_tc_path)
            assert os.path.exists(output_tc_path), f"Missing output file for {input_tc_path}"
            with open(input_tc_path, "r") as f:
                input_tc = f.read().strip()
            with open(output_tc_path, "r") as f:
                output_tc = f.read().strip()
            tc2input[input_tc_no] = input_tc
            tc2output[input_tc_no] = output_tc
    problem_id_to_inputs[problem_id] = tc2input
    problem_id_to_outputs[problem_id] = tc2output
    
print_n_from_dict(problem_id_to_inputs, 2)
print_n_from_dict(problem_id_to_outputs, 2)

100%|██████████| 3999/3999 [01:44<00:00, 38.18it/s]

p03466: {'12': '5\n2 3 1 5\n9 4 1 10\n2 3 6 4\n6 4 3 1\n8 2 5 8', '52': '5\n0 6 1 5\n6 4 1 2\n2 3 6 4\n6 4 3 2\n12 2 4 8', '28': '5\n2 3 1 5\n6 4 1 10\n2 3 4 4\n6 7 3 7\n8 20 5 14', '84': '5\n2 3 1 5\n6 4 2 10\n2 3 4 4\n6 7 5 0\n16 20 5 14', '26': '5\n2 3 1 5\n6 4 1 10\n2 3 4 4\n6 4 3 7\n8 20 5 14', '50': '5\n2 6 1 7\n9 4 1 10\n2 3 6 4\n6 4 3 7\n12 2 5 7', '11': '5\n2 3 1 5\n6 7 1 10\n2 3 4 4\n6 7 3 7\n8 10 5 14', '44': '5\n0 5 1 5\n6 4 2 3\n1 5 6 4\n8 4 3 1\n12 2 5 0', '30': '5\n2 6 1 7\n9 4 1 10\n2 3 6 4\n6 4 3 7\n12 2 5 8', '23': '5\n1 6 1 5\n1 4 1 2\n2 3 6 3\n6 4 3 7\n8 2 5 8', '62': '5\n3 3 1 5\n8 4 1 10\n4 3 4 4\n6 4 3 7\n8 2 5 8', '13': '5\n2 6 1 5\n9 4 1 2\n2 3 6 4\n6 4 3 7\n7 2 5 8', '74': '5\n3 3 1 5\n8 4 1 10\n4 3 4 4\n6 4 3 7\n8 2 10 8', '65': '5\n1 6 1 5\n17 4 1 2\n2 3 6 2\n6 4 3 7\n7 2 5 8', '63': '5\n2 3 1 5\n6 4 1 10\n2 3 4 4\n6 7 3 0\n16 20 5 14', '93': '5\n3 3 1 5\n18 4 1 10\n4 3 4 4\n6 4 3 7\n8 2 10 8', '40': '5\n0 6 1 5\n6 4 2 2\n2 3 6 4\n6 4 3 1\n12 0 5 8', '61': '




In [8]:
def filter_problem_ids(problem_id, problem_id_to_inputs, problem_id_2_text): 
    if problem_id not in problem_id_to_inputs:
        return False
    if len(problem_id_to_inputs[problem_id]) == 0:
        return False
    if len(problem_id_2_text[problem_id]) < 100:
        return False
    return True


valid_problem_ids = [problem_id for problem_id in problem_id_2_text if filter_problem_ids(problem_id, problem_id_to_inputs, problem_id_2_text)]
print(f"Number of valid problem ids: {len(valid_problem_ids)}")

Number of valid problem ids: 3898


In [9]:
# sample train, test and val
# train should be ~70-75 pct, test and val is the rest split in half 

random.seed(42)
random.shuffle(valid_problem_ids)
n = len(valid_problem_ids)
train_n = int(0.7 * n)
test_n = (n - train_n) // 2
val_n = n - train_n - test_n
train_problem_ids = valid_problem_ids[:train_n]
test_problem_ids = valid_problem_ids[train_n:train_n+test_n]
val_problem_ids = valid_problem_ids[train_n+test_n:]

print(f"Train: {len(train_problem_ids)}")
print(f"Test: {len(test_problem_ids)}")
print(f"Val: {len(val_problem_ids)}")

Train: 2728
Test: 585
Val: 585


In [10]:
# each record should be a dict with the following keys:
# description_html
# description_string
# input_testcases
# output_testcases
# orig_lang_html
# orig_lang_string

train_records = []
test_records = []
val_records = []

for ids, records in [(train_problem_ids, train_records), (test_problem_ids, test_records), (val_problem_ids, val_records)]:
    for problem_id in ids:
        record = {}
        record["description_html"] = problem_id_2_html[problem_id]
        record["description_string"] = problem_id_2_text[problem_id]
        record["input_testcases"] = problem_id_to_inputs[problem_id]
        record["output_testcases"] = problem_id_to_outputs[problem_id]
        record["orig_lang_html"] = problem_id_2_html_orig[problem_id]
        record["orig_lang_string"] = problem_id_2_text_orig[problem_id]
        record["codenet_problem_id"] = problem_id
        records.append(record)

print(f"Train records: {len(train_records)}")
print(f"Test records: {len(test_records)}")
print(f"Val records: {len(val_records)}")

print(train_records[0])



Train records: 2728
Test records: 585
Val records: 585
{'description_html': '<span class="lang-en">\n<p>Score : <var>600</var> points</p>\n<div class="part">\n<section>\n<h3>Problem Statement</h3>\n<p>Takahashi has <var>N</var> cards. The <var>i</var>-th of these cards has an integer <var>A_i</var> written on it.</p>\n<p>Takahashi will choose an integer <var>K</var>, and then repeat the following operation some number of times:</p>\n<ul>\n<li>Choose exactly <var>K</var> cards such that the integers written on them are all different, and eat those cards. (The eaten cards disappear.)</li>\n</ul>\n<p>For each <var>K = 1,2, \\ldots, N</var>, find the maximum number of times Takahashi can do the operation.</p>\n</section>\n</div>\n<div class="part">\n<section>\n<h3>Constraints</h3>\n<ul>\n<li><var> 1 \\le N \\le 3 \\times 10^5 </var></li>\n<li><var> 1 \\le A_i \\le N </var></li>\n<li>All values in input are integers.</li>\n</ul>\n</section>\n</div>\n<hr/>\n<div class="io-style">\n<div class

In [11]:
PATH_TO_OUTPUT = "/data1/shypula/Project_CodeNet/processed"
if not os.path.exists(PATH_TO_OUTPUT):
    os.makedirs(PATH_TO_OUTPUT)
# save each to a jsonl file, records and lines = True 
train_records_df = pd.DataFrame(train_records)
test_records_df = pd.DataFrame(test_records)
val_records_df = pd.DataFrame(val_records)

train_records_df.to_json(os.path.join(PATH_TO_OUTPUT, "train_descriptions_and_testcases.jsonl"), lines=True, orient="records")
test_records_df.to_json(os.path.join(PATH_TO_OUTPUT, "test_descriptions_and_testcases.jsonl"), lines=True, orient="records")
val_records_df.to_json(os.path.join(PATH_TO_OUTPUT, "val_descriptions_and_testcases.jsonl"), lines=True, orient="records")


In [12]:
# let's now go over the metadatas, sample up to 3000 solutions for eaa
# ch problem
# and save them to a jsonl file

# first make problem_id_to_record 

problem_id_to_record = {}
for record in train_records + test_records + val_records:
    problem_id_to_record[record["codenet_problem_id"]] = record

# now let's write a function to sample up to 3000 solutions for each problem
lang_2_file_ending = {
    "Python": "py",
    "Java": "java",
    "C": "c",
    "C++": "cpp",
    "Rust": "rs",
    "Go": "go",
}

def sample_solutions_for_problem(problem_id, problem_id_to_record, n=3000, language="python"):   
    metadata = pd.read_csv(os.path.join(PATH_TO_METADATA, f"{problem_id}.csv"))
    lang_df = metadata[metadata["language"] == language]
    if len(lang_df) == 0:
        return []
    lang_df = lang_df.sample(n=min(n, len(lang_df)))
    ## now read from the data folder and add to the record
    lang_df["submission_id"] = lang_df["submission_id"].astype(str)
    submission_records = []
    for i, row in lang_df.iterrows():
        submission_id = row["submission_id"]
        submission_path = os.path.join(PATH_TO_DATA, problem_id, language, f"{submission_id}.{lang_2_file_ending[language]}")
        if os.path.exists(submission_path):
            with open(submission_path, "r") as f:
                code = f.read()
            row["code"] = code
            row_dict = row.to_dict()
            # add record to submission_records
            # row_dict = {**row_dict, **problem_id_to_record[problem_id]}
            submission_records.append(row_dict)
    return submission_records
    
    

    

In [13]:
# df = sample_solutions_for_problem("p00705", problem_id_to_record, n=3000, language="Python")
# pd.DataFrame(df)

# def sample_solutions_for_all_problems(problem_ids, problem_id_to_record, n=3000, language="python"):
#     all_records = []
#     for problem_id in tqdm(problem_ids):
#         records = sample_solutions_for_problem(problem_id, problem_id_to_record, n=n, language=language)
#         all_records.extend(records)
#     return all_records

In [17]:
import joblib
from joblib import Parallel, delayed
import contextlib


@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument"""
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()
        
        
def sample_solutions_for_all_problems_parallel(problem_ids, problem_id_to_record, n=3000, language="python"):
    with tqdm_joblib(tqdm(total=len(problem_ids))) as pbar:
        all_records = Parallel(n_jobs=100, backend="threading")(delayed(sample_solutions_for_problem)(problem_id, problem_id_to_record, n=n, language=language) for problem_id in problem_ids)
    # concat all dfs by flattening 
    all_records = [record for records in all_records for record in records]
    all_records_df = pd.DataFrame(all_records)
    return all_records_df


        


In [18]:
# demo_parallel = sample_solutions_for_all_problems_parallel(valid_problem_ids[:10], problem_id_to_record, n=3000, language="Python")

In [19]:
train_records_df = sample_solutions_for_all_problems_parallel(train_problem_ids, problem_id_to_record, n=200, language="Python")
test_records_df = sample_solutions_for_all_problems_parallel(test_problem_ids, problem_id_to_record, n=200, language="Python")
val_records_df = sample_solutions_for_all_problems_parallel(val_problem_ids, problem_id_to_record, n=200, language="Python")

train_records_df.to_json(os.path.join(PATH_TO_OUTPUT, "train_examples.jsonl"), lines=True, orient="records")
test_records_df.to_json(os.path.join(PATH_TO_OUTPUT, "test_examples.jsonl"), lines=True, orient="records")
val_records_df.to_json(os.path.join(PATH_TO_OUTPUT, "val_examples.jsonl"), lines=True, orient="records")

train_records_df = sample_solutions_for_all_problems_parallel(train_problem_ids, problem_id_to_record, n=200, language="C++")
test_records_df = sample_solutions_for_all_problems_parallel(test_problem_ids, problem_id_to_record, n=200, language="C++")
val_records_df = sample_solutions_for_all_problems_parallel(val_problem_ids, problem_id_to_record, n=200, language="C++")

train_records_df.to_json(os.path.join(PATH_TO_OUTPUT, "train_examples_cpp.jsonl"), lines=True, orient="records")
test_records_df.to_json(os.path.join(PATH_TO_OUTPUT, "test_examples_cpp.jsonl"), lines=True, orient="records")
val_records_df.to_json(os.path.join(PATH_TO_OUTPUT, "val_examples_cpp.jsonl"), lines=True, orient="records")


100%|██████████| 2728/2728 [08:01<00:00,  5.67it/s]
100%|██████████| 585/585 [01:51<00:00,  5.24it/s]
100%|██████████| 585/585 [01:34<00:00,  6.18it/s]
100%|██████████| 2728/2728 [14:54<00:00,  3.05it/s]
100%|██████████| 585/585 [03:10<00:00,  3.07it/s]
100%|██████████| 585/585 [03:11<00:00,  3.05it/s]


In [None]:
metadata_3000 = pd.read_csv(os.path.join(PATH_TO_METADATA, "p03245.csv"))
metadata_3000
