# Adapted From CodeRM (Federico Cassano)

In [None]:
import datasets
from tqdm import tqdm
import datetime
import json
import pickle
import zlib
import base64


def parse_date(date):
    """Parse a ``YYYY-MM-DDTHH:MM:SS`` timestamp string into a naive datetime.

    Args:
        date: timestamp text such as ``"2024-04-01T00:00:00"``.

    Returns:
        A ``datetime.datetime`` without timezone information.

    Raises:
        ValueError: propagated from ``strptime`` when *date* does not match
            the expected layout.
    """
    parsed = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
    return parsed
import datetime

# Initial cutoff: midnight on 2023-09-30 (naive datetime).
# NOTE: a later cell reassigns sonnet_cutoff to 2024-04-01 before any filter
# in this file reads it, so this first value is not used by the visible code.
sonnet_cutoff = datetime.datetime(year=2023, month=9, day=30)

def decode_tests(tests):
    """Decode an encoded test-case blob back into Python objects.

    The blob is unwrapped layer by layer:
    base64 decode -> zlib decompress -> unpickle -> JSON parse.

    Args:
        tests: base64 text (or bytes) wrapping a zlib-compressed pickle whose
            payload is a JSON document.

    Returns:
        Whatever the innermost JSON document decodes to.

    NOTE(security): ``pickle.loads`` can execute arbitrary code while
    unpickling. Only call this on blobs from a source you trust.
    """
    compressed = base64.b64decode(tests)
    pickled = zlib.decompress(compressed)
    json_text = pickle.loads(pickled)
    return json.loads(json_text)

# format we want:
# - question: has prompt
# - starter_code: has starter code, if any
# - difficulty: has difficulty
# - input_output: JSON string with all tests (private + public), with fn_name key if needed
# - public_input_output: JSON string with the public tests only, same layout as input_output
# - title: just for metadata
# - source: just for metadata
# - date: just for metadata
# - id: for unique id

def _parse_tests(raw_tests, call_based):
    """Convert raw test records into index-aligned ``inputs``/``outputs`` lists.

    Args:
        raw_tests: iterable of dicts, each with an "input" and an "output" entry.
        call_based: when True, every newline-separated line of "input" and the
            whole "output" are JSON-decoded; when False the values are kept as-is.

    Returns:
        A dict ``{"inputs": [...], "outputs": [...]}`` with one entry per test.
    """
    parsed = {"inputs": [], "outputs": []}
    for test in raw_tests:
        inp = test["input"]
        out = test["output"]

        if call_based:
            # One JSON value per line of input; the output is a single JSON value.
            inp = [json.loads(line) for line in inp.split("\n")]
            out = json.loads(out)

        parsed["inputs"].append(inp)
        parsed["outputs"].append(out)
    return parsed


def clean_and_push(ds, reponame):
    """Normalize every example of *ds* and push the result to the Hub.

    For each example this builds a record with the keys ``question``,
    ``starter_code``, ``difficulty``, ``input_output``, ``public_input_output``,
    ``title``, ``source``, ``date`` and ``id``. ``input_output`` is a JSON string
    holding the decoded private tests followed by the public tests;
    ``public_input_output`` holds the public tests only. When the example's
    metadata contains "func_name", test inputs/outputs are JSON-decoded and the
    name is stored under "fn_name" in both test dicts.

    Args:
        ds: dataset whose examples expose "public_test_cases",
            "private_test_cases", "metadata", "question_content",
            "starter_code", "difficulty", "question_title", "platform",
            "contest_date" and "question_id".
        reponame: target repository name passed to ``push_to_hub``.

    Side effects:
        Prints the target repo name and pushes the cleaned dataset as a
        private "test" split.
    """
    cleaned_ds = []
    for ex in tqdm(ds, total=len(ds)):
        public_raw_tests = json.loads(ex["public_test_cases"])
        # Full test set = private tests followed by the public ones.
        raw_tests = decode_tests(ex["private_test_cases"]) + public_raw_tests
        metadata = json.loads(ex["metadata"])
        call_based = "func_name" in metadata

        tests = _parse_tests(raw_tests, call_based)
        public_tests = _parse_tests(public_raw_tests, call_based)

        if call_based:
            name = metadata["func_name"]
            tests["fn_name"] = name
            public_tests["fn_name"] = name

        cleaned_ds.append(
            {
                "question": ex["question_content"],
                "starter_code": ex["starter_code"],
                "difficulty": ex["difficulty"],
                "input_output": json.dumps(tests),
                "public_input_output": json.dumps(public_tests),
                "title": ex["question_title"],
                "source": ex["platform"],
                "date": ex["contest_date"],
                "id": ex["question_id"],
            }
        )

    cleaned_ds = datasets.Dataset.from_list(cleaned_ds)
    print("pushing to: ", reponame)
    cleaned_ds.push_to_hub(reponame, split="test", private=True)


In [None]:
# Load the "test" split of livecodebench/code_generation_lite, requesting
# version_tag "release_v3".
ds = datasets.load_dataset("livecodebench/code_generation_lite", split="test", version_tag="release_v3")
# Trailing bare expression: shows the dataset summary when run as a notebook cell.
ds

In [None]:
# Rebind the cutoff to midnight 2024-04-01; this replaces the 2023-09-30 value
# assigned near the top of the file.
sonnet_cutoff = datetime.datetime(2024, 4, 1, 0, 0)
# Keep only examples whose contest_date is on or after the cutoff.
ds_decont = ds.filter(lambda ex: parse_date(ex["contest_date"]) >= sonnet_cutoff)
# Trailing bare expression: shows the filtered dataset summary in a notebook cell.
ds_decont

In [None]:
# Clean the on-or-after-cutoff subset and push it to the Hub repo named below.
clean_and_push(ds_decont, "codegenning/livecodebench_lite_v3")

In [None]:
# Complement of ds_decont: examples dated strictly before sonnet_cutoff.
# NOTE: the lambda reads the global sonnet_cutoff when the filter runs, so the
# split depends on which cutoff assignment was executed last.
ds_C = ds.filter(lambda ex: parse_date(ex["contest_date"]) < sonnet_cutoff)
# Trailing bare expression: shows the filtered dataset summary in a notebook cell.
ds_C

In [None]:
# Clean the before-cutoff subset and push it to the Hub repo named below.
clean_and_push(ds_C, "codegenning/livecodebench_lite_v3_C")