In [1]:
import glob
import json
import os
from typing import List

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from sklearn.model_selection import GroupKFold
from sklearn.utils import shuffle
from tqdm.notebook import tqdm

2024-06-25 23:47:24.737157: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-25 23:47:25.158404: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-25 23:47:25.190488: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-25 23:47:25.190550: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudar

In [2]:
!mkdir 'raw' 'tfrec'

mkdir: cannot create directory ‘raw’: File exists
mkdir: cannot create directory ‘tfrec’: File exists


In [3]:
RANDOM_STATE = 42
MD_MAX_LEN = 64
TOTAL_MAX_LEN = 512
K_FOLDS = 5
FILES_PER_FOLD = 16
LIMIT = 10_000
MODEL_NAME = "microsoft/codebert-base"
TOKENIZER = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
INPUT_PATH = "../raw_data/AI4Code"



In [4]:
def read_notebook(path: str) -> pd.DataFrame:
    return (
        pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
        .assign(id=os.path.basename(path).split(".")[0])
        .rename_axis("cell_id")
    )

def clean_code(cell: str) -> str:
    return str(cell).replace("\\n", "\n")

def sample_cells(cells: List[str], n: int) -> List[str]:
    cells = [clean_code(cell) for cell in cells]
    if n >= len(cells):
        return cells
    else:
        results = []
        step = len(cells) / n
        idx = 0
        while int(np.round(idx)) < len(cells):
            results.append(cells[int(np.round(idx))])
            idx += step
        if cells[-1] not in results:
            results[-1] = cells[-1]
        return results

def get_features(df: pd.DataFrame) -> dict:
    features = {}
    for i, sub_df in tqdm(df.groupby("id"), desc="Features"):
        features[i] = {}
        total_md = sub_df[sub_df.cell_type == "markdown"].shape[0]
        code_sub_df = sub_df[sub_df.cell_type == "code"]
        total_code = code_sub_df.shape[0]
        codes = sample_cells(code_sub_df.source.values, 20)
        features[i]["total_code"] = total_code
        features[i]["total_md"] = total_md
        features[i]["codes"] = codes
    return features


def tokenize(df: pd.DataFrame, fts: dict) -> dict:
    input_ids = np.zeros((len(df), TOTAL_MAX_LEN), dtype=np.int32)
    attention_mask = np.zeros((len(df), TOTAL_MAX_LEN), dtype=np.int32)
    features = np.zeros((len(df),), dtype=np.float32)
    labels = np.zeros((len(df),), dtype=np.float32)

    for i, row in tqdm(
        df.reset_index(drop=True).iterrows(), desc="Tokens", total=len(df)
    ):
        row_fts = fts[row.id]

        inputs = TOKENIZER.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=MD_MAX_LEN,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        code_inputs = TOKENIZER.batch_encode_plus(
            [str(x) for x in row_fts["codes"]] or [""],
            add_special_tokens=True,
            max_length=23,
            padding="max_length",
            truncation=True,
        )

        ids = inputs["input_ids"]
        for x in code_inputs["input_ids"]:
            ids.extend(x[:-1])
        ids = ids[:TOTAL_MAX_LEN]
        if len(ids) != TOTAL_MAX_LEN:
            ids = ids + [
                TOKENIZER.pad_token_id,
            ] * (TOTAL_MAX_LEN - len(ids))

        mask = inputs["attention_mask"]
        for x in code_inputs["attention_mask"]:
            mask.extend(x[:-1])
        mask = mask[:TOTAL_MAX_LEN]
        if len(mask) != TOTAL_MAX_LEN:
            mask = mask + [
                TOKENIZER.pad_token_id,
            ] * (TOTAL_MAX_LEN - len(mask))

        input_ids[i] = ids
        attention_mask[i] = mask
        features[i] = (
            row_fts["total_md"] / (row_fts["total_md"] + row_fts["total_code"]) or 1
        )
        labels[i] = row.pct_rank

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "features": features,
        "labels": labels,
    }


def get_ranks(base: pd.Series, derived: List[str]) -> List[str]:
    return [base.index(d) for d in derived]


def _serialize_sample(
    input_ids: np.array,
    attention_mask: np.array,
    feature: np.float64,
    label: np.float64,
) -> bytes:
    feature = {
        "input_ids": tf.train.Feature(int64_list=tf.train.Int64List(value=input_ids)),
        "attention_mask": tf.train.Feature(
            int64_list=tf.train.Int64List(value=attention_mask)
        ),
        "feature": tf.train.Feature(float_list=tf.train.FloatList(value=[feature])),
        "label": tf.train.Feature(float_list=tf.train.FloatList(value=[label])),
    }
    sample = tf.train.Example(features=tf.train.Features(feature=feature))
    return sample.SerializeToString()


def serialize(
    input_ids: np.array,
    attention_mask: np.array,
    features: np.array,
    labels: np.array,
    path: str,
) -> None:
    with tf.io.TFRecordWriter(path) as writer:
        for args in zip(input_ids, attention_mask, features, labels):
            writer.write(_serialize_sample(*args))

## Collect Data

In [5]:
paths = glob.glob(os.path.join(INPUT_PATH, "train_data", "*.json"))
if LIMIT is not None:
    paths = paths[:LIMIT]
df = (
    pd.concat([read_notebook(x) for x in tqdm(paths, desc="Concat")])
    .set_index("id", append=True)
    .swaplevel()
    .sort_index(level="id", sort_remaining=False)
)

df_orders = pd.read_csv(
    os.path.join(INPUT_PATH, "train_orders.csv"),
    index_col="id")
df_orders = df_orders.squeeze().str.split()

df_orders_ = df_orders.to_frame().join(
    df.reset_index("cell_id").groupby("id")["cell_id"].apply(list),
    how="right",
)

ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {"cell_id": cell_id, "rank": get_ranks(cell_order, cell_id)}
df_ranks = (
    pd.DataFrame.from_dict(ranks, orient="index")
    .rename_axis("id")
    .apply(pd.Series.explode)
    .set_index("cell_id", append=True)
)

df_ancestors = pd.read_csv(
    os.path.join(INPUT_PATH, "train_ancestors.csv"), index_col="id"
)
df = (
    df.reset_index()
    .merge(df_ranks, on=["id", "cell_id"])
    .merge(df_ancestors, on=["id"])
)

df["pct_rank"] = df["rank"] / df.groupby("id")["cell_id"].transform("count")
df = df.sort_values("pct_rank").reset_index(drop=True)

features = get_features(df)

df = df[df["cell_type"] == "markdown"]
df = df.drop(["rank", "parent_id", "cell_type"], axis=1).dropna()

Concat:   0%|          | 0/10000 [00:00<?, ?it/s]

  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
  pd.r

Features:   0%|          | 0/10000 [00:00<?, ?it/s]

In [6]:
df.to_csv("data.csv")
with open("features.json", "w") as file:
    json.dump(features, file)

In [7]:
df = shuffle(df, random_state=RANDOM_STATE)

for fold, (_, split) in enumerate(
    GroupKFold(K_FOLDS).split(df, groups=df["ancestor_id"])
):
    print("=" * 36, f"Fold {fold}", "=" * 36)
    fold_dir = f"tfrec/{fold}"
    if not os.path.exists(fold_dir):
        os.mkdir(fold_dir)

    data = tokenize(df.iloc[split], features)

    np.savez_compressed(
        f"raw/{fold}.npz",
        input_ids=data["input_ids"],
        attention_mask=data["attention_mask"],
        features=data["features"],
        labels=data["labels"],
    )

    for split, index in tqdm(
        enumerate(np.array_split(np.arange(data["labels"].shape[0]), FILES_PER_FOLD)),
        desc=f"Saving",
        total=FILES_PER_FOLD,
    ):
        serialize(
            input_ids=data["input_ids"][index],
            attention_mask=data["attention_mask"][index],
            features=data["features"][index],
            labels=data["labels"][index],
            path=os.path.join(fold_dir, f"{split:02d}-{len(index):06d}.tfrec"),
        )



Tokens:   0%|          | 0/31343 [00:00<?, ?it/s]

Saving:   0%|          | 0/16 [00:00<?, ?it/s]



Tokens:   0%|          | 0/31343 [00:00<?, ?it/s]

Saving:   0%|          | 0/16 [00:00<?, ?it/s]



Tokens:   0%|          | 0/31342 [00:00<?, ?it/s]

Saving:   0%|          | 0/16 [00:00<?, ?it/s]



Tokens:   0%|          | 0/31342 [00:00<?, ?it/s]

Saving:   0%|          | 0/16 [00:00<?, ?it/s]



Tokens:   0%|          | 0/31342 [00:00<?, ?it/s]

Saving:   0%|          | 0/16 [00:00<?, ?it/s]