In [None]:
!mkdir 'raw' 'tfrec'

In [None]:
import glob
import json
import os
from typing import List

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from sklearn.model_selection import GroupKFold
from sklearn.utils import shuffle
from tqdm.notebook import tqdm

In [None]:
# Preprocessing configuration.
RANDOM_STATE = 42  # seed for the shuffle that precedes the fold split
MD_MAX_LEN = 64  # token budget for the markdown cell itself
TOTAL_MAX_LEN = 512  # width of every stored input_ids / attention_mask row
K_FOLDS = 5  # number of GroupKFold splits
FILES_PER_FOLD = 16  # TFRecord shards written per fold
# Cap the number of notebooks in interactive sessions. `.get` avoids a
# KeyError when the variable is absent (e.g. outside Kaggle), in which case
# the full dataset is used.
LIMIT = 1_000 if os.environ.get("KAGGLE_KERNEL_RUN_TYPE") == "Interactive" else None
MODEL_NAME = "microsoft/codebert-base"
TOKENIZER = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
INPUT_PATH = "../input/AI4Code"

In [None]:
def read_notebook(path: str) -> pd.DataFrame:
    """Load one notebook JSON file into a frame of its cells.

    The returned frame carries an ``id`` column holding the file name up to
    its first dot, and its index is named ``cell_id``.
    """
    notebook_id = os.path.basename(path).split(".")[0]
    cells = pd.read_json(path, dtype={"cell_type": "category", "source": "str"})
    cells = cells.assign(id=notebook_id)
    return cells.rename_axis("cell_id")


def clean_code(cell: str) -> str:
    """Return ``cell`` as a string with literal backslash-n pairs turned into newlines."""
    text = str(cell)
    # Splitting on the two-character escape and re-joining with a real
    # newline replaces every occurrence.
    return "\n".join(text.split("\\n"))


def sample_cells(cells: List[str], n: int) -> List[str]:
    """Pick roughly ``n`` evenly spaced cells, always keeping the last one.

    Every cell is first passed through ``clean_code``. If ``n`` is at least
    the number of cells, all cleaned cells are returned. Otherwise cells are
    taken at a fractional stride of ``len(cells) / n``, and the final pick is
    replaced by the last cell when the last cell's text is not already among
    the picks.

    Args:
        cells: Cell sources in notebook order.
        n: Maximum number of cells to return.

    Returns:
        The sampled, cleaned cell sources; an empty list when ``n <= 0``.
    """
    cells = [clean_code(cell) for cell in cells]
    if n >= len(cells):
        return cells
    if n <= 0:
        # Nothing was requested. Without this guard n == 0 divides by zero
        # and a negative n makes the stride negative so the loop never ends.
        return []

    results = []
    step = len(cells) / n
    idx = 0
    while int(np.round(idx)) < len(cells):
        results.append(cells[int(np.round(idx))])
        idx += step
    # Membership is checked by value, so an identical earlier cell counts as
    # the last cell already being present.
    if cells[-1] not in results:
        results[-1] = cells[-1]
    return results


def get_features(df: pd.DataFrame) -> dict:
    """Build per-notebook features keyed by notebook ``id``.

    For each notebook the entry holds the number of code cells
    (``total_code``), the number of markdown cells (``total_md``) and up to
    20 sampled code-cell sources (``codes``).
    """
    features = {}
    for notebook_id, notebook_cells in tqdm(df.groupby("id"), desc="Features"):
        markdown_cells = notebook_cells[notebook_cells.cell_type == "markdown"]
        code_cells = notebook_cells[notebook_cells.cell_type == "code"]
        features[notebook_id] = {
            "total_code": code_cells.shape[0],
            "total_md": markdown_cells.shape[0],
            "codes": sample_cells(code_cells.source.values, 20),
        }
    return features


def tokenize(df: pd.DataFrame, fts: dict) -> dict:
    """Encode every row of ``df`` into fixed-width model inputs.

    Each row's ``source`` is encoded to ``MD_MAX_LEN`` tokens, followed by the
    encodings of the notebook's sampled code cells (each encoded to 23 tokens
    with the last token dropped). The sequence is truncated or padded to
    ``TOTAL_MAX_LEN``.

    Args:
        df: Rows with ``id``, ``source`` and ``pct_rank`` columns.
        fts: Mapping from notebook id to a dict with ``total_md``,
            ``total_code`` and ``codes`` entries (as built by
            ``get_features``).

    Returns:
        A dict of NumPy arrays with one entry per row of ``df``:
        ``input_ids`` and ``attention_mask`` of shape
        ``(len(df), TOTAL_MAX_LEN)``, and ``features`` and ``labels`` of shape
        ``(len(df),)``.
    """
    input_ids = np.zeros((len(df), TOTAL_MAX_LEN), dtype=np.int32)
    attention_mask = np.zeros((len(df), TOTAL_MAX_LEN), dtype=np.int32)
    features = np.zeros((len(df),), dtype=np.float32)
    labels = np.zeros((len(df),), dtype=np.float32)

    # The index is reset so that `i` is the positional row number used to
    # address the preallocated arrays.
    for i, row in tqdm(
        df.reset_index(drop=True).iterrows(), desc="Tokens", total=len(df)
    ):
        row_fts = fts[row.id]

        inputs = TOKENIZER.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=MD_MAX_LEN,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        # An empty list of codes is replaced by a single empty string so the
        # batch encoder always receives at least one item.
        code_inputs = TOKENIZER.batch_encode_plus(
            [str(x) for x in row_fts["codes"]] or [""],
            add_special_tokens=True,
            max_length=23,
            padding="max_length",
            truncation=True,
        )

        ids = inputs["input_ids"]
        for x in code_inputs["input_ids"]:
            ids.extend(x[:-1])
        ids = ids[:TOTAL_MAX_LEN]
        if len(ids) != TOTAL_MAX_LEN:
            ids = ids + [
                TOKENIZER.pad_token_id,
            ] * (TOTAL_MAX_LEN - len(ids))

        # The mask must be rebuilt for every row, whether or not `ids` needed
        # padding; otherwise it is undefined or left over from a previous row.
        mask = inputs["attention_mask"]
        for x in code_inputs["attention_mask"]:
            mask.extend(x[:-1])
        mask = mask[:TOTAL_MAX_LEN]
        if len(mask) != TOTAL_MAX_LEN:
            # Padded positions must not be attended to, so the mask is padded
            # with 0 rather than the tokenizer's pad token id.
            mask = mask + [0] * (TOTAL_MAX_LEN - len(mask))

        input_ids[i] = ids
        attention_mask[i] = mask
        # NOTE(review): `or 1` binds to the whole ratio, so it only replaces
        # a ratio of exactly 0 and does not guard a zero denominator —
        # confirm that is the intent.
        features[i] = (
            row_fts["total_md"] / (row_fts["total_md"] + row_fts["total_code"]) or 1
        )
        labels[i] = row.pct_rank

    # Return only after every row has been processed.
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "features": features,
        "labels": labels,
    }


def get_ranks(base: List[str], derived: List[str]) -> List[int]:
    """Return the position in ``base`` of every item of ``derived``.

    Args:
        base: Reference ordering of (hashable) items.
        derived: Items to look up, in the order the ranks should be returned.

    Returns:
        For each item of ``derived``, the index of its first occurrence in
        ``base``.

    Raises:
        ValueError: If an item of ``derived`` is not present in ``base``.
    """
    # One pass builds the lookup instead of a linear `list.index` scan per
    # item; `setdefault` keeps the first occurrence, as `list.index` does.
    first_position = {}
    for position, item in enumerate(base):
        first_position.setdefault(item, position)
    try:
        return [first_position[item] for item in derived]
    except KeyError as err:
        # Keep the exception type that `list.index` raised.
        raise ValueError(f"{err.args[0]!r} is not in list") from None


def _serialize_sample(
    input_ids: np.array,
    attention_mask: np.array,
    feature: np.float64,
    label: np.float64,
) -> bytes:
    """Pack one sample into a serialized ``tf.train.Example``.

    ``input_ids`` and ``attention_mask`` are stored as int64 lists;
    ``feature`` and ``label`` are stored as single-element float lists.
    """

    def _int64_list(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    def _float_scalar(value):
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    fields = {
        "input_ids": _int64_list(input_ids),
        "attention_mask": _int64_list(attention_mask),
        "feature": _float_scalar(feature),
        "label": _float_scalar(label),
    }
    example = tf.train.Example(features=tf.train.Features(feature=fields))
    return example.SerializeToString()


def serialize(
    input_ids: np.array,
    attention_mask: np.array,
    features: np.array,
    labels: np.array,
    path: str,
) -> None:
    """Write the samples to a TFRecord file at ``path``.

    The four arrays are iterated in lockstep; each tuple is serialized with
    ``_serialize_sample`` and written as one record.
    """
    samples = zip(input_ids, attention_mask, features, labels)
    with tf.io.TFRecordWriter(path) as writer:
        for sample in samples:
            writer.write(_serialize_sample(*sample))

In [None]:
# Collect the training notebook files and optionally keep only the first
# LIMIT of them.
# NOTE(review): glob.glob does not guarantee an order, so which notebooks
# survive the LIMIT cut may vary between environments.
paths = glob.glob(os.path.join(INPUT_PATH, "train", "*.json"))
if LIMIT is not None:
    paths = paths[:LIMIT]
# Stack all notebooks into one frame indexed by (id, cell_id); sorting only on
# the "id" level keeps the cells of each notebook in their loaded order.
df = (
    pd.concat([read_notebook(x) for x in tqdm(paths, desc="Concat")])
    .set_index("id", append=True)
    .swaplevel()
    .sort_index(level="id", sort_remaining=False)
)

# Read the per-notebook cell order and split the space-separated string into
# a list of cell ids. `DataFrame.squeeze("columns")` replaces the deprecated
# `squeeze=True` keyword of `read_csv` (removed in pandas 2.0) and yields the
# same Series when the file has a single data column.
df_orders = (
    pd.read_csv(
        os.path.join(INPUT_PATH, "train_orders.csv"),
        index_col="id",
    )
    .squeeze("columns")
    .str.split()
)
# Pair each loaded notebook's ordering with the list of its cell ids in the
# order they appear in `df`; the right join keeps only loaded notebooks.
df_orders_ = df_orders.to_frame().join(
    df.reset_index("cell_id").groupby("id")["cell_id"].apply(list),
    how="right",
)

# For every notebook, look up the position of each loaded cell within the
# notebook's ordering.
ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {"cell_id": cell_id, "rank": get_ranks(cell_order, cell_id)}
# Explode the per-notebook lists into one row per (id, cell_id).
df_ranks = (
    pd.DataFrame.from_dict(ranks, orient="index")
    .rename_axis("id")
    .apply(pd.Series.explode)
    .set_index("cell_id", append=True)
)

df_ancestors = pd.read_csv(
    os.path.join(INPUT_PATH, "train_ancestors.csv"), index_col="id"
)
# Attach the rank of each cell and the ancestor columns of its notebook.
df = (
    df.reset_index()
    .merge(df_ranks, on=["id", "cell_id"])
    .merge(df_ancestors, on=["id"])
)

# Normalise the rank by the number of cells in the notebook.
df["pct_rank"] = df["rank"] / df.groupby("id")["cell_id"].transform("count")
df = df.sort_values("pct_rank").reset_index(drop=True)

# Features are computed while both code and markdown cells are still present.
features = get_features(df)

# Keep only markdown rows from here on, and drop rows with missing values.
df = df[df["cell_type"] == "markdown"]
df = df.drop(["rank", "parent_id", "cell_type"], axis=1).dropna()

In [None]:
# Persist the markdown rows and the per-notebook features to the working
# directory.
df.to_csv("data.csv")
with open("features.json", "w") as file:
    json.dump(features, file)

In [None]:
# Shuffle the rows once, then split them into K_FOLDS folds such that rows
# sharing an `ancestor_id` always land in the same fold.
df = shuffle(df, random_state=RANDOM_STATE)

# Make sure the output directory for the raw arrays exists.
os.makedirs("raw", exist_ok=True)

for fold, (_, fold_rows) in enumerate(
    GroupKFold(K_FOLDS).split(df, groups=df["ancestor_id"])
):
    print("=" * 36, f"Fold {fold}", "=" * 36)
    fold_dir = f"tfrec/{fold}"
    # exist_ok makes the cell safe to re-run and avoids the separate
    # check-then-create step.
    os.makedirs(fold_dir, exist_ok=True)

    # Only the held-out part of each split is tokenized, so every row ends up
    # in exactly one fold.
    data = tokenize(df.iloc[fold_rows], features)

    np.savez_compressed(
        f"raw/{fold}.npz",
        input_ids=data["input_ids"],
        attention_mask=data["attention_mask"],
        features=data["features"],
        labels=data["labels"],
    )

    # Write the fold as FILES_PER_FOLD shards. The file name carries the shard
    # number and the number of samples in the shard. The loop variable is
    # named `shard` so it does not rebind the fold indices above.
    for shard, index in tqdm(
        enumerate(np.array_split(np.arange(data["labels"].shape[0]), FILES_PER_FOLD)),
        desc="Saving",
        total=FILES_PER_FOLD,
    ):
        serialize(
            input_ids=data["input_ids"][index],
            attention_mask=data["attention_mask"][index],
            features=data["features"][index],
            labels=data["labels"][index],
            path=os.path.join(fold_dir, f"{shard:02d}-{len(index):06d}.tfrec"),
        )

In [None]:
import os
from typing import List

import numpy as np
import tensorflow as tf
import transformers
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import KFold

In [None]:
# List the contents of the dataset bucket.
# NOTE(review): GCS_PATH is assigned in a later cell of this notebook, so on
# a fresh kernel this cell runs before the variable exists — confirm the
# intended cell order.
!gsutil ls $GCS_PATH

In [None]:
# NOTE(review): `cmd` looks like a leftover placeholder shell command —
# confirm whether this cell is still needed.
!cmd

In [None]:
# Training configuration.
RANDOM_STATE = 42
N_SPLITS = 5  # number of fold directories to split into train / validation
TOTAL_MAX_LEN = 512
BASE_MODEL = "microsoft/codebert-base"
GCS_PATH = KaggleDatasets().get_gcs_path("AI4Code")
EPOCHS = 5
LR = 3e-5  # base learning rate passed to the warmup / decay schedule
WARMUP_RATE = 0.05  # fraction of all training steps used for warmup
# Keras verbosity: 1 in interactive sessions, 2 otherwise. `.get` avoids a
# KeyError when the variable is absent (e.g. outside Kaggle).
VERBOSE = 1 if os.environ.get("KAGGLE_KERNEL_RUN_TYPE") == "Interactive" else 2

# Prefer a TPU when one can be resolved and initialised; any failure during
# that setup falls back to TensorFlow's current default strategy with a small
# batch size.
try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    STRATEGY = tf.distribute.experimental.TPUStrategy(TPU)
    # The global batch scales with the number of replicas.
    BATCH_SIZE = 64 * STRATEGY.num_replicas_in_sync
except Exception:
    TPU = None
    STRATEGY = tf.distribute.get_strategy()
    BATCH_SIZE = 4

# Report the resulting runtime setup.
print("TensorFlow", tf.__version__)
print("Using GPU/CPU" if TPU is None else "Using TPU v3-8")
print("Batch size:", BATCH_SIZE)

In [None]:
# Cross-validation over the fold directories: KFold splits the fold numbers
# 0..N_SPLITS-1 into training folds and one validation fold per iteration.
# NOTE(review): count_samples, get_dataset, get_model and WarmupLinearDecay
# are not defined in the visible cells — confirm they are defined before this
# cell runs.
for i, (train_index, val_index) in enumerate(KFold(n_splits=N_SPLITS).split(range(N_SPLITS))):
    # Re-initialise the TPU system at the start of every iteration.
    if TPU is not None:
        tf.tpu.experimental.initialize_tpu_system(TPU)

    # Flatten the per-fold lists of TFRecord shard paths into one array.
    train_filenames = np.ravel(
        [
            tf.io.gfile.glob(os.path.join(GCS_PATH, "tfrec", str(x), "*.tfrec"))
            for x in train_index
        ]
    )
    steps_per_epoch = count_samples(train_filenames) // BATCH_SIZE
    train_dataset = get_dataset(train_filenames)

    val_filenames = np.ravel(
        [
            tf.io.gfile.glob(os.path.join(GCS_PATH, "tfrec", str(x), "*.tfrec"))
            for x in val_index
        ]
    )
    validation_steps = count_samples(val_filenames) // BATCH_SIZE
    val_dataset = get_dataset(val_filenames, ordered=True, repeated=False, cached=True)

    # The model and optimizer are created inside the distribution strategy's
    # scope.
    with STRATEGY.scope():
        model = get_model()

        # The schedule spans all training steps; the first WARMUP_RATE
        # fraction of them is passed as the warmup length.
        total_steps = steps_per_epoch * EPOCHS
        warmup_steps = int(WARMUP_RATE * total_steps)

        optimizer = transformers.AdamWeightDecay(
            learning_rate=WarmupLinearDecay(
                base_learning_rate=LR,
                warmup_steps=warmup_steps,
                total_steps=total_steps,
            ),
            weight_decay_rate=0.01,
            exclude_from_weight_decay=[
                "bias",
                "LayerNorm.bias",
                "LayerNorm.weight",
            ],
        )
        model.compile(loss="mae", optimizer=optimizer)

    model.fit(
        train_dataset,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_dataset,
        validation_steps=validation_steps,
        epochs=EPOCHS,
        verbose=VERBOSE,
    )

    model.save_weights(f"model_{i}.h5")
    # Stop after the first iteration: only the model for i == 0 is trained
    # and saved.
    break