<img src="https://storage.googleapis.com/kaggle-competitions/kaggle/35887/logos/header.png?t=2022-05-09-22-33-02">

<h1><center>[3/3] AI4Code TensorFlow TPU with CodeBert - Inference</center></h1>

This is the final part of my **AI4Code TensorFlow TPU with CodeBert** series:

* [1/3] [Data Preparation][1] (~5 hours)
* [2/3] [TPU Training][2] (~4 hours)
* **[3/3] GPU Inference ← (you're here)**

This is basically a translation of **[Khoi Nguyen's][3]** work [[1][4], [2][5]] from PyTorch to TensorFlow, with minor changes and updates for TPU support. The **[original][4]** PyTorch version takes up to 40 hours per epoch on a Kaggle GPU, whereas **[my version][2]** takes only 50 minutes per epoch on a Kaggle TPU, so it's lightning fast ⚡.

### About Solution

- Input data: markdown + code context (512 tokens) + features
    - Markdown (up to 64 tokens)
    - Code context (all code cells if the notebook has at most 20, otherwise 20 cells sampled across the notebook; each cell truncated to 23 tokens)
    - Features: markdown cells to total cells ratio (appended to backbone outputs)
- Model and hyperparameters
    - CodeBert Base model
    - L1 loss (MAE)
    - AdamW optimizer
    - Learning rate schedule with warmup and linear decay
    - Total 5 epochs

### Input Data

- **[AI4Code-CodeBert-Weights][6]**: output from **[TPU Training][2]** step
- **[codebert-base][7]**: `microsoft/codebert-base` model saved locally (workaround for the requirement that internet access be turned off)

[1]: https://www.kaggle.com/nickuzmenkov/ai4code-tf-tpu-codebert-data-preparation
[2]: https://www.kaggle.com/nickuzmenkov/ai4code-tf-tpu-codebert-training
[3]: https://www.kaggle.com/suicaokhoailang
[4]: https://github.com/suicao/ai4code-baseline/tree/main/code
[5]: https://www.kaggle.com/code/suicaokhoailang/stronger-baseline-with-code-cells
[6]: https://www.kaggle.com/datasets/nickuzmenkov/ai4code-codebert-weights
[7]: https://www.kaggle.com/datasets/leolu1998/codebert-base

# Setup

In [None]:
import glob
import os
from typing import List

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from tqdm.notebook import tqdm

In [None]:
# Number of samples per batch when the prediction dataset is built.
BATCH_SIZE = 32
# Number of chunks the markdown rows are split into; each chunk is tokenized
# and predicted separately.
SLICES = 8
# Token budget for the markdown cell; also reused as a character cap when the
# cell sources are sliced during data collection.
MD_MAX_LEN = 64
# Total token length of one model input (markdown tokens + code context).
TOTAL_MAX_LEN = 512
# Distribution strategy currently in effect, as reported by TensorFlow.
STRATEGY = tf.distribute.get_strategy()
# Local directory holding the pretrained backbone and tokenizer files.
BASE_MODEL = "../input/codebert-base/codebert-base"
# Tokenizer loaded from the same local directory as the backbone.
TOKENIZER = transformers.AutoTokenizer.from_pretrained(BASE_MODEL)
# Root directory of the competition data.
INPUT_PATH = "../input/AI4Code"

In [None]:
def read_notebook(path: str) -> pd.DataFrame:
    """Load one notebook JSON file into a DataFrame indexed by cell id.

    Args:
        path: Path to the notebook ``.json`` file.

    Returns:
        A frame with ``cell_type`` (categorical) and ``source`` (string)
        columns plus an ``id`` column holding the file name up to its first
        dot. The index is named ``cell_id``.
    """
    notebook_id = os.path.basename(path).split(".")[0]
    column_types = {"cell_type": "category", "source": "str"}
    cells = pd.read_json(path, dtype=column_types)
    cells = cells.assign(id=notebook_id)
    return cells.rename_axis("cell_id")


def clean_code(cell: str) -> str:
    """Turn literal backslash-n sequences in a cell into real newlines.

    The argument is converted with ``str()`` first, so non-string values are
    accepted as well.
    """
    pieces = str(cell).split("\\n")
    return "\n".join(pieces)


def sample_cells(cells: List[str], n: int) -> List[str]:
    """Clean the cells and pick at most about ``n`` of them at a regular stride.

    Args:
        cells: Cell sources in notebook order.
        n: Number of cells to sample.

    Returns:
        All cleaned cells when there are no more than ``n`` of them; otherwise
        the cells found at rounded multiples of ``len(cells) / n``. If the
        final cell's text is not among the picks, it replaces the last pick.
    """
    cleaned = [clean_code(cell) for cell in cells]
    total = len(cleaned)
    if total <= n:
        return cleaned

    stride = total / n
    picked = []
    cursor = 0
    while True:
        slot = int(np.round(cursor))
        if slot >= total:
            break
        picked.append(cleaned[slot])
        cursor += stride

    # Make sure the notebook's final cell is represented in the sample.
    final_cell = cleaned[-1]
    if final_cell not in picked:
        picked[-1] = final_cell
    return picked


def get_features(df: pd.DataFrame) -> dict:
    """Collect per-notebook cell counts and a sample of code cells.

    Args:
        df: Frame with at least ``id``, ``cell_type`` and ``source`` columns.

    Returns:
        A dict keyed by notebook id. Each value holds ``total_code`` and
        ``total_md`` (row counts per cell type) and ``codes`` (up to 20
        sampled code-cell sources).
    """
    features = {}
    for notebook_id, cells in tqdm(df.groupby("id"), desc="Features"):
        markdown_cells = cells[cells.cell_type == "markdown"]
        code_cells = cells[cells.cell_type == "code"]
        features[notebook_id] = {
            "total_code": code_cells.shape[0],
            "total_md": markdown_cells.shape[0],
            "codes": sample_cells(code_cells.source.values, 20),
        }
    return features


def _join_and_pad(
    head: List[int], tails: List[List[int]], fill: int, length: int
) -> List[int]:
    """Append each tail minus its last element to head, then fit to ``length``.

    The result is truncated to ``length`` and, when shorter, right-padded
    with ``fill``.
    """
    merged = list(head)
    for tail in tails:
        merged += tail[:-1]
    merged = merged[:length]
    return merged + [fill] * (length - len(merged))


def tokenize(df: pd.DataFrame, fts: dict) -> dict:
    """Build fixed-length model inputs for every row of ``df``.

    Each row's ``source`` is encoded to ``MD_MAX_LEN`` tokens and followed by
    the encodings of the notebook's sampled code cells (23 tokens each, last
    token dropped). The sequence is cut or padded to ``TOTAL_MAX_LEN``.

    Args:
        df: Rows to encode; each needs ``id`` and ``source``.
        fts: Per-notebook dict providing ``codes``, ``total_md`` and
            ``total_code`` for each ``id``.

    Returns:
        A dict with ``input_ids`` and ``attention_mask`` (int32 arrays of
        shape ``(len(df), TOTAL_MAX_LEN)``) and ``features`` (float32 array
        of shape ``(len(df),)``) holding the markdown-to-total cell ratio.
    """
    n_rows = len(df)
    input_ids = np.zeros((n_rows, TOTAL_MAX_LEN), dtype=np.int32)
    attention_mask = np.zeros((n_rows, TOTAL_MAX_LEN), dtype=np.int32)
    features = np.zeros((n_rows,), dtype=np.float32)

    rows = df.reset_index(drop=True).iterrows()
    for i, row in tqdm(rows, desc="Tokens", total=n_rows):
        row_fts = fts[row.id]

        markdown_enc = TOKENIZER.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=MD_MAX_LEN,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        # An empty string stands in when the notebook has no code cells.
        code_texts = [str(code) for code in row_fts["codes"]] or [""]
        code_enc = TOKENIZER.batch_encode_plus(
            code_texts,
            add_special_tokens=True,
            max_length=23,
            padding="max_length",
            truncation=True,
        )

        input_ids[i] = _join_and_pad(
            markdown_enc["input_ids"],
            code_enc["input_ids"],
            TOKENIZER.pad_token_id,
            TOTAL_MAX_LEN,
        )
        # NOTE(review): the mask tail is filled with pad_token_id rather than
        # 0. If that id is non-zero for this tokenizer, padded positions are
        # marked as attended -- confirm this matches the training
        # preprocessing before changing it.
        attention_mask[i] = _join_and_pad(
            markdown_enc["attention_mask"],
            code_enc["attention_mask"],
            TOKENIZER.pad_token_id,
            TOTAL_MAX_LEN,
        )

        md_ratio = row_fts["total_md"] / (
            row_fts["total_md"] + row_fts["total_code"]
        )
        # A ratio of exactly 0 is stored as 1.
        features[i] = md_ratio or 1

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "features": features,
    }


def get_ranks(base: pd.Series, derived: List[str]) -> List[int]:
    """Return the position in ``base`` of every element of ``derived``.

    Args:
        base: Reference ordering. A ``pd.Series`` or any other iterable
            (for example a plain list) is accepted; for a Series the values,
            not the index labels, are used.
        derived: Elements to look up in ``base``.

    Returns:
        Zero-based positions (first occurrence) in the order of ``derived``.

    Raises:
        ValueError: If an element of ``derived`` is not present in ``base``.
    """
    # ``Series.index`` is an attribute, not a lookup method, so materialise
    # the values as a list before calling ``list.index``.
    ordered = list(base)
    return [ordered.index(item) for item in derived]


def get_dataset(
    input_ids: np.array,
    attention_mask: np.array,
    feature: np.array,
) -> tf.data.Dataset:
    """Wrap the tokenized arrays in a batched, prefetching dataset.

    Args:
        input_ids: Token ids, one row per sample.
        attention_mask: Attention mask, one row per sample.
        feature: Extra feature, one entry per sample.

    Returns:
        A dataset of dicts keyed ``input_ids``, ``attention_mask`` and
        ``feature``, batched by ``BATCH_SIZE``.
    """
    tensors = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "feature": feature,
    }
    batched = tf.data.Dataset.from_tensor_slices(tensors).batch(BATCH_SIZE)
    return batched.prefetch(tf.data.AUTOTUNE)


def get_model() -> tf.keras.Model:
    """Assemble the regression model: backbone + extra feature + linear head.

    The first-position vector of the backbone's first output is concatenated
    with the scalar ``feature`` input and passed to a single-unit linear
    ``Dense`` layer.

    Returns:
        A Keras model taking ``input_ids``, ``attention_mask`` and
        ``feature`` and producing one float32 value per sample.
    """

    def _token_input(name: str):
        # Both token-level inputs share the same shape and dtype.
        return tf.keras.layers.Input(
            shape=(TOTAL_MAX_LEN,), dtype=tf.int32, name=name
        )

    backbone = transformers.TFAutoModel.from_pretrained(BASE_MODEL)
    input_ids = _token_input("input_ids")
    attention_mask = _token_input("attention_mask")
    feature = tf.keras.layers.Input(shape=(1,), dtype=tf.float32, name="feature")

    backbone_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
    sequence_output = backbone(backbone_inputs)[0]
    first_token = sequence_output[:, 0, :]
    head_input = tf.concat([first_token, feature], axis=1)

    head = tf.keras.layers.Dense(1, activation="linear", dtype="float32")
    return tf.keras.Model(
        inputs=[input_ids, attention_mask, feature],
        outputs=head(head_input),
    )

# Collect Data

In [None]:
# Gather every test notebook and stack them into one frame, grouped by
# notebook id while keeping the cells of each notebook in file order.
test_pattern = os.path.join(INPUT_PATH, "test", "*.json")
paths = glob.glob(test_pattern)
notebooks = [read_notebook(x) for x in tqdm(paths, desc="Concat")]
df = pd.concat(notebooks)
df = df.set_index("id", append=True).swaplevel()
df = df.sort_index(level="id", sort_remaining=False)
df = df.reset_index()

# Cap every cell source at MD_MAX_LEN characters.
df["source"] = df["source"].str.slice(0, MD_MAX_LEN)

# Position of each cell among cells of the same type in its notebook, and
# that position expressed as a percentile rank.
by_notebook_and_type = ["id", "cell_type"]
df["rank"] = df.groupby(by_notebook_and_type).cumcount()
df["pct_rank"] = df.groupby(by_notebook_and_type)["rank"].rank(pct=True)

fts = get_features(df)

# Run Inference

In [None]:
# Build the model and restore the trained weights under the active strategy.
with STRATEGY.scope():
    model = get_model()
    model.load_weights("../input/ai4code-codebert-weights/model_0.h5")

# Only markdown cells are scored; they are processed in SLICES chunks so that
# each chunk is tokenized right before it is predicted.
markdown_rows = df[df["cell_type"] == "markdown"]
chunk_outputs = [np.array([], dtype=np.float32)]

for chunk in tqdm(np.array_split(markdown_rows, SLICES), total=SLICES):
    if not chunk.empty:
        data = tokenize(chunk, fts)
        dataset = get_dataset(
            data["input_ids"], data["attention_mask"], data["features"]
        )
        chunk_outputs.append(model.predict(dataset).reshape(-1))

# Flat vector of predictions, in the same order as the markdown rows.
predict = np.concatenate(chunk_outputs)

# Save Submission

In [None]:
# Replace the markdown cells' percentile rank with the model output, then
# order every notebook's cells by that value and write one row per notebook.
is_markdown = df["cell_type"] == "markdown"
df.loc[is_markdown, "pct_rank"] = predict
ordered_cells = df.sort_values("pct_rank")
df = ordered_cells.groupby("id")["cell_id"].apply(" ".join)
df.name = "cell_order"
df.to_csv("submission.csv")

# Next Steps

Tweak previous steps and beat my score!

* [1/3] [Data Preparation][1] (~5 hours)
* [2/3] [TPU Training][2] (~4 hours)
* <span style="color:lightgray">[3/3] GPU Inference ← (you're here)</span>


[1]: https://www.kaggle.com/nickuzmenkov/ai4code-tf-tpu-codebert-data-preparation
[2]: https://www.kaggle.com/nickuzmenkov/ai4code-tf-tpu-codebert-training