# Intro

This notebook is basically a translation of __[AI4Code Pytorch DistilBert Baseline][0]__ to help out TensorFlow users. I shortened the original work leaving only its bare bones - __[Data Preparation][1]__, __[Model Training][2]__, and __[Inference][3]__.

### TPU usage

This notebook supports TPU training: just select TPU v3-8 as the accelerator to train your model on all of the competition data and get a 5x performance boost!

__Warning__: according to the competition rules, you won't be able to submit your notebook if it uses a TPU. As a workaround, train your model in a separate notebook.

# Setup

[0]: https://www.kaggle.com/code/aerdem4/ai4code-pytorch-distilbert-baseline
[1]: #data_preparation
[2]: #training
[3]: #inference

In [None]:
import glob
import json
import os
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from IPython.display import display
from sklearn.utils import shuffle
from sklearn.model_selection import GroupKFold
from tqdm.notebook import tqdm

In [None]:
# Locations of the competition data and of the pretrained DistilBERT checkpoint.
DATA_PATH = "../input/AI4Code"
BASE_MODEL = "../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased"
N_SPLITS = 5  # number of GroupKFold folds
SEQ_LEN = 128  # every cell is padded/truncated to this many tokens
RANDOM_STATE = 42  # seed used when shuffling the training arrays

# Try to attach to a TPU; if anything in that sequence fails, fall back to the
# default distribution strategy with a smaller batch and a capped dataset.
try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    STRATEGY = tf.distribute.experimental.TPUStrategy(TPU)
    BATCH_SIZE = 128 * STRATEGY.num_replicas_in_sync
    # No cap on TPU: use every training notebook. LIMIT must be defined on this
    # path too, otherwise the data-preparation cell fails with a NameError.
    LIMIT = None
except Exception:
    TPU = None
    STRATEGY = tf.distribute.get_strategy()
    BATCH_SIZE = 32
    # Maximum number of training notebooks to load when no TPU is available.
    LIMIT = 10_000

print("TensorFlow", tf.__version__)

if TPU is not None:
    print("Using TPU v3-8")
else:
    print("Using GPU/CPU")

print("Batch size:", BATCH_SIZE)

In [None]:
def read_notebook(path: str) -> pd.DataFrame:
    """Load one notebook JSON file into a DataFrame.

    The parsed JSON is passed straight to ``pd.DataFrame``; an ``id`` column
    holding the file name without its extension is added to every row.

    Args:
        path: Path to the notebook ``.json`` file.

    Returns:
        DataFrame built from the JSON content plus the ``id`` column.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # encoding, which would garble non-ASCII cell text on some systems.
    with open(path, encoding="utf-8") as file:
        df = pd.DataFrame(json.load(file))
    df["id"] = os.path.splitext(os.path.basename(path))[0]
    return df


def expand_order(row: Tuple[str, str]) -> pd.DataFrame:
    """Turn one (notebook id, space-separated cell order) pair into long form.

    Args:
        row: Pair whose first item is the notebook id and whose second item is
            the cell ids joined by single spaces, in their correct order.

    Returns:
        DataFrame with one row per cell and the columns ``id``, ``cell_id``,
        ``rank`` (0-based position) and ``pct_rank`` (``rank`` divided by the
        number of cells).
    """
    notebook_id, cell_order = row[0], row[1]
    cell_ids = cell_order.split(" ")
    n_cells = len(cell_ids)

    expanded = pd.DataFrame(
        {
            "id": [notebook_id] * n_cells,
            "cell_id": cell_ids,
            "rank": range(n_cells),
        }
    )
    # Normalise the position by the cell count so the target lies in [0, 1).
    expanded["pct_rank"] = expanded["rank"] / n_cells
    return expanded


def tokenize(source: pd.Series) -> Tuple[np.ndarray, np.ndarray]:
    """Tokenize every text in ``source`` to fixed-length id and mask arrays.

    Each text is encoded with the tokenizer loaded from ``BASE_MODEL``, with
    special tokens added, and padded or truncated to exactly ``SEQ_LEN``
    tokens.

    Args:
        source: Series of cell texts to encode.

    Returns:
        Tuple ``(input_ids, attention_mask)``; both are ``int32`` arrays of
        shape ``(len(source), SEQ_LEN)``.
    """
    tokenizer = transformers.AutoTokenizer.from_pretrained(BASE_MODEL, do_lower_case=True)

    n_rows = len(source)
    input_ids = np.zeros((n_rows, SEQ_LEN), dtype="int32")
    attention_mask = np.zeros((n_rows, SEQ_LEN), dtype="int32")

    for i, text in enumerate(tqdm(source, total=n_rows)):
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`. Token type ids are not requested because only the ids
        # and the attention mask are read below.
        encoding = tokenizer(
            text,
            add_special_tokens=True,
            max_length=SEQ_LEN,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )
        input_ids[i] = encoding["input_ids"]
        attention_mask[i] = encoding["attention_mask"]

    return input_ids, attention_mask


def get_dataset(
    input_ids: np.ndarray,
    attention_mask: np.ndarray,
    labels: Optional[np.ndarray] = None,
    ordered: bool = False,
    repeated: bool = False,
) -> tf.data.Dataset:
    """Wrap tokenized arrays in a batched, prefetched ``tf.data.Dataset``.

    Args:
        input_ids: Token ids, one row per example.
        attention_mask: Attention mask rows matching ``input_ids``.
        labels: Optional targets. When given, the dataset yields
            ``(features, label)`` pairs; otherwise it yields features only.
        ordered: If True, keep the examples in their original order (no
            shuffling). Use this for validation and inference.
        repeated: If True, repeat the dataset indefinitely.

    Returns:
        Dataset of feature dicts with the keys ``"input_ids"`` and
        ``"attention_mask"`` (optionally paired with labels), batched by
        ``BATCH_SIZE``.
    """
    features = {"input_ids": input_ids, "attention_mask": attention_mask}
    if labels is not None:
        dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    else:
        dataset = tf.data.Dataset.from_tensor_slices(features)

    # Shuffle *before* repeating: repeating first lets the shuffle buffer mix
    # examples across epoch boundaries, so an example could be seen twice
    # before another one is seen at all.
    if not ordered:
        dataset = dataset.shuffle(1024)
    if repeated:
        dataset = dataset.repeat()

    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset


def get_model() -> tf.keras.Model:
    """Build and compile the DistilBERT regression model.

    The backbone is loaded from ``BASE_MODEL``. A single linear unit is applied
    to the position-0 vector of the backbone's first output, and the model is
    compiled with Adam (learning rate 5e-5) and mean squared error.

    Returns:
        Compiled Keras model with the named inputs ``input_ids`` and
        ``attention_mask`` (each of length ``SEQ_LEN``) and one float32 output.
    """
    backbone = transformers.TFDistilBertModel.from_pretrained(BASE_MODEL)

    def _token_input(name: str):
        # Both inputs share the same shape and dtype; only the name differs.
        return tf.keras.layers.Input(shape=(SEQ_LEN,), dtype=tf.int32, name=name)

    input_ids = _token_input("input_ids")
    attention_mask = _token_input("attention_mask")

    backbone_outputs = backbone(
        {"input_ids": input_ids, "attention_mask": attention_mask},
    )

    regression_head = tf.keras.layers.Dense(1, activation="linear", dtype="float32")
    # Keep only the vector at sequence position 0 of the first backbone output.
    first_token_state = backbone_outputs[0][:, 0, :]
    outputs = regression_head(first_token_state)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        loss=tf.keras.losses.MeanSquaredError(),
    )
    return model

# <a id="data_preparation">Data Preparation</a>

The idea behind this is simple:
    
1. Get only markdown cells from the training notebooks
2. Teach the model to predict relative position (or `pct_rank`, or normalized rank) of the markdown cell given only its text and no other context

In [None]:
# glob returns files in arbitrary order; sort so that the LIMIT subset (and
# therefore the trained model) is the same on every run.
paths = sorted(glob.glob(os.path.join(DATA_PATH, "train", "*.json")))
if LIMIT is not None:
    paths = paths[:LIMIT]

source_df = pd.concat([read_notebook(x) for x in tqdm(paths, total=len(paths))])

# Keep only markdown cells and turn the cell-id index into a regular column.
source_df = source_df[source_df["cell_type"] == "markdown"]
source_df = source_df.drop("cell_type", axis=1)
source_df = source_df.rename_axis("cell_id").reset_index()

order_df = pd.read_csv(os.path.join(DATA_PATH, "train_orders.csv"), index_col="id")
# Expand the orders only for notebooks that were actually loaded. The inner
# merge below would discard the others anyway, and expanding every notebook
# is slow when LIMIT keeps just a fraction of them.
order_df = order_df[order_df.index.isin(source_df["id"])]
order_df = pd.concat(
    [expand_order(row) for row in tqdm(order_df.itertuples(), total=len(order_df))]
)

ancestors_df = pd.read_csv(
    os.path.join(DATA_PATH, "train_ancestors.csv"),
    usecols=["id", "ancestor_id"],
    index_col="id",
)

# Attach each markdown cell's rank, then the ancestor id of its notebook.
df = source_df.merge(order_df, on=["id", "cell_id"]).merge(ancestors_df, on="id")
df = df.dropna()
display(df)

In [None]:
# Regression targets and the key used to group samples during cross-validation.
labels = df["pct_rank"].to_numpy()
groups = df["ancestor_id"].to_numpy()

# Encode the markdown text into fixed-length token id / attention mask arrays.
input_ids, attention_mask = tokenize(df["source"])

# Sanity check: all four arrays should have the same number of rows.
for _name, _array in (
    ("input_ids", input_ids),
    ("attention_mask", attention_mask),
    ("labels", labels),
    ("groups", groups),
):
    print(f"{_name}:", _array.shape)

# <a id="training">Model Training</a>

KFold ensembles can get you an extra boost. Just remove the `break` statement at the bottom to run all `N_SPLITS`.

In [None]:
# Shuffle all four arrays together with a fixed seed so the run is repeatable.
input_ids, attention_mask, labels, groups = shuffle(
    input_ids, attention_mask, labels, groups, random_state=RANDOM_STATE
)
# Split by group (the notebooks' ancestor ids) so that samples sharing a group
# never end up on both sides of a train/validation split.
kfold = GroupKFold(n_splits=N_SPLITS)

for i, (train_index, val_index) in enumerate(kfold.split(input_ids, labels, groups=groups)):
    # On TPU, re-initialize the TPU system at the start of every fold
    # (presumably to release state left over from the previous fold).
    if TPU is not None:
        tf.tpu.experimental.initialize_tpu_system(TPU)

    # Build a fresh model for this fold inside the distribution strategy scope.
    with STRATEGY.scope():
        model = get_model()
        model.summary()

    # The training data is shuffled and repeated indefinitely; the length of an
    # epoch is set by `steps_per_epoch` in `fit` below.
    train_dataset = get_dataset(
        input_ids=input_ids[train_index],
        attention_mask=attention_mask[train_index],
        labels=labels[train_index],
        repeated=True,
    )
    # The validation data is iterated once, in its original order.
    val_dataset = get_dataset(
        input_ids=input_ids[val_index],
        attention_mask=attention_mask[val_index],
        labels=labels[val_index],
        ordered=True,
    )

    model.fit(
        train_dataset,
        validation_data=val_dataset,
        # One pass over the training fold, counted in full batches only.
        steps_per_epoch=len(train_index) // BATCH_SIZE,
        epochs=1,
        verbose=2,
    )

    # Save this fold's weights; `model` itself is reused by the inference cells.
    model.save_weights(f"model_{i}.h5")
    # Train the first fold only; remove this `break` to run all N_SPLITS folds.
    break

# <a id="inference">Inference</a>

Move this part to a separate notebook to be able to train your models on TPU and load them here simply as:

```python
model = get_model()
model.load_weights("../path/to/your/dataset/model_0.h5")
```

Only 4 notebooks are available in the test folder at runtime, but there will be over 20,000 when you submit!

In [None]:
# Load every test notebook and expose the cell id as a regular column.
paths = glob.glob(os.path.join(DATA_PATH, "test", "*.json"))

notebooks = [read_notebook(x) for x in tqdm(paths, total=len(paths))]
df = pd.concat(notebooks).rename_axis("cell_id").reset_index()

# Number the cells within each (notebook, cell type) group in file order, then
# convert that number to a percentile rank within the same group.
group_keys = ["id", "cell_type"]
df["rank"] = df.groupby(group_keys).cumcount()
df["pct_rank"] = df.groupby(group_keys)["rank"].rank(pct=True)

display(df)

In [None]:
# Only markdown cells are scored by the model.
is_markdown = df["cell_type"] == "markdown"
input_ids, attention_mask = tokenize(df.loc[is_markdown, "source"])

# Keep the original order so the predictions line up with the markdown rows.
test_dataset = get_dataset(input_ids=input_ids, attention_mask=attention_mask, ordered=True)
y_pred = model.predict(test_dataset)

In [None]:
# The model has a single output unit, so the predictions come back as an
# (n, 1) array. Flatten it to 1-D so it matches the column selection it is
# assigned to.
df.loc[df["cell_type"] == "markdown", "pct_rank"] = np.asarray(y_pred).reshape(-1)

# Order the cells of each notebook by (predicted) relative position and join
# their ids into one space-separated string per notebook. `agg` followed by
# `reset_index` yields the same two-column frame on every pandas version,
# unlike `groupby(..., as_index=False)[col].apply(...)`.
df = (
    df.sort_values("pct_rank")
    .groupby("id")["cell_id"]
    .agg(" ".join)
    .rename("cell_order")
    .reset_index()
)
df.to_csv("submission.csv", index=False)
display(df)