# AI4Code Pytorch DistilBert Baseline + Toc Post-processing



Nearly 3% of notebooks have a table of contents (ToC) that can be used to reorder their cells. Notebooks with a ToC are usually longer: 80 cells on average vs. 45 cells for notebooks without one. In this notebook I show how to parse most of these ToCs and then how to post-process predictions based on them.



Initially using https://www.kaggle.com/code/aerdem4/ai4code-pytorch-distilbert-baseline
Now using: https://www.kaggle.com/code/yuanzhezhou/ai4code-distilbert-training-777

In [None]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

BERT_PATH = "../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased"

data_dir = Path('../input/AI4Code')

In [None]:
DEBUG = False
NUM_TRAIN = 1000 if DEBUG else 10000

df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')

def read_notebook(path):
    """Load one notebook JSON file into a DataFrame of its cells.

    Args:
        path: ``pathlib.Path`` of the notebook file; its stem is used as the
            notebook id.

    Returns:
        DataFrame whose index is named ``cell_id``, holding the columns read
        from the JSON plus an ``id`` column filled with the notebook id.
    """
    column_types = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_types)
    # Tag every cell with the notebook it came from before naming the index.
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')

paths_train = list((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)

# ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the single remaining column after the read yields the same Series
# (indexed by notebook id, named after the CSV column).
df_orders = (
    pd.read_csv(
        data_dir / 'train_orders.csv',
        index_col='id',
    )
    .squeeze('columns')
    .str.split()  # Split the string representation of cell_ids into a list
)

def get_ranks(base, derived):
    """Return, for each item of ``derived``, its position in ``base``.

    Positions come from a one-off lookup table rather than ``list.index`` per
    item, so the cost is linear instead of quadratic in the notebook length.

    Args:
        base: Sequence defining the reference order.
        derived: Iterable of items to locate in ``base``.

    Returns:
        List of integer positions, one per item of ``derived``. When an item
        occurs several times in ``base`` the first position is used, exactly
        as ``list.index`` does.

    Raises:
        ValueError: If an item of ``derived`` is not in ``base``.
    """
    try:
        position = {}
        for i, item in enumerate(base):
            position.setdefault(item, i)  # keep the first occurrence
        return [position[d] for d in derived]
    except TypeError:
        # Unhashable items cannot be dictionary keys; fall back to a scan.
        return [base.index(d) for d in derived]
    except KeyError as err:
        raise ValueError(f"{err.args[0]!r} is not in list") from None

# Pair each loaded notebook's correct cell order (from ``df_orders``) with the
# list of its cell ids as they appear in ``df``. The right join keeps only the
# notebooks present in ``df``.
df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)

# For every notebook, record each cell id together with its position in the
# correct order.
ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

# Turn the per-notebook lists into one row per (id, cell_id) pair.
df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)
    .set_index('cell_id', append=True)
)

# Attach the rank and the ancestor columns to every cell, then scale the rank
# by the number of cells in the notebook to obtain the regression target.
df = df.reset_index().merge(df_ranks, on=["id", "cell_id"]).merge(df_ancestors, on=["id"])
df["pct_rank"] = df["rank"] / df.groupby("id")["cell_id"].transform("count")

# Table of contents EDA

In [None]:
df_md = df[df.cell_type=='markdown'].reset_index()
df_md['is_toc'] = df_md.source.str.lower().str.contains('table of content').astype('int')
contain_toc = df_md.groupby('id').is_toc.max()

print(f'proportion containing a table of content: {contain_toc.mean():.2%}')

In [None]:
pd.DataFrame({'lenght':df.reset_index().groupby('id').cell_id.count(),'contain_toc':contain_toc}).groupby('contain_toc').mean()

In [None]:
df_md[df_md.is_toc==1].pct_rank.hist(bins=100)

We can start by setting the predicted position of the ToC cell very low (i.e. near the top of the notebook)...

In [None]:
# Print the first few table-of-contents cells. ``head`` stops early when fewer
# TOC cells exist, where indexing a fixed range with ``iloc`` would raise
# IndexError.
for toc_source in df_md.loc[df_md.is_toc == 1, 'source'].head(2 if DEBUG else 10):
    print(toc_source)
    print('------')

In [None]:
import re

def process_toc(line):
    """Extract chapter anchors (or titles) from a table-of-contents cell.

    Three notations are tried in order and the first one that yields anything
    wins:

    1. quoted anchors such as ``href="#intro"``,
    2. markdown links such as ``[Intro](#intro)``,
    3. plain bullet items such as ``- Intro`` (the text after the dash).

    The anchor captures are non-greedy, so several links on one line, or extra
    attributes after a quoted anchor, each give a clean anchor instead of one
    capture running to the last delimiter on the line.

    Args:
        line: Source text of the cell (may span several lines).

    Returns:
        List of captured strings; empty when no notation matches.
    """
    for anchor_pattern in (r'"#(.*?)"', r'\(#(.*?)\)'):
        anchors = re.findall(anchor_pattern, line)
        if anchors:
            return anchors
    # Fallback: bullet items without links; keep the rest of the line.
    return re.findall(r'- (.*)', line)
    
# Share of table-of-contents cells from which ``process_toc`` extracts at
# least one chapter. The filtered sources are computed once and each cell is
# parsed once.
toc_sources = df_md[df_md.is_toc==1].source
c = 0
for s in toc_sources:
    if len(process_toc(s)):
        c+=1

# ``max(..., 1)`` avoids a ZeroDivisionError when the sample has no TOC cell.
print(f' ugly but seems to process {c/max(len(toc_sources), 1):.2%} table of contents')

In [None]:
ids_toc = df_md[df_md.is_toc==1].id.unique()
ids_toc_i = 0

df_ex = df[(df['id'] == ids_toc[ids_toc_i])&(df['cell_type'] == 'markdown')]

In [None]:
from bs4 import BeautifulSoup

# Candidate anchor patterns tried by ``get_id``. Raw strings keep the regex
# escapes intact and avoid Python's invalid-escape-sequence warning; the
# resulting pattern text is unchanged.
reg_lists = [r"\='(.*)'", r'\(\#(.*)\)', r'\="(.*)">', r'"(.*)"']

def get_id(line, reg_lists = reg_lists):
    """Return the text matched by an anchor-like pattern near the line start.

    Every pattern in ``reg_lists`` is searched in turn. A match counts only if
    it starts within the first 10 characters of ``line``. When several
    patterns qualify, the one listed last wins.

    Args:
        line: Text to search.
        reg_lists: Regular-expression strings to try, in order.

    Returns:
        The full matched substring (delimiters included), or ``''`` when no
        pattern matches close enough to the start.
    """
    output = ''
    for r in reg_lists:
        match = re.search(r, line)
        if match and match.start() < 10:
            output = match.group(0)
    return output

def extract_id(line):
    """Return the ``id`` attribute of the first ``<a>`` tag found in ``line``.

    The parser is named explicitly. Without it BeautifulSoup picks whichever
    parser is installed, so results vary between environments and a
    ``GuessedAtParserWarning`` is emitted.

    Args:
        line: Cell source, possibly containing HTML anchors.

    Returns:
        ``''`` when ``line`` has no ``<a>`` tag; otherwise the tag's ``id``
        attribute, which is ``None`` when the tag carries no ``id``.
    """
    soup = BeautifulSoup(line, 'html.parser')
    anchor = soup.find('a')
    if anchor is None:
        return ''
    return anchor.get('id')


ids_toc[ids_toc_i]

print(df_md[(df_md.is_toc==1)&(df_md['id'] == ids_toc[ids_toc_i])].source.values[0])
print(process_toc(df_md[(df_md.is_toc==1)&(df_md['id'] == ids_toc[ids_toc_i])].source.values[0]))
print(df_ex.source.apply(extract_id))

In [None]:
from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1  # size of validation set

# Split by ``ancestor_id`` so that notebooks sharing an ancestor never end up
# on both sides of the split.
splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

train_ind, val_ind = next(splitter.split(df, groups=df["ancestor_id"]))

# ``split`` yields positional indices, so select rows by position with
# ``iloc``. Label-based ``loc`` only works while ``df`` has a default
# RangeIndex.
train_df = df.iloc[train_ind].reset_index(drop=True)
val_df = df.iloc[val_ind].reset_index(drop=True)

In [None]:
from bisect import bisect

def count_inversions(a):
    """Count the pairs ``(i, j)`` with ``i < j`` and ``a[i] > a[j]``.

    Elements are inserted one by one into a sorted list. Each new element
    contributes as many inversions as there are already-seen elements that
    are strictly greater than it.
    """
    seen_sorted = []
    total = 0
    for value in a:
        # Insertion point to the right of any equal elements.
        pos = bisect(seen_sorted, value)
        total += len(seen_sorted) - pos
        seen_sorted.insert(pos, value)
    return total


def kendall_tau(ground_truth, predictions):
    """Compute the Kendall-tau style score over a collection of notebooks.

    Args:
        ground_truth: Iterable of correct cell-id orders, one list per
            notebook.
        predictions: Iterable of predicted cell-id orders, aligned with
            ``ground_truth``.

    Returns:
        ``1 - 4 * inversions / sum(n * (n - 1))``, where ``inversions`` is the
        total number of inverted pairs across notebooks and ``n`` is each
        notebook's length.
    """
    inversion_sum = 0
    pair_budget = 0  # sum of n * (n - 1), i.e. twice the max inversions
    for truth, guess in zip(ground_truth, predictions):
        # Express the predicted order as positions in the true order.
        positions = [truth.index(cell) for cell in guess]
        inversion_sum += count_inversions(positions)
        size = len(truth)
        pair_budget += size * (size - 1)
    return 1 - 4 * inversion_sum / pair_budget

In [None]:
y_dummy = val_df.groupby('id')['cell_id'].apply(list)
kendall_tau(df_orders.loc[y_dummy.index], y_dummy)

In [None]:
val_df["cell_type"].value_counts()

In [None]:
train_df_mark = train_df[train_df["cell_type"] == "markdown"].reset_index(drop=True)

val_df_mark = val_df[val_df["cell_type"] == "markdown"].reset_index(drop=True)

In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(val_df_mark["pct_rank"], np.ones(val_df_mark.shape[0])*train_df_mark["pct_rank"].mean())

In [None]:
from tqdm import tqdm
import sys, os
from transformers import DistilBertModel, DistilBertTokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch

MAX_LEN = 128

class MarkdownModel(nn.Module):
    """DistilBERT encoder followed by a single-output linear layer."""

    def __init__(self):
        super().__init__()
        # The attribute names below are also the parameter-name prefixes used
        # when a saved state dict is loaded into this module.
        self.distill_bert = DistilBertModel.from_pretrained(BERT_PATH)
        self.top = nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Return one value per sequence from the first token's hidden state."""
        hidden_states = self.distill_bert(ids, mask)[0]
        first_token = hidden_states[:, 0, :]
        return self.top(first_token)

In [None]:
from torch.utils.data import DataLoader, Dataset

class MarkdownDataset(Dataset):
    """Dataset yielding tokenised cell sources with their ``pct_rank`` target.

    Each item is a tuple ``(input_ids, attention_mask, target)``. The first
    two are ``LongTensor``s built from the tokenizer output, padded or
    truncated to ``max_len``. ``target`` is a one-element ``FloatTensor``
    holding the row's ``pct_rank``.
    """

    def __init__(self, df, max_len):
        super().__init__()
        # Positional access in __getitem__ relies on a clean 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        self.tokenizer = DistilBertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        record = self.df.iloc[index]

        encoded = self.tokenizer.encode_plus(
            record.source,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        input_ids = torch.LongTensor(encoded['input_ids'])
        attention_mask = torch.LongTensor(encoded['attention_mask'])
        target = torch.FloatTensor([record.pct_rank])

        return input_ids, attention_mask, target
    
train_ds = MarkdownDataset(train_df_mark, max_len=MAX_LEN)
val_ds = MarkdownDataset(val_df_mark, max_len=MAX_LEN)

val_ds[0]

In [None]:
val_df_mark.iloc[0]

In [None]:
def adjust_lr(optimizer, epoch):
    """Set the learning rate of every parameter group for the given epoch.

    Schedule: ``5e-5`` for ``epoch < 1``, ``1e-5`` for ``epoch < 2``,
    ``5e-6`` for ``epoch < 5`` and ``1e-7`` afterwards.

    Args:
        optimizer: Object exposing ``param_groups`` (a list of dicts).
        epoch: Zero-based epoch number.

    Returns:
        The learning rate that was applied.
    """
    # (exclusive upper epoch bound, learning rate), checked in order.
    schedule = ((1, 5e-5), (2, 1e-5), (5, 5e-6))
    lr = 1e-7
    for upper_bound, rate in schedule:
        if epoch < upper_bound:
            lr = rate
            break

    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr
    
def get_optimizer(net):
    """Build an Adam optimizer over the trainable parameters of ``net``.

    Only parameters with ``requires_grad`` set are handed to the optimizer.
    """
    trainable_params = [p for p in net.parameters() if p.requires_grad]
    return torch.optim.Adam(
        trainable_params,
        lr=3e-4,
        betas=(0.9, 0.999),
        eps=1e-08,
    )

In [None]:
BS = 32
NW = 8

train_loader = DataLoader(train_ds, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)

In [None]:
def read_data(data):
    """Move a batch to the GPU and split it into model inputs and target.

    Args:
        data: Sequence whose last element is the target and whose preceding
            elements are the model inputs; every element must offer ``cuda()``.

    Returns:
        ``(inputs, target)`` where ``inputs`` is a tuple of the moved inputs.
    """
    inputs = tuple(tensor.cuda() for tensor in data[:-1])
    target = data[-1].cuda()
    return inputs, target


def validate(model, val_loader):
    """Run the model over a loader without gradients and collect the outputs.

    The model is switched to evaluation mode and left in that mode.

    Args:
        model: Callable taking ``(ids, mask)``.
        val_loader: Iterable of batches accepted by ``read_data``.

    Returns:
        ``(labels, preds)`` as flat NumPy arrays concatenated over all batches.
    """
    model.eval()

    progress = tqdm(val_loader, file=sys.stdout)
    batch_targets = []
    batch_preds = []

    with torch.no_grad():
        for batch in progress:
            inputs, target = read_data(batch)
            output = model(inputs[0], inputs[1])

            batch_preds.append(output.detach().cpu().numpy().ravel())
            batch_targets.append(target.detach().cpu().numpy().ravel())

    all_targets = np.concatenate(batch_targets)
    all_preds = np.concatenate(batch_preds)
    return all_targets, all_preds


In [None]:

def train(model, train_loader, val_loader, epochs):
    """Train ``model`` with MSE loss and validate after every epoch.

    The learning rate is set at the start of each epoch by ``adjust_lr``.
    After each epoch the validation MSE is printed.

    Note: only NumPy's global RNG is seeded here.

    Args:
        model: Module taking ``(ids, mask)`` and returning predictions.
        train_loader: Iterable of training batches accepted by ``read_data``.
        val_loader: Iterable of validation batches passed to ``validate``.
        epochs: Number of passes over ``train_loader``.

    Returns:
        ``(model, y_pred)`` where ``y_pred`` holds the validation predictions
        of the last epoch, or ``None`` when ``epochs`` is 0.
    """
    np.random.seed(0)

    optimizer = get_optimizer(model)
    criterion = torch.nn.MSELoss()

    # Defined up front so that ``epochs == 0`` returns cleanly instead of
    # failing on an unbound name.
    y_pred = None

    for e in range(epochs):
        model.train()  # validate() leaves the model in eval mode
        tbar = tqdm(train_loader, file=sys.stdout)

        lr = adjust_lr(optimizer, e)

        loss_list = []

        for data in tbar:
            inputs, target = read_data(data)

            optimizer.zero_grad()
            pred = model(inputs[0], inputs[1])

            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()

            # Only the scalar loss is kept. Training predictions and labels
            # were never read, so they are no longer copied to the host.
            loss_list.append(loss.item())

            avg_loss = np.round(np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e+1} Loss: {avg_loss} lr: {lr}")

        y_val, y_pred = validate(model, val_loader)

        print("Validation MSE:", np.round(mean_squared_error(y_val, y_pred), 4))
        print()
    return model, y_pred

# model = MarkdownModel()
# model = model.cuda()
# model, y_pred = train(model, train_loader, val_loader, epochs = 1 if DEBUG else 2)

In [None]:
paths_test = list((data_dir / 'test').glob('*.json'))
notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]
test_df = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
).reset_index()

test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()
test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)

test_df["pct_rank"] = 0
test_ds = MarkdownDataset(test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True), max_len=MAX_LEN)
test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)

len(test_ds), test_ds[0]

In [None]:
model = MarkdownModel()
model = model.cuda()
model.load_state_dict(torch.load('../input/mymodels2022v3/my_own_model_markdown_file_0_0.78169.bin'))

y_val, y_pred = validate(model, val_loader)
y_test = validate(model, test_loader)[1]

In [None]:
# Start every cell with the percentile of its true rank within its
# (notebook, cell type) group.
val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
# Overwrite the markdown cells with ``y_pred``. The assignment is positional,
# so ``y_pred`` must hold one value per markdown row of ``val_df``, in the
# same row order.
val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred

In [None]:
y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
kendall_tau(df_orders.loc[y_dummy.index], y_dummy)

In [None]:
def toc_post_process_sort(val_df):
    """Re-assign predictions of chapter cells so they follow the TOC order.

    For every notebook with a cell whose source contains 'table of content'
    (case-insensitive), the chapters listed in the first such cell are matched
    to cells through ``extract_id``. The existing ``pred`` values of the
    matched cells are sorted and handed out again in TOC order. All other
    cells keep their prediction.

    ``val_df`` is modified in place (columns ``is_toc`` and ``pred``) and also
    returned.
    """
    # sort values to match toc chapters
    val_df['is_toc'] = val_df.source.str.lower().str.contains('table of content')
    val_ids = val_df[val_df['is_toc']].id.unique()
    for id_str in val_ids:
        # Work on a copy of this notebook's rows; results are written back below.
        df_id = val_df[val_df['id']==id_str].copy()
        # Only the first TOC cell of the notebook is parsed.
        chapters = process_toc(df_id[df_id['is_toc']].source.values[0])
        df_id['identification'] = df_id.source.apply(extract_id)
        # Sorted predictions of the matched cells, labelled by chapter in TOC
        # order: the smallest prediction goes to the first listed chapter.
        # NOTE(review): values and index are built independently; their lengths
        # appear to differ when several cells share an anchor or the TOC repeats
        # a chapter, which would make the constructor raise - confirm.
        dti = pd.Series(np.sort(df_id[df_id.identification.isin(chapters)].pred.values),index=[c for c in chapters if c in df_id.identification.values])
        # Cells without a matching chapter get NaN here ...
        df_id['new_pred'] = df_id.identification.map(dti)
        # ... and therefore keep their original prediction.
        val_df.loc[val_df['id']==id_str,'pred'] = np.where(~df_id['new_pred'].isna(),df_id['new_pred'],df_id['pred']).copy()
    return val_df

def toc_post_process_values(val_df):
    """Overwrite predictions of TOC and chapter cells with fixed positions.

    For every notebook with a cell whose source contains 'table of content'
    (case-insensitive):

    * cells flagged as TOC get prediction ``0``;
    * the ``n`` chapters listed in the first TOC cell and matched to a cell
      through ``extract_id`` get ``1/(n+1), 2/(n+1), ..., n/(n+1)`` in TOC
      order;
    * all other cells keep their prediction.

    A chapter listed more than once in the TOC is used at its first position
    only. Repeats would otherwise build a lookup with a duplicated index, on
    which ``Series.map`` fails, so the whole notebook would be skipped.

    Processing is best effort: a notebook that fails is reported with its
    error and left untouched. ``val_df`` is modified in place (columns
    ``is_toc`` and ``pred``) and also returned.
    """
    val_df['is_toc'] = val_df.source.str.lower().str.contains('table of content')
    val_ids = val_df[val_df['is_toc']].id.unique()
    for id_str in val_ids:
        try:
            in_notebook = val_df['id'] == id_str
            df_id = val_df[in_notebook].copy()
            # Only the first TOC cell of the notebook is parsed.
            chapters = process_toc(df_id[df_id['is_toc']].source.values[0])
            df_id['identification'] = df_id.source.apply(extract_id)
            # NOTE(review): code cells are matched as well (as TOC and as
            # chapter anchors); confirm whether this should be markdown-only.
            present = set(df_id.identification.values)
            # Chapters found in the notebook, de-duplicated, in TOC order.
            idx = list(dict.fromkeys(c for c in chapters if c in present))
            # Evenly spaced positions strictly between 0 and 1.
            dti = pd.Series(np.arange(1, len(idx) + 1) / (len(idx) + 1), index=idx)
            df_id['new_pred'] = df_id.identification.map(dti)
            df_id.loc[df_id.is_toc, 'new_pred'] = 0
            val_df.loc[in_notebook, 'pred'] = np.where(
                df_id['new_pred'].isna(), df_id['pred'], df_id['new_pred']
            )
        except Exception as err:
            # Best effort: report the notebook and why it failed, then go on.
            # KeyboardInterrupt and SystemExit are no longer swallowed.
            print(id_str, repr(err))
    return val_df

In [None]:
toc_post_process_values(val_df)

In [None]:
y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
kendall_tau(df_orders.loc[y_dummy.index], y_dummy)

In [None]:
paths_test = list((data_dir / 'test').glob('*.json'))
notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]
test_df = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
).reset_index()

In [None]:
test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()
test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)

In [None]:
test_df["pct_rank"] = 0
test_ds = MarkdownDataset(test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True), max_len=MAX_LEN)
test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)

len(test_ds), test_ds[0]

In [None]:
_, y_test = validate(model, test_loader)

In [None]:
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test

In [None]:
test_df = toc_post_process_values(test_df)

In [None]:
sub_df = test_df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
sub_df.head()

In [None]:
sub_df.to_csv("submission.csv", index=False)