In [None]:
from pathlib import Path

# --- Pretrained encoder and fine-tuned checkpoints -------------------------
# Hugging Face model id used for both the encoders and the tokenizers.
BERT_MODEL_PATH = 'microsoft/codebert-base'
# Checkpoint loaded into MarkdownOnlyModel by the inference cell.
MARK_PATH = 'weights/model_markdown_07840.pth'
CODE_PATH = 'weights/model_code.pth'
CODE_MARK_PATH = 'weights/model_code_mark_07575.pth'
# Checkpoint loaded into MarkdownRankModel by the inference cell.
CODE_MARK_RANK_PATH = 'weights/model_code_mark_rank.pth'
# Checkpoint loaded into SigMoidModel by the inference cell.
SIGMOID_PATH = 'weights/model_sigmoid_40_mae.pth'
FASTTEST_MODEL = 'weights/model140000.bin'
# Root folder holding `train/*.json` and `train_orders.csv`.
DATA_DIR = Path('AI4Code')

# Alphabet used to turn cell-id strings into integer sequences and back
# (see id_to_label / label_to_id).  Every symbol must be unique: the list
# previously contained 'k' twice and no 'q', so 'q' could not be encoded and
# index 16 decoded to the wrong letter.
LABELS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
          'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

# --- Tree-model style hyper-parameters --------------------------------------
# NOTE(review): none of these are referenced in the visible cells; presumably
# they configure a gradient-boosted tree stage elsewhere -- confirm or remove.
MAX_TREE_DEPTH = 8
TREE_METHOD = 'gpu_hist'
SUBSAMPLE = 0.6
REGULARIZATION = 0.1
GAMMA = 0.3
POS_WEIGHT = 1
EARLY_STOP = 50
LEARNING_RATE = 0.01
NUM_TRAIN = 200

# --- Ranking / tokenisation settings ----------------------------------------
# Number of code cells grouped into one chunk by get_features_rank.
RANK_COUNT = 20
SIGMOID_RANK_COUNT = 10
# Token budget for the markdown cell and for each individual code cell.
MD_MAX_LEN = 64
CODE_MAX_LEN = 23
# Length of the concatenated markdown + code token sequence.
TOTAL_MAX_LEN = 512
# Token budget for the markdown-only model input.
MAX_LEN = 128
NVALID = 0.1
EPOCH = 5
# DataLoader batch size and worker count.
BS = 2
NW = 1
# Class ids 0..RANK_COUNT predicted by MarkdownRankModel.
RANKS = list(range(RANK_COUNT + 1))
accumulation_steps = 32

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel


class MarkdownOnlyModel(nn.Module):
    """Encoder plus a single linear unit that scores one markdown cell.

    The attribute names (`distill_bert`, `top`) are part of the checkpoint
    layout loaded via `load_state_dict`, so they must not be renamed.
    """

    def __init__(self):
        super().__init__()
        self.distill_bert = AutoModel.from_pretrained(BERT_MODEL_PATH)
        self.top = nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Return sigmoid(linear(first-token hidden state)) per sample.

        `ids` and `mask` are passed positionally to the encoder; the first
        element of the encoder output is indexed as [batch, token, feature].
        """
        hidden_states = self.distill_bert(ids, mask)[0]
        first_token = hidden_states[:, 0, :]
        return torch.sigmoid(self.top(first_token))


class MarkdownRankModel(nn.Module):
    """Encoder plus a classifier over `len(RANKS)` position classes.

    The classifier input is the 768-wide first-token vector with two extra
    scalar features appended (hence 770).  Attribute names are kept because
    they are the keys of the saved state dict.
    """

    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(BERT_MODEL_PATH)
        self.top = nn.Linear(770, len(RANKS))
        self.activation = nn.LogSoftmax(dim=1)

    def forward(self, ids, mask, fts, code_lens):
        """Return log-probabilities over the rank classes for each sample.

        `fts` and `code_lens` are concatenated (in that order) after the
        first-token hidden state along dimension 1.
        """
        hidden_states = self.model(ids, mask)[0]
        first_token = hidden_states[:, 0, :]
        joined = torch.cat([first_token, fts, code_lens], dim=1)
        logits = self.top(joined)
        return self.activation(logits)


class SigMoidModel(nn.Module):
    """Encoder plus one linear unit that emits a raw (pre-sigmoid) logit.

    No activation is applied here; `validate_sigmoid` applies
    `torch.sigmoid` to the output.  Attribute names match the saved
    state dict and must stay as they are.
    """

    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(BERT_MODEL_PATH)
        self.top = nn.Linear(770, 1)

    def forward(self, ids, mask, fts, code_lens):
        """Return one logit per sample from [first-token state, fts, code_lens]."""
        hidden_states = self.model(ids, mask)[0]
        first_token = hidden_states[:, 0, :]
        joined = torch.cat([first_token, fts, code_lens], dim=1)
        return self.top(joined)


In [None]:
import re
import sys
from bisect import bisect

import nltk
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import torch
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from wordcloud import STOPWORDS

from config import LABELS, RANK_COUNT, RANKS

# Fetch the NLTK corpora needed by WordNetLemmatizer; these calls hit the
# network on first use, so the cell needs internet access (or a pre-populated
# NLTK data directory).
nltk.download('wordnet')
nltk.download('omw-1.4')
# Despite its name, `stemmer` is a WordNet lemmatizer; preprocess_text calls
# `stemmer.lemmatize` on every token.
stemmer = WordNetLemmatizer()
# Stop-word set taken from the wordcloud package.
# NOTE(review): not referenced by any function in the visible cells.
stopwords = set(STOPWORDS)


def id_to_label(ids):
    """Encode a cell-id string as a list of positions in `LABELS`.

    Each character of `ids` is looked up with `LABELS.index`, so a character
    that is not in `LABELS` raises ValueError.
    """
    encoded = []
    for symbol in ids:
        encoded.append(LABELS.index(symbol))
    return encoded


def label_to_id(labels):
    """Decode a sequence of `LABELS` positions back into a cell-id string."""
    symbols = (LABELS[position] for position in labels)
    return ''.join(symbols)


def preprocess_text(document):
    """Normalise a markdown cell into a space-joined string of lemmas.

    Steps, in order: replace every non-word character with a space, drop
    isolated single letters, collapse whitespace, strip a leading stand-alone
    ``b``, lowercase, lemmatize each whitespace token with the module-level
    ``stemmer``, and keep only tokens longer than three characters.

    Args:
        document: Cell source; converted with ``str()`` first, so non-string
            values (e.g. NaN) are accepted.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # Remove all the special characters
    text = re.sub(r'\W', ' ', str(document))

    # Remove single characters surrounded by whitespace
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove a single character at the start.  The pattern used to be
    # r'\^[a-zA-Z]\s+', which looks for a literal '^' (already stripped by
    # the \W pass above) and therefore never matched.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Substitute runs of whitespace with a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove a prefixed stand-alone 'b'
    text = re.sub(r'^b\s+', '', text)

    # Lowercase before lemmatization
    text = text.lower()

    # Lemmatize, then discard short tokens (length <= 3)
    lemmas = (stemmer.lemmatize(word) for word in text.split())
    return ' '.join(lemma for lemma in lemmas if len(lemma) > 3)


def preprocess_code(cell):
    """Return the first 200 characters of a code cell after un-escaping newlines.

    The value is stringified, every literal backslash-n pair is replaced by a
    real newline, and the result is truncated to 200 characters.
    """
    text = str(cell)
    unescaped = text.replace('\\n', '\n')
    return unescaped[:200]


def read_notebook(path):
    """Load one notebook JSON file into a DataFrame indexed by `cell_id`.

    `path` must expose `.stem` (e.g. a `pathlib.Path`); the stem is stored in
    a new `id` column.  `cell_type` is read as a category and `source` as str.
    """
    column_types = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_types)
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')


def get_features_mark(df, mode='train'):
    """Collect one `{'mark', 'pct_rank'}` record per markdown cell.

    The frame is sorted by `rank`, grouped by notebook `id`, and every row
    whose `cell_type` is 'markdown' contributes its `cell_id` and `pct_rank`.
    `mode` is accepted but not used.

    Returns:
        list of dicts, in group order then rank order.
    """
    ordered = df.sort_values('rank').reset_index(drop=True)
    features = []

    for _, nb_cells in tqdm(ordered.groupby('id')):
        md_cells = nb_cells[nb_cells.cell_type == 'markdown']

        for pos in range(len(md_cells)):
            row = md_cells.iloc[pos]
            features.append({
                'mark': row['cell_id'],
                'pct_rank': row['pct_rank']
            })

    return features


def get_features_rank(df, mode='train'):
    """Pair every markdown cell with every chunk of up to RANK_COUNT code cells.

    For each (markdown, code-chunk) pair this computes:
      * `relative` -- 1 when the markdown's `rank` falls inside the chunk's
        rank window (the last chunk also accepts anything above its lower
        bound), else 0;
      * `code_rank` -- 0 when not relative or when the markdown precedes the
        first code cell of the first chunk, otherwise 1 + the index of the
        closest code cell whose rank does not exceed the markdown's rank.

    `mode` controls which pairs are stored as features:
      * 'classification' -- only pairs with relative == 1 (the record also
        carries the notebook `id`);
      * 'sigmoid' -- only notebooks with more than RANK_COUNT code cells;
      * anything else -- every pair.
    `labels` and `code_ranks` are appended for every pair regardless of mode.

    Returns:
        (features, labels, code_ranks) as numpy arrays.
    """
    features, labels, code_ranks = [], [], []
    df = df.sort_values('rank').reset_index(drop=True)

    for nb_id, nb_cells in tqdm(df.groupby('id')):
        md_cells = nb_cells[nb_cells.cell_type == 'markdown']
        code_cells = nb_cells[nb_cells.cell_type == 'code']
        total_code_len = len(code_cells)
        total_md = md_cells.shape[0]

        for i in range(total_md):
            md_row = md_cells.iloc[i]
            mark = md_row['cell_id']
            rank = md_row['rank']

            for j in range(0, total_code_len, RANK_COUNT):
                chunk = code_cells[j: j + RANK_COUNT]
                codes = chunk['cell_id'].to_list()
                ranks = chunk['rank'].values
                total_code = chunk.shape[0]

                # Rank window covered by this chunk; the first chunk's lower
                # bound is pinned to 0.
                min_rank = ranks[0] if j != 0 else 0
                max_rank = ranks[-1]

                is_last_chunk = total_code_len - j <= RANK_COUNT
                if is_last_chunk and rank > min_rank:
                    relative = 1
                elif rank < min_rank or rank > max_rank:
                    relative = 0
                else:
                    relative = 1

                code_rank = 0
                if relative == 1 and not (j == 0 and rank < ranks[0]):
                    # Distance to each code cell; cells ranked after the
                    # markdown are pushed out of contention.
                    gaps = rank - ranks
                    gaps[gaps < 0] = 100000
                    code_rank = np.argmin(gaps) + 1

                # Pad a short chunk to RANK_COUNT by repeating its last rank
                # (done after code_rank so the padding cannot affect it).
                if len(ranks) < RANK_COUNT:
                    padding = np.ones(RANK_COUNT - len(ranks),) * ranks[-1]
                    ranks = np.concatenate([ranks, padding], 0)

                if mode == 'classification':
                    keep = relative == 1
                elif mode == 'sigmoid':
                    keep = total_code_len > RANK_COUNT
                else:
                    keep = True

                if keep:
                    feature = {
                        'total_code': int(total_code),
                        'total_md': int(total_md),
                        'codes': codes,
                        'ranks': ranks,
                        'code_rank': code_rank,
                        'mark': mark,
                        'pct_rank': md_row['pct_rank'],
                        'relative': relative,
                        'total_code_len': total_code_len
                    }
                    if mode == 'classification':
                        feature = {'id': nb_id, **feature}
                    features.append(feature)

                labels.append(relative)
                code_ranks.append(code_rank)

    return np.array(features), np.array(labels), np.array(code_ranks)


def validate_markdown(model, val_loader, device):
    """Run the markdown-only model and map each cell id to its score.

    Each batch is expected to unpack as (ids, mask, <ignored>, encoded cell
    ids).  Encoded ids are decoded with `label_to_id`.  If a cell id occurs
    more than once, the last score wins.

    Returns:
        dict: cell id string -> predicted score (float).
    """
    model.eval()

    scores = []
    cell_ids = []

    with torch.no_grad():
        for ids, mask, _, encoded_ids in tqdm(val_loader, file=sys.stdout):
            with torch.cuda.amp.autocast():
                output = model(ids.to(device), mask.to(device))
            scores.extend(output.detach().cpu().numpy().ravel().tolist())
            cell_ids.extend(label_to_id(encoded) for encoded in encoded_ids)

    return dict(zip(cell_ids, scores))


def validate_sigmoid(model, val_loader, device, threshold=0.5):
    """Evaluate the binary "is this chunk relevant" model.

    Each batch must unpack as (ids, mask, fts, <ignored>, code_lens,
    <ignored>, target, total_code_lens).  Samples whose notebook has at most
    RANK_COUNT code cells are forced to probability 1.0 before thresholding.

    Returns:
        tuple of
          overall accuracy,
          accuracy on samples whose target is 0,
          accuracy on samples whose target is 1,
          list of thresholded predictions (0/1),
          list of targets,
          list of probabilities (after the forced-1.0 override).
        The accuracies divide by the respective sample counts, so an empty
        class (or loader) leads to a division by zero.
    """
    model.eval()

    n_all = 0
    n_zero = 0
    n_one = 0
    hit_all = 0
    hit_zero = 0
    hit_one = 0

    relatives = []
    preds = []
    targets = []

    with torch.no_grad():
        for ids, mask, fts, _, code_lens, _, target, total_code_lens in tqdm(val_loader, file=sys.stdout):
            with torch.cuda.amp.autocast():
                logits = model(ids.to(device), mask.to(device),
                               fts.to(device), code_lens.to(device))

            # Notebooks that fit in a single chunk are always "relevant".
            single_chunk = total_code_lens.detach().cpu().numpy().ravel() <= RANK_COUNT

            prob = torch.sigmoid(logits).detach().cpu().numpy().ravel()
            prob[single_chunk] = 1.0
            preds += prob.tolist()

            decided = (prob >= threshold).astype(np.int8)
            relatives += decided.tolist()

            truth = target.detach().cpu().numpy().ravel()
            targets += truth.tolist()

            is_zero = truth == 0
            is_one = truth == 1

            n_zero += int(np.sum(is_zero))
            n_one += int(np.sum(is_one))
            n_all += len(truth)

            hit_zero += np.sum(decided[is_zero] == truth[is_zero])
            hit_one += np.sum(decided[is_one] == truth[is_one])
            hit_all += np.sum(decided == truth)

    return hit_all / n_all, hit_zero / n_zero, hit_one / n_one, relatives, targets, preds


def validate_rank_inference(model, val_loader, device):
    """Run the rank classifier and convert class ids into predicted positions.

    Each batch must unpack as (ids, mask, fts, code_len, target, encoded cell
    id, ranks).  The predicted class is the argmax over the model output.
    Class 0 maps to position 0; class k > 0 maps to `ranks[k - 1] + 1`, i.e.
    one past the rank of the k-th code cell of the chunk.

    Returns:
        (preds, targets, accuracy, mark_dict) where `mark_dict` maps decoded
        cell id -> predicted position (later samples overwrite earlier ones).
    """
    model.eval()

    pred_batches = []
    target_batches = []
    mark_ids = []
    rank_rows = []

    with torch.no_grad():
        for ids, mask, fts, code_len, target, cell_id, ranks in tqdm(val_loader, file=sys.stdout):
            batch_ranks = ranks.detach().cpu().numpy().tolist()
            with torch.cuda.amp.autocast():
                output = model(ids.to(device), mask.to(device),
                               fts.to(device), code_len.to(device))
            best_class = torch.argmax(output, dim=1)
            pred_batches.append(best_class.detach().cpu().numpy().ravel())
            target_batches.append(target.detach().cpu().numpy().ravel())
            mark_ids.extend(label_to_id(encoded) for encoded in cell_id)
            rank_rows.extend(batch_ranks)

    preds = np.concatenate(pred_batches)
    targets = np.concatenate(target_batches)

    mark_dict = {}
    for cell, klass, chunk_ranks in zip(mark_ids, preds, rank_rows):
        mark_dict[cell] = klass if klass == 0 else chunk_ranks[klass - 1] + 1

    return preds, targets, accuracy_score(targets, preds), mark_dict


def count_inversions(a):
    """Count pairs (i, j) with i < j and a[i] > a[j].

    Uses insertion into a sorted list: for each new value, the number of
    already-seen values strictly greater than it is added to the total.
    """
    seen_sorted = []
    total = 0
    for value in a:
        # bisect gives the insertion point to the right of any equal values,
        # so ties are not counted as inversions.
        slot = bisect(seen_sorted, value)
        total += len(seen_sorted) - slot
        seen_sorted.insert(slot, value)
    return total


def kendall_tau(ground_truth, predictions):
    """Kendall-tau style score of predicted orderings against the truth.

    For every (ground-truth order, predicted order) pair the predicted items
    are replaced by their position in the ground truth and the inversions of
    that sequence are counted.  The score is
    ``1 - 4 * total_inversions / sum(n * (n - 1))`` with ``n = len(gt)``.

    Args:
        ground_truth: iterable of sequences of hashable items (true order).
        predictions: iterable of sequences holding the same items in
            predicted order.

    Returns:
        float score; 1.0 for a perfect ordering.

    Raises:
        ValueError: if a predicted item is missing from its ground truth.
        ZeroDivisionError: if no instance has at least two items.
    """
    total_inversions = 0  # total inversions in predicted ranks across all instances
    total_2max = 0  # maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        # Position lookup built once per instance instead of calling
        # gt.index() for every item (which made this step quadratic).
        # setdefault keeps the first occurrence, matching list.index.
        first_position = {}
        for position, item in enumerate(gt):
            first_position.setdefault(item, position)
        try:
            # rank predicted order in terms of ground truth
            ranks = [first_position[x] for x in pred]
        except KeyError as missing:
            # Preserve the exception type list.index used to raise.
            raise ValueError(f'{missing.args[0]!r} is not in list') from None
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max


def cal_kendall_tau_inference(df, mark_dict, final_pred, df_orders):
    """Assemble final positions for every cell, dump them, and print the score.

    Code cells keep their own `rank` as the prediction.  Markdown cells use
    `final_pred`; any markdown cell missing there is filled from `mark_dict`.

    Side effects:
      * adds a `pred` column to the caller's `df` (code rows) in place;
      * inserts the fallback entries into the caller's `final_pred`;
      * writes `predict.csv` to the working directory;
      * prints the Kendall-tau score.
    """
    is_code = df['cell_type'] == 'code'
    df.loc[is_code, 'pred'] = df.loc[is_code, 'rank']

    # Fall back to the secondary predictions for markdown cells without one.
    markdown_ids = df.loc[df['cell_type'] == 'markdown']['cell_id'].to_list()
    for mark in markdown_ids:
        if mark not in final_pred:
            final_pred[mark] = mark_dict[mark]

    df_markdown_pred = pd.DataFrame(
        list(final_pred.items()), columns=['cell_id', 'markdown_pred'])
    df = df.merge(df_markdown_pred, on=['cell_id'], how='outer')

    is_markdown = df['cell_type'] == 'markdown'
    df.loc[is_markdown, 'pred'] = df.loc[is_markdown]['markdown_pred']

    df[['id', 'cell_id', 'cell_type', 'rank', 'pred']].to_csv('predict.csv')
    predicted_orders = df.sort_values("pred").groupby('id')['cell_id'].apply(list)
    score = kendall_tau(df_orders.loc[predicted_orders.index], predicted_orders)
    print("Preds score", score)


In [None]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer


class MarkdownOnlyDataset(Dataset):
    """Dataset yielding one tokenized markdown cell per feature record.

    `fts` is a sequence of dicts with keys 'mark' (cell id) and 'pct_rank';
    `dict_cellid_source` maps cell id -> source text.
    """

    def __init__(self, fts, dict_cellid_source, max_len):
        super().__init__()
        self.fts = fts
        self.dict_cellid_source = dict_cellid_source
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_PATH)

    def __len__(self):
        return len(self.fts)

    def __getitem__(self, index):
        """Return (input ids, attention mask, pct_rank, encoded cell id)."""
        record = self.fts[index]
        markdown_text = self.dict_cellid_source[record['mark']]

        encoded = self.tokenizer.encode_plus(
            markdown_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )

        ids = torch.LongTensor(encoded['input_ids'])
        mask = torch.LongTensor(encoded['attention_mask'])
        target = torch.FloatTensor([record['pct_rank']])
        # The cell id travels through the DataLoader as LABELS positions.
        encoded_cell_id = torch.LongTensor(id_to_label(record['mark']))

        return ids, mask, target, encoded_cell_id


class MarkdownRankNewDataset(Dataset):
    """Dataset pairing one markdown cell with one chunk of code cells.

    Each feature record must provide 'mark', 'codes', 'ranks', 'total_md',
    'total_code' and 'code_rank'.  The markdown tokens are followed by the
    tokens of every code cell (each with its final token dropped); the result
    is cut or padded to `total_max_len`.
    """

    def __init__(self, dict_cellid_source, total_max_len, md_max_len, fts):
        super().__init__()
        self.fts = fts
        self.dict_cellid_source = dict_cellid_source
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len  # maxlen allowed by model config
        self.tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_PATH)

    def __len__(self):
        return len(self.fts)

    def _join_and_pad(self, markdown_part, code_parts):
        """Append each code sequence minus its last element, then fit to length.

        NOTE(review): the same filler (`pad_token_id`) is used for both the
        token ids and the attention mask, exactly as before.  For tokenizers
        whose pad id is non-zero this marks the tail padding as attended;
        presumably the checkpoints were trained with the same inputs, so this
        is left untouched -- confirm before changing.
        """
        merged = list(markdown_part)
        for part in code_parts:
            merged += part[:-1]
        merged = merged[:self.total_max_len]
        missing = self.total_max_len - len(merged)
        merged += [self.tokenizer.pad_token_id] * missing
        return torch.LongTensor(merged)

    def __getitem__(self, index):
        """Return (ids, mask, md-fraction, chunk fill ratio, label, encoded cell id, ranks)."""
        record = self.fts[index]
        codes = record['codes']
        ranks = record['ranks']

        md_encoding = self.tokenizer.encode_plus(
            self.dict_cellid_source[record['mark']],
            None,
            add_special_tokens=True,
            max_length=self.md_max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        code_texts = [str(self.dict_cellid_source[cell]) for cell in codes]
        code_encoding = self.tokenizer.batch_encode_plus(
            code_texts,
            add_special_tokens=True,
            max_length=CODE_MAX_LEN,
            padding='max_length',
            truncation=True
        )

        # Share of markdown cells among the cells described by this record.
        n_md = record['total_md']
        n_code = record['total_code']
        n_cells = n_md + n_code
        fts = torch.FloatTensor([0]) if n_cells == 0 else torch.FloatTensor([n_md / n_cells])

        ids = self._join_and_pad(md_encoding['input_ids'], code_encoding['input_ids'])
        mask = self._join_and_pad(md_encoding['attention_mask'], code_encoding['attention_mask'])

        assert len(ids) == self.total_max_len

        fill_ratio = torch.FloatTensor([len(codes) / RANK_COUNT])
        label = torch.LongTensor([record['code_rank']])
        encoded_cell_id = torch.LongTensor(id_to_label(record['mark']))

        return ids, mask, fts, fill_ratio, label, encoded_cell_id, torch.LongTensor(ranks)


class SigMoidDataset(Dataset):
    """Dataset feeding the binary relevance model (markdown cell + code chunk).

    Each feature record must provide 'mark', 'codes', 'total_md',
    'total_code', 'pct_rank', 'relative' and 'total_code_len'.  Token layout
    is the markdown tokens followed by each code cell's tokens minus the
    last one, cut or padded to `total_max_len`.
    """

    def __init__(self, dict_cellid_source, total_max_len, md_max_len, fts):
        super().__init__()
        self.fts = fts
        self.dict_cellid_source = dict_cellid_source
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len  # maxlen allowed by model config
        self.tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_PATH)

    def __len__(self):
        return len(self.fts)

    def _join_and_pad(self, markdown_part, code_parts):
        """Append each code sequence minus its last element, then fit to length.

        NOTE(review): the attention mask is padded with `pad_token_id` just
        like the token ids, exactly as before.  With a non-zero pad id the
        tail padding is marked as attended; presumably the checkpoint was
        trained on the same inputs, so this is preserved -- confirm before
        changing.
        """
        merged = list(markdown_part)
        for part in code_parts:
            merged += part[:-1]
        merged = merged[:self.total_max_len]
        missing = self.total_max_len - len(merged)
        merged += [self.tokenizer.pad_token_id] * missing
        return torch.LongTensor(merged)

    def __getitem__(self, index):
        """Return (ids, mask, md-fraction, loss_mask, chunk fill ratio, pct_rank, relative, total_code_len)."""
        record = self.fts[index]
        codes = record['codes']

        md_encoding = self.tokenizer.encode_plus(
            self.dict_cellid_source[record['mark']],
            None,
            add_special_tokens=True,
            max_length=self.md_max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        code_texts = [str(self.dict_cellid_source[cell]) for cell in codes]
        code_encoding = self.tokenizer.batch_encode_plus(
            code_texts,
            add_special_tokens=True,
            max_length=CODE_MAX_LEN,
            padding='max_length',
            truncation=True
        )

        # Share of markdown cells among the cells described by this record.
        n_md = record['total_md']
        n_code = record['total_code']
        n_cells = n_md + n_code
        fts = torch.FloatTensor([0]) if n_cells == 0 else torch.FloatTensor([n_md / n_cells])

        ids = self._join_and_pad(md_encoding['input_ids'], code_encoding['input_ids'])
        mask = self._join_and_pad(md_encoding['attention_mask'], code_encoding['attention_mask'])

        # 0 for the first len(codes) + 1 slots, 1 for the remaining slots.
        unused_slots = torch.ones(RANK_COUNT + 1,)
        unused_slots[:len(codes) + 1] = 0
        loss_mask = unused_slots.type(torch.ByteTensor)

        assert len(ids) == self.total_max_len

        fill_ratio = torch.FloatTensor([len(codes) / RANK_COUNT])
        label = torch.FloatTensor([record['pct_rank']])
        relative = torch.FloatTensor([record['relative']])
        total_code_len = torch.FloatTensor([record['total_code_len']])

        return ids, mask, fts, loss_mask, fill_ratio, label, relative, total_code_len


In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# ---------------------------------------------------------------------------
# Inference driver: load the three fine-tuned models, rebuild features for the
# last 1000 training notebooks, predict markdown positions and print the
# Kendall-tau score against the ground-truth orders.
# ---------------------------------------------------------------------------
device = 'cuda'
torch.cuda.empty_cache()
np.random.seed(0)
torch.manual_seed(0)

# Checkpoints are deserialised on the CPU and the model is moved afterwards,
# so loading does not depend on the device the weights were saved from.
model = MarkdownRankModel()
model.load_state_dict(torch.load(CODE_MARK_RANK_PATH, map_location='cpu'))
model = model.to(device)

model_sigmoid = SigMoidModel()
model_sigmoid.load_state_dict(torch.load(SIGMOID_PATH, map_location='cpu'))
model_sigmoid = model_sigmoid.to(device)

model_mark_only = MarkdownOnlyModel()
model_mark_only.load_state_dict(torch.load(MARK_PATH, map_location='cpu'))
model_mark_only = model_mark_only.to(device)

# The last 1000 files of the glob listing serve as the evaluation set.
# NOTE(review): glob order is filesystem dependent, so this subset is not
# guaranteed to be stable across machines -- sort the paths if that matters.
paths_test = list((DATA_DIR / 'train').glob('*.json'))[-1000:]
notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]

df = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)

df.reset_index(inplace=True)

# Ground-truth cell order per notebook id, as a Series of lists.
# `read_csv(squeeze=True)` was removed in pandas 2.0; squeezing the
# single-column frame afterwards is the documented replacement.
df_orders = pd.read_csv(
    DATA_DIR / 'train_orders.csv',
    index_col='id',
).squeeze('columns').str.split()

# Clean the sources: text normalisation for markdown, truncation for code.
df.loc[df['cell_type'] == 'markdown', 'source'] = df[df['cell_type']
                                                     == 'markdown'].source.apply(preprocess_text)

df.loc[df['cell_type'] == 'code', 'source'] = df[df['cell_type']
                                                 == 'code'].source.apply(preprocess_code)

dict_cellid_source = dict(
    zip(df['cell_id'].values, df['source'].values))

# `rank` is the running position of a cell among cells of the same type in
# its notebook; `pct_rank` divides it by the notebook's total cell count.
df["rank"] = df.groupby(["id", "cell_type"]).cumcount()
df = df.sort_values('rank').reset_index(drop=True)
df["pct_rank"] = df["rank"] / \
    df.groupby("id")["cell_id"].transform("count")

# Any mode other than 'classification'/'sigmoid' keeps every
# (markdown, code-chunk) pair, so `val_fts` lines up with `relative` below.
val_fts, _, _ = get_features_rank(df, 'test')
val_fts_only = get_features_mark(df, 'test')

val_ds = SigMoidDataset(dict_cellid_source, md_max_len=MD_MAX_LEN,
                        total_max_len=TOTAL_MAX_LEN, fts=val_fts)
val_ds_only = MarkdownOnlyDataset(val_fts_only, dict_cellid_source, MAX_LEN)

val_loader = DataLoader(val_ds, batch_size=BS * 8, shuffle=False, num_workers=NW,
                        pin_memory=False, drop_last=False)
val_loader_only = DataLoader(val_ds_only, batch_size=BS, shuffle=False, num_workers=NW,
                             pin_memory=False, drop_last=False)

# Stage 1: decide which code chunk(s) each markdown cell belongs to.
# validate_sigmoid returns overall accuracy, accuracy on target == 0 and
# accuracy on target == 1, in that order.
acc, acc_zero, acc_one, relative, _, _ = validate_sigmoid(
    model_sigmoid, val_loader, device, 0.397705)
print(acc, acc_zero, acc_one)
# Stage 2: stand-alone markdown score used as a fallback position.
mark_dict = validate_markdown(model_mark_only, val_loader_only, device)

class_fts = []
one_object = {}
mark_id_dict = {}
for i in range(len(relative)):
    if relative[i] == 1:
        class_fts.append(val_fts[i])
        # Toggle membership: a mark stays in `one_object` only when it was
        # selected an odd number of times.
        if val_fts[i]['mark'] not in one_object:
            one_object[val_fts[i]['mark']] = 1
        else:
            del one_object[val_fts[i]['mark']]

# Fallback positions: markdown-only score scaled by a cell count.
# NOTE(review): `total_code` is the size of the current chunk, not the
# notebook's total code count, and later chunks overwrite earlier ones --
# confirm this scaling is intended.
for ft in val_fts:
    if ft['mark'] not in one_object:
        mark_id_dict[ft['mark']] = mark_dict[ft['mark']] * \
            (ft['total_code'] + ft['total_md'])

# Stage 3: rank the markdown cell inside each selected chunk.
val_ds = MarkdownRankNewDataset(dict_cellid_source, md_max_len=MD_MAX_LEN,
                                total_max_len=TOTAL_MAX_LEN, fts=class_fts)
val_loader = DataLoader(val_ds, batch_size=BS * 8, shuffle=False, num_workers=NW,
                        pin_memory=False, drop_last=False)


# From here on `mark_dict` holds the stage-3 positions; `mark_id_dict` is the
# fallback used for markdown cells that stage 3 did not cover.
y_pred, _, acc, mark_dict = validate_rank_inference(model, val_loader, device)
cal_kendall_tau_inference(df, mark_id_dict, mark_dict, df_orders)