In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
# Widen pandas' text display so long rows and long cell sources are truncated less.
pd.options.display.width = 180
pd.options.display.max_colwidth = 120

# data_dir = Path('./input')

In [None]:
import os

# Create the output directory up front; the (commented-out) training loop saves its checkpoint under ./outputs.
os.makedirs("./outputs", exist_ok=True)

In [None]:
import torch

# Hub identifier of the backbone; in the visible cells it is only referenced by the commented-out training setup.
model_name_or_path = 'microsoft/codebert-base'

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
#metric.py
from bisect import bisect

def count_inversions(a):
    """Return the number of pairs in ``a`` that are out of ascending order.

    Elements are inserted one at a time into a sorted list of everything seen
    so far; each new element contributes one inversion for every earlier
    element that is strictly greater than it.
    """
    seen_sorted = []
    total = 0
    for value in a:
        slot = bisect(seen_sorted, value)
        # len(seen_sorted) earlier items exist; `slot` of them are <= value.
        total += len(seen_sorted) - slot
        seen_sorted.insert(slot, value)
    return total


def kendall_tau(ground_truth, predictions):
    """Kendall-tau style ordering score pooled over several sequences.

    Args:
        ground_truth: iterable of lists, each the correct order of items.
        predictions: iterable of lists, each the predicted order of the same
            items, paired positionally with ``ground_truth``.

    Returns:
        ``1 - 4 * total_inversions / sum(n * (n - 1))`` where ``n`` is the
        length of each ground-truth list.

    Raises:
        ValueError: a predicted item does not occur in its ground-truth list.
        ZeroDivisionError: no ground-truth list has two or more items, so
            there is no pair to score.
    """
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        # Map each item to its first position once, instead of calling
        # gt.index() per predicted item (which is quadratic in list length).
        position = {}
        for i, item in enumerate(gt):
            position.setdefault(item, i)
        try:
            ranks = [position[x] for x in pred]  # predicted order expressed as ground-truth ranks
        except KeyError as err:
            # Keep the exception type list.index() raised for unknown items.
            raise ValueError(f"{err.args[0]!r} is not in list") from None
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max


In [None]:
#model.py
from tqdm import tqdm
import sys, os
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch

class MarkdownModel(nn.Module):
    """Pretrained encoder with a single-output linear head.

    The head consumes the encoder's first-token hidden state concatenated
    with the extra feature tensor ``fts`` along dim 1.
    """

    def __init__(self, model_path):
        """Load the backbone from ``model_path`` and build the head.

        Args:
            model_path: name or path accepted by ``AutoModel.from_pretrained``.
        """
        super(MarkdownModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_path)
        # Head input width = encoder hidden size + 1 extra feature column.
        # Derived from the config rather than hard-coded as 769, so backbones
        # whose hidden size is not 768 also work.
        self.top = nn.Linear(self.model.config.hidden_size + 1, 1)

    def forward(self, ids, mask, fts):
        """Return one score per sample.

        Args:
            ids: token ids, passed positionally to the backbone.
            mask: attention mask, passed positionally to the backbone.
            fts: extra features concatenated to the first-token embedding on
                dim 1 (assumed to be (batch, 1) to match the head -- confirm
                against the dataset).
        """
        x = self.model(ids, mask)[0]  # first output of the backbone: per-token hidden states
        x = self.top(torch.cat((x[:, 0, :], fts), 1))
        return x


#dataset.py
from torch.utils.data import DataLoader, Dataset

class MarkdownDataset(Dataset):
    """Dataset yielding one tokenized sample per row of ``df``.

    Each sample is the row's ``source`` text followed by the tokenized code
    snippets stored for the row's notebook in ``fts``, truncated or padded to
    ``total_max_len`` tokens.
    """

    def __init__(self, df, model_name_or_path, total_max_len, md_max_len, fts, code_max_len=23):
        """
        Args:
            df: frame with ``id``, ``source`` and ``pct_rank`` columns.
            model_name_or_path: passed to ``AutoTokenizer.from_pretrained``.
            total_max_len: final length of the ``ids`` / ``mask`` tensors.
            md_max_len: length the ``source`` text is padded/truncated to.
            fts: mapping ``notebook id -> {"codes", "total_md", "total_code"}``.
            code_max_len: length each code snippet is padded/truncated to
                before its final token is dropped (default 23, the value that
                was previously hard-coded).
        """
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len  # maxlen allowed by model config
        self.code_max_len = code_max_len
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.fts = fts

    def __getitem__(self, index):
        """Return ``(ids, mask, fts, target)`` tensors for row ``index``."""
        row = self.df.iloc[index]
        notebook = self.fts[row.id]

        inputs = self.tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.md_max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        code_inputs = self.tokenizer.batch_encode_plus(
            [str(x) for x in notebook["codes"]],
            add_special_tokens=True,
            max_length=self.code_max_len,
            padding="max_length",
            truncation=True
        )

        # Single scalar feature: share of markdown cells in the notebook.
        n_md = notebook["total_md"]
        n_code = notebook["total_code"]
        md_share = 0 if n_md + n_code == 0 else n_md / (n_md + n_code)
        fts = torch.tensor([md_share], dtype=torch.float)

        # Copy so the tokenizer's own output lists are not mutated.
        ids = list(inputs['input_ids'])
        mask = list(inputs['attention_mask'])
        for code_ids in code_inputs['input_ids']:
            ids.extend(code_ids[:-1])  # drop the last token of every snippet
        for code_mask in code_inputs['attention_mask']:
            mask.extend(code_mask[:-1])

        ids = ids[:self.total_max_len]
        ids = ids + [self.tokenizer.pad_token_id] * (self.total_max_len - len(ids))

        mask = mask[:self.total_max_len]
        # Padding positions must be masked out with 0. The previous code padded
        # the mask with pad_token_id, which is non-zero for RoBERTa-style
        # vocabularies (typically 1), so padding was attended to.
        # NOTE(review): checkpoints trained with the old padding saw those
        # positions unmasked; re-validate scores when reusing them.
        mask = mask + [0] * (self.total_max_len - len(mask))

        assert len(ids) == self.total_max_len
        assert len(mask) == self.total_max_len

        return (
            torch.tensor(ids, dtype=torch.long),
            torch.tensor(mask, dtype=torch.long),
            fts,
            torch.tensor([row.pct_rank], dtype=torch.float),
        )

    def __len__(self):
        """Number of rows in the underlying frame."""
        return self.df.shape[0]

In [None]:
# 50k

# train_df_mark = pd.read_csv('../input/ai4code/data/data/train_mark.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
# train_fts = json.load(open('../input/ai4code/data/data/train_fts.json'))
# val_df_mark = pd.read_csv('../input/ai4code/data/data/val_mark.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
# val_fts = json.load(open('../input/ai4code/data/data/val_fts.json'))
# val_df = pd.read_csv('../input/ai4code/data/data/val.csv')

In [None]:
# 20k

# train_df_mark = pd.read_csv('../input/data2/train_mark_2.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
# train_fts = json.load(open('../input/data2/train_fts_2.json'))
# val_df_mark = pd.read_csv('../input/data2/val_mark_2.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
# val_fts = json.load(open('../input/data2/val_fts_2.json'))
# val_df = pd.read_csv('../input/data2/val_2.csv')

In [None]:
# # 1k

# train_df_mark = pd.read_csv('../input/data-1k/data_1k/train_mark.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
# train_fts = json.load(open('../input/data-1k/data_1k/train_fts.json'))
# val_df_mark = pd.read_csv('../input/data-1k/data_1k/val_mark.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
# val_fts = json.load(open('../input/data-1k/data_1k/val_fts.json'))
# val_df = pd.read_csv('../input/data-1k/data_1k/val.csv')

In [None]:
data_dir = Path('../input/AI4Code')

# Raw orders table indexed by notebook id.
order_df = pd.read_csv(data_dir / "train_orders.csv").set_index("id")
# Same file as a Series: notebook id -> list of cell ids in the correct order.
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0,
# so the single remaining column is squeezed explicitly instead.
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
).squeeze("columns").str.split()


In [None]:
# Settings consumed by the commented-out dataset/loader/training cells below.
md_max_len = 64  # passed to MarkdownDataset as md_max_len
total_max_len = 512  # passed to MarkdownDataset as total_max_len
batch_size = 8  # DataLoader batch size
accumulation_steps = 4  # optimizer steps every `accumulation_steps` batches in train()
epochs = 2  # passed to train()
n_workers = 8  # DataLoader num_workers


# train_ds = MarkdownDataset(train_df_mark, model_name_or_path=model_name_or_path, md_max_len=md_max_len,
#                            total_max_len=total_max_len, fts=train_fts)
# val_ds = MarkdownDataset(val_df_mark, model_name_or_path=model_name_or_path, md_max_len=md_max_len,
#                          total_max_len=total_max_len, fts=val_fts)
# train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=n_workers,
#                           pin_memory=False, drop_last=True)
# val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=n_workers,
#                         pin_memory=False, drop_last=False)

In [None]:
def read_data(data):
    """Move one batch onto ``device`` and split it into (inputs, target).

    Every element except the last is treated as a model input; the last
    element is the target.
    """
    model_inputs = tuple(tensor.to(device) for tensor in data[:-1])
    target = data[-1].to(device)
    return model_inputs, target


def validate(model, val_loader):
    """Run ``model`` over ``val_loader`` without gradients.

    Puts the model in eval mode, feeds each batch through ``read_data`` and
    the model under CUDA autocast, and collects flattened outputs.

    Returns:
        ``(labels, preds)`` as two concatenated 1-D numpy arrays.
    """
    model.eval()
    progress = tqdm(val_loader, file=sys.stdout)

    pred_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in progress:
            inputs, target = read_data(batch)

            with torch.cuda.amp.autocast():
                output = model(*inputs)

            pred_chunks.append(output.detach().cpu().numpy().ravel())
            label_chunks.append(target.detach().cpu().numpy().ravel())

    all_labels = np.concatenate(label_chunks)
    all_preds = np.concatenate(pred_chunks)
    return all_labels, all_preds

from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
# def train(model, train_loader, val_loader, epochs):
#     np.random.seed(0)
#     # Creating optimizer and lr schedulers
#     param_optimizer = list(model.named_parameters())
#     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
#     optimizer_grouped_parameters = [
#         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
#     ]

#     num_train_optimization_steps = int(epochs * len(train_loader) / accumulation_steps)
#     optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5,
#                       correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
#     scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
#                                                 num_training_steps=num_train_optimization_steps)  # PyTorch scheduler

#     criterion = torch.nn.L1Loss()
#     scaler = torch.cuda.amp.GradScaler()

#     for e in range(epochs):
#         model.train()
#         tbar = tqdm(train_loader, file=sys.stdout)
#         loss_list = []
#         preds = []
#         labels = []

#         for idx, data in enumerate(tbar):
#             inputs, target = read_data(data)

#             with torch.cuda.amp.autocast():
#                 pred = model(*inputs)
#                 loss = criterion(pred, target)
#             scaler.scale(loss).backward()
#             if idx % accumulation_steps == 0 or idx == len(tbar) - 1:
#                 scaler.step(optimizer)
#                 scaler.update()
#                 optimizer.zero_grad()
#                 scheduler.step()

#             loss_list.append(loss.detach().cpu().item())
#             preds.append(pred.detach().cpu().numpy().ravel())
#             labels.append(target.detach().cpu().numpy().ravel())

#             avg_loss = np.round(np.mean(loss_list), 4)

#             tbar.set_description(f"Epoch {e + 1} Loss: {avg_loss} lr: {scheduler.get_last_lr()}")

#         y_val, y_pred = validate(model, val_loader)
#         val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
#         val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred
#         y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
#         print("Preds score", kendall_tau(df_orders.loc[y_dummy.index], y_dummy))
#         torch.save(model.state_dict(), "./outputs/model_1k_23.bin")

#     return model, y_pred


In [None]:
# model = MarkdownModel(model_name_or_path)
# model = model.to(device)


In [None]:
# model, y_pred = train(model, train_loader, val_loader, epochs=epochs)


# End of training.

In [None]:
def read_notebook(path):
    """Load one notebook JSON file into a DataFrame.

    ``cell_type`` is read as a category and ``source`` as a string; an ``id``
    column holding the file stem is added and the index is named ``cell_id``.
    """
    column_types = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_types)
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')

In [None]:
# Load every test notebook and stack them into one frame ordered by notebook id.
data_dir = Path('../input/AI4Code')
paths_test = list((data_dir / 'test').glob('*.json'))

notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]

# Index becomes (id, cell_id), is sorted by id only (cell order within a
# notebook is left as read), then flattened back into columns.
test_df = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
).reset_index()

# Within each notebook, code cells and markdown cells are each numbered starting from 0.
test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount() # cumcount(): numbers the items of each group starting from 0
# Default prediction: percentile of that rank within the same notebook and cell type.
test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)

In [None]:
# test_df

In [None]:
# Additional code cells

#preprocess.py -11
def clean_code(cell):
    """Stringify ``cell`` and turn literal backslash-n sequences into newlines."""
    text = str(cell)
    return "\n".join(text.split("\\n"))


def sample_cells(cells, n):
    """Pick at most ``n`` cleaned cells, spread evenly over ``cells``.

    When there are no more than ``n`` cells, all of them are returned, each
    cut to its first 200 characters. Otherwise cells are taken at a fixed
    fractional stride (indices rounded with ``np.round``) and are not cut;
    the last picked cell is replaced by the final cell if that one was missed.
    """
    cleaned = [clean_code(cell) for cell in cells]
    total = len(cleaned)
    if total <= n:
        return [text[:200] for text in cleaned]

    stride = total / n
    picked = []
    cursor = 0
    position = int(np.round(cursor))
    while position < total:
        picked.append(cleaned[position])
        cursor += stride
        position = int(np.round(cursor))

    assert cleaned[0] in picked
    if cleaned[-1] not in picked:
        picked[-1] = cleaned[-1]
    return picked


def get_features(df):
    """Build per-notebook features from a frame of cells.

    Returns a dict keyed by notebook ``id``; each value holds the number of
    code cells (``total_code``), the number of markdown cells (``total_md``)
    and up to 20 sampled code sources (``codes``), taken in ``rank`` order.
    """
    ordered = df.sort_values("rank").reset_index(drop=True)
    features = {}
    for notebook_id, cells in tqdm(ordered.groupby("id")):
        markdown_cells = cells[cells.cell_type == "markdown"]
        code_cells = cells[cells.cell_type == "code"]
        sampled = sample_cells(code_cells.source.values, 20)
        features[notebook_id] = {
            "total_code": code_cells.shape[0],
            "total_md": markdown_cells.shape[0],
            "codes": sampled,
        }
    return features

In [None]:
# Per-notebook features (cell counts + sampled code) for the test set; read by predict() via the global name.
test_fts = get_features(test_df)

In [None]:
def predict(model_path, ckpt_path):
    """Score every markdown cell of the global ``test_df`` with a checkpoint.

    Args:
        model_path: backbone/tokenizer location handed to ``MarkdownModel``
            and ``MarkdownDataset``.
        ckpt_path: path of the ``state_dict`` file to load into the model.

    Returns:
        1-D numpy array of predictions, one per markdown row of ``test_df``
        in frame order.

    Side effects:
        Adds/overwrites a ``pct_rank`` column of zeros on the global
        ``test_df`` (the dataset requires a target column).
    """
    BS = 32
    NW = 8
    MD_MAX_LEN = 64
    TOTAL_MAX_LEN = 512

    model = MarkdownModel(model_path)
    # map_location lets a checkpoint saved on GPU load on a CPU-only host;
    # `.to(device)` replaces the unconditional `.cuda()` for the same reason.
    model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model = model.to(device)
    model.eval()

    test_df["pct_rank"] = 0
    markdown_cells = test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True)
    test_ds = MarkdownDataset(markdown_cells, md_max_len=MD_MAX_LEN, total_max_len=TOTAL_MAX_LEN,
                              model_name_or_path=model_path, fts=test_fts)
    test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                             pin_memory=False, drop_last=False)
    # validate() returns (labels, preds); the dummy labels are discarded.
    _, y_test = validate(model, test_loader)
    return y_test

In [None]:
# Backbone/tokenizer directory passed to predict() (presumably an offline copy of codebert-base -- confirm).
model_path = "../input/codebertbase/codebert-base"

# ckpt_path = "../input/ai4codemodelspublic/model.bin"
# Checkpoint whose state_dict predict() loads.
ckpt_path = "../input/outpust-1k-23/model_1k_23.bin"

In [None]:
# !apt install git-lfs

# !git lfs install


In [None]:
# Run inference with the selected checkpoint; one prediction per markdown cell.
y_test_2 = predict(model_path, ckpt_path)

In [None]:
# Averaging two checkpoints is disabled; only the single run above is used.
# y_test = (y_test_1 + y_test_2)/2
y_test = y_test_2

In [None]:
# Overwrite the default rank-based `pred` of markdown rows with the model output; code rows keep theirs.
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test

In [None]:
# Order cells by predicted position and join each notebook's cell ids into one string.
ordered_cells = test_df.sort_values("pred")
cell_order_by_notebook = ordered_cells.groupby("id")["cell_id"].apply(lambda ids: " ".join(ids))
sub_df = cell_order_by_notebook.reset_index().rename(columns={"cell_id": "cell_order"})
sub_df.head()

In [None]:
# Write the submission file (columns: id, cell_order) without the frame index.
sub_df.to_csv("submission.csv", index=False)