In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import random
# import re
# import time
# import pickle
import glob
import json
from typing import Optional, Tuple

# import numpy as np
import pandas as pd
# from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from torch.utils.data import random_split
# import torch.nn.functional as f
from torch.nn import MSELoss
import torch.nn as nn
# from torch.optim import AdamW

import transformers
from transformers import RobertaModel, RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer
# from transformers import TrainingArguments, Trainer
# from transformers import get_scheduler

import datasets
from datasets import load_metric

from sklearn.model_selection import train_test_split, StratifiedKFold, GroupShuffleSplit
# from sklearn.metrics import confusion_matrix, classification_report, f1_score, recall_score, precision_score, accuracy_score
# from sklearn.utils.class_weight import compute_class_weight

print(transformers.__version__)

In [None]:
# Debugging aid: ask CUDA to launch kernels synchronously so that an error is
# reported at the call that caused it.
# NOTE(review): this slows GPU execution — consider removing for timed runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
def set_seeds(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), exports the seed as ``PYTHONHASHSEED`` and puts cuDNN
    into deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    random.seed(seed)
    # NumPy's global RNG must be seeded too; `np` is imported in the first cell.
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN auto-tuning for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seeds()

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

# load dataset

In [None]:
def read_notebook(path: str) -> pd.DataFrame:
    """Load one notebook JSON file into a DataFrame.

    Args:
        path: Path to a notebook ``.json`` file.

    Returns:
        The DataFrame built from the parsed JSON, with an extra ``id`` column
        holding the file name without its extension on every row.
    """
    # Read as UTF-8 explicitly so non-ASCII cell sources do not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as file:
        df = pd.DataFrame(json.load(file))
    df["id"] = os.path.splitext(os.path.basename(path))[0]
    return df

def expand_order(row: Tuple[str, str]) -> pd.DataFrame:
    """Explode one (notebook id, space-separated cell order) pair into rows.

    Args:
        row: Indexable pair whose first item is the notebook id and whose
            second item is a string of cell ids separated by single spaces.

    Returns:
        A DataFrame with one row per cell id and the columns ``id``,
        ``cell_id``, ``rank`` (0-based position in the order string) and
        ``pct_rank`` (``rank`` divided by the number of cells).
    """
    notebook_id = row[0]
    ordered_cells = row[1].split(" ")
    n_cells = len(ordered_cells)

    frame = pd.DataFrame(
        {
            "id": [notebook_id] * n_cells,
            "cell_id": ordered_cells,
            "rank": range(n_cells),
        }
    )
    # Normalise the position to [0, 1) so notebooks of any length are comparable.
    return frame.assign(pct_rank=frame["rank"] / n_cells)

In [None]:
# Path of the pretrained checkpoint; used to load both the tokenizer and the model.
BASE_MODEL = '../input/robertabase/roberta-base'
# Every tokenized input is truncated/padded to this many tokens (see create_encoding).
SEQ_LEN = 512

In [None]:
# Tokenizer matching BASE_MODEL; shared by create_encoding below.
# NOTE(review): do_lower_case presumably has no effect on RoBERTa's BPE
# tokenizer — confirm before relying on lower-casing.
tokenizer = transformers.AutoTokenizer.from_pretrained(BASE_MODEL, do_lower_case=True)

In [None]:
def create_encoding(source):
    """Tokenize ``source`` with the notebook-level ``tokenizer``.

    Special tokens are added and every sequence is truncated or padded to
    ``SEQ_LEN`` tokens; the result is returned as PyTorch tensors.

    Args:
        source: Text (or list of texts) passed straight to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=SEQ_LEN,
        return_tensors='pt',
    )
    return tokenizer(source, **tokenizer_options)

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one text per item, lazily, on access.

    Args:
        texts: Indexable collection of input strings.
        labels: Optional indexable collection of numeric targets aligned with
            ``texts``. When omitted, items carry no ``pct_rank`` entry.
    """

    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoding of ``texts[idx]`` (plus its label, if any)."""
        # The text is wrapped in a list, so create_encoding sees a batch of one.
        item = create_encoding([self.texts[idx]])
        if self.labels is not None:
            # Force float32: a NumPy float64 label would otherwise produce a
            # double tensor that does not match the model's float32 output.
            item['pct_rank'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

In [None]:
# Number of samples per batch for the DataLoader built later in this notebook.
batch_size = 2

In [None]:
# create_encoding("my name is")['input_ids'].shape
# # for k, v in create_encoding("my name is"):
# #     print(k)
# #     print(v)

In [None]:
# for k,v in create_encoding(["my name is", 'nam']).items():
#     print(k)
#     print(v)
#     print(v.shape)

# fine-tune

In [None]:
class RobertaClassificationHead(nn.Module):
    """Two-layer head applied to the first token of each sequence.

    The dropout probability comes from ``config.classifier_dropout`` and falls
    back to ``config.hidden_dropout_prob`` when that is ``None``. The output
    width is ``config.num_labels``.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        dropout_prob = config.classifier_dropout
        if dropout_prob is None:
            dropout_prob = config.hidden_dropout_prob

        # Attribute names and creation order are kept so saved weights still load.
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.out_proj = nn.Linear(hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Map ``features`` (indexed as [batch, token, hidden]) to head outputs."""
        # Position 0 is the <s> token, RoBERTa's counterpart of [CLS].
        first_token = features[:, 0, :]
        hidden = torch.tanh(self.dense(self.dropout(first_token)))
        return self.out_proj(self.dropout(hidden))

In [None]:
class CustomModel(nn.Module):
    """Pretrained encoder followed by a single-output regression head.

    Args:
        model_name: Name or path handed to ``AutoModel.from_pretrained`` and
            ``RobertaConfig.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.num_labels = 1
        self.roberta = AutoModel.from_pretrained(model_name, add_pooling_layer=False)
        self.config = RobertaConfig.from_pretrained(model_name)
        self.config.num_labels = self.num_labels
        self.classifier = RobertaClassificationHead(self.config)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, pct_rank=None):
        """Run the encoder and head; optionally compute the MSE loss.

        Args:
            pct_rank: Optional regression targets, one per sample. When given,
                the mean squared error against the predictions is returned.
            (all other arguments are forwarded unchanged to the encoder)

        Returns:
            ``(loss, logits)`` where ``loss`` is ``None`` if ``pct_rank`` was
            not supplied and ``logits`` is the raw head output.
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if pct_rank is not None:
            # Flatten both sides explicitly. A bare squeeze() would also drop
            # the batch axis for a single-sample batch and make MSELoss
            # broadcast a scalar against a 1-element target.
            predictions = logits.reshape(-1)
            targets = pct_rank.reshape(-1).to(predictions.dtype)
            loss = MSELoss()(predictions, targets)

        return loss, logits

In [None]:
print('Load model')
# Build the network from the base checkpoint and place it on the chosen device.
model = CustomModel(model_name=BASE_MODEL).to(device)

# inference

In [None]:
# Gather every test notebook and stack them into one frame with one row per cell.
test_dir = os.path.join('/', 'kaggle', "input", 'AI4Code', 'test')
paths = glob.glob(os.path.join(test_dir, "*.json"))

notebook_frames = [read_notebook(path) for path in paths]
# The per-notebook index becomes an explicit "cell_id" column.
test_df = pd.concat(notebook_frames).rename_axis("cell_id").reset_index()

# Disabled: positional rank of each cell within its notebook and cell type.
# test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()
# test_df["pct_rank"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)

display(test_df)
# print(len(paths), len(test_df))

In [None]:
# NOTE: a problem seems to occur during tokenization here
# test_encoding = create_encoding(test_df[test_df['cell_type'] == 'markdown']['source'].tolist())
# len(test_encoding['input_ids'])

In [None]:
# test_dataset = CustomDataset(test_encoding, [1]*len(test_encoding['input_ids']))
# Only markdown cells are fed to the model; pull out their sources in frame order.
markdown_sources = test_df.loc[test_df['cell_type'] == 'markdown', 'source'].tolist()
test_dataset = CustomDataset(markdown_sources)
len(test_dataset)

In [None]:
# Sanity check: shape of the token-id tensor produced for the first markdown cell.
test_dataset[0]['input_ids'].shape

In [None]:
# No sampler or shuffle is requested, so batches follow the dataset's own order.
loader_options = dict(
    batch_size=batch_size,
    num_workers=2,
    pin_memory=True,
)
test_dataloader = DataLoader(test_dataset, **loader_options)

In [None]:
# Restore the fine-tuned weights. map_location lets a checkpoint saved on a GPU
# load when `device` is the CPU as well.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
state_dict = torch.load(
    '../input/robertabase20036407541/roberta-base-2-0.0364-0.7541.pt',
    map_location=device,
)
model.load_state_dict(state_dict)


preds = []

model.eval()

for batch in tqdm(test_dataloader):
    # Each dataset item is encoded as a batch of one, so the collated tensors
    # carry an extra axis at position 1; drop it before calling the model.
    batch = {k: v.squeeze(1).to(device) for k, v in batch.items()}

    with torch.no_grad():
        _, logits = model(**batch)

    # Store one plain float per sample rather than one 1-element array.
    preds.extend(logits.reshape(-1).cpu().tolist())

len(preds)

In [None]:
# One score per markdown cell, flattened in case the predictions were collected
# as 1-element arrays.
markdown_mask = test_df["cell_type"] == "markdown"
markdown_scores = np.asarray(preds, dtype=float).reshape(-1)
assert len(markdown_scores) == int(markdown_mask.sum()), \
    "number of predictions must equal number of markdown cells"

# Code cells get no model prediction. Left as NaN they would all be sorted
# after every markdown cell, so give each cell its percentile position within
# its notebook and cell type first, then overwrite the markdown rows.
# NOTE(review): assumes code cells appear in their true relative order in the
# input files — confirm against the dataset description.
test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()
test_df["pct_rank"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
test_df.loc[markdown_mask, "pct_rank"] = markdown_scores

# Order the cells of each notebook by score and join their ids into one string.
test_df = (
    test_df.sort_values("pct_rank")
    .groupby("id")["cell_id"]
    .agg(" ".join)
    .rename("cell_order")
    .reset_index()
)
# test_df

In [None]:
# Write only the two submission columns, without the index.
test_df[['id', 'cell_order']].to_csv('submission.csv', index=False)