In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel
from transformers import RobertaTokenizer
import logging
import sys
from contextlib import contextmanager
import time
import random
import os
import pickle
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found while walking /kaggle/input.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/config.json
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/merges.txt
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/vocab.json
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/tokenizer_config.json
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/pytorch_model.bin
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/special_tokens_map.json
/kaggle/input/roberta-transformers-pytorch/roberta-large-mnli/added_tokens.json
/kaggle/input/roberta-transformers-pytorch/roberta-base/config.json
/kaggle/input/roberta-transformers-pytorch/roberta-base/merges.txt
/kaggle/input/roberta-transformers-pytorch/roberta-base/vocab.json
/kaggle/input/roberta-transformers-pytorch/roberta-base/tokenizer_config.json
/kaggle/input/roberta-transformers-pytorch/roberta-base/pytorch_model.bin
/kaggle/input/roberta-transformers-pytorch/roberta-base/special_tokens_map.json
/kaggle/input/ro

In [2]:
# ===============
# Settings
# ===============
SEED = 0
num_workers = 4   # worker processes for the DataLoaders
BATCH_SIZE = 24   # excerpts per inference batch
max_len = 256     # tokenizer pads / truncates every excerpt to this many tokens
MODEL_PATH = '/kaggle/input/roberta-transformers-pytorch/roberta-base'

# Apply SEED to every RNG the notebook imports so a fresh
# "Restart & Run All" starts from the same random state.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH)
# Fall back to CPU when no GPU is attached to the session.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# ===============
# Functions
# ===============

class CommonLitDataset(Dataset):
    """Dataset that tokenizes one excerpt per item.

    Args:
        excerpt: indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        tokenizer: callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, ...)``; its result must
            provide "input_ids", "attention_mask" and "token_type_ids".
        max_len: value forwarded to the tokenizer as ``max_length``.
        target: optional indexable of targets; when given, each item also
            carries a float32 "target" tensor.
    """

    _ENCODED_KEYS = ("input_ids", "attention_mask", "token_type_ids")

    def __init__(self, excerpt, tokenizer, max_len, target=None):
        self.excerpt, self.tokenizer = excerpt, tokenizer
        self.max_len, self.target = max_len, target

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, item):
        """Return a dict of long tensors (plus "target" when targets exist)."""
        encoded = self.tokenizer(
            str(self.excerpt[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in self._ENCODED_KEYS
        }
        # The target is appended last so the key order matches the labelled case.
        if self.target is not None:
            sample["target"] = torch.tensor(
                self.target[item], dtype=torch.float32)
        return sample


In [4]:
class roberta_model(nn.Module):
    """RoBERTa encoder followed by a small regression head.

    The encoder is loaded from ``MODEL_PATH``. The head maps the 768-wide
    pooled output through dropout -> Linear(768, 256) -> LayerNorm ->
    dropout -> ReLU -> Linear(256, 1).
    """

    def __init__(self):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(MODEL_PATH)
        # Submodule names and creation order are kept as-is so that saved
        # state dicts keep matching this architecture.
        self.drop = nn.Dropout(0.2)
        self.fc = nn.Linear(768, 256)
        self.layernorm = nn.LayerNorm(256)
        self.drop2 = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.out = nn.Linear(256, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(prediction, pooled_embedding)`` for a batch.

        ``ids``, ``mask`` and ``token_type_ids`` are forwarded to the encoder
        as input ids, ``attention_mask`` and ``token_type_ids``.
        """
        # pooler
        encoder_out = self.roberta(
            ids, attention_mask=mask, token_type_ids=token_type_ids)
        emb = encoder_out['pooler_output']
        hidden = self.layernorm(self.fc(self.drop(emb)))
        output = self.out(self.relu(self.drop2(hidden)))
        # The raw pooled embedding is returned too, for use as features.
        return output, emb

In [5]:
# model
# Rebuild the architecture, then restore the fine-tuned weights.
model = roberta_model()
# map_location="cpu" lets a checkpoint that was saved from a GPU session load
# on a CPU-only machine too (`device` falls back to CPU when CUDA is absent);
# the weights are moved to the target device by `model.to(device)` below.
model.load_state_dict(torch.load(
    "/kaggle/input/ex014-model-weight/ex014_2.pth", map_location="cpu"))
model.to(device)
# Inference only: switch dropout layers to evaluation behaviour.
model.eval()
print("loaded model")

loaded model


In [6]:
train = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
y = train["target"]
fold = 2  # not referenced by the cells visible here; kept in case other cells read it
x_train, y_train = train.iloc[:], y.iloc[:]
x_test = test.iloc[:]

train_ = CommonLitDataset(
    x_train["excerpt"].values, tokenizer, max_len, y_train.values.reshape(-1, 1))
test_ = CommonLitDataset(
    x_test["excerpt"].values, tokenizer, max_len)

# loader
# shuffle=False keeps embedding rows aligned with the dataframe rows;
# the worker count comes from the settings cell instead of a literal.
train_loader = DataLoader(
    dataset=train_, batch_size=BATCH_SIZE, shuffle=False, num_workers=num_workers)
test_loader = DataLoader(
    dataset=test_, batch_size=BATCH_SIZE, shuffle=False, num_workers=num_workers)


def extract_embeddings(model, loader, device, emb_dim=768):
    """Run ``model`` over ``loader`` and stack the pooled embeddings.

    Args:
        model: module whose forward takes ``(input_ids, attention_mask,
            token_type_ids)`` and returns ``(prediction, embedding)``. It is
            not switched to eval mode here; the caller is expected to have
            done that already.
        loader: iterable of batches with "input_ids", "attention_mask" and
            "token_type_ids" tensors. Any "target" entry is ignored.
        device: device the batch tensors are moved to before the forward pass.
        emb_dim: embedding width, used only to shape the result when the
            loader yields no batches.

    Returns:
        float64 array of shape (n_samples, embedding_width), rows in loader
        order.
    """
    chunks = []
    with torch.no_grad():
        for batch in loader:
            _, emb = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["token_type_ids"].to(device),
            )
            chunks.append(emb.detach().cpu().numpy())
    if not chunks:
        return np.empty((0, emb_dim), dtype=np.float64)
    # Concatenate once at the end (growing the array per batch is quadratic)
    # and cast to float64, the dtype the downstream cells have always received.
    return np.concatenate(chunks, axis=0).astype(np.float64)


# make embedding
train_emb = extract_embeddings(model, train_loader, device)
test_emb = extract_embeddings(model, test_loader, device)

In [7]:
# Independent copies of the training features and of the targets as an (n, 1) column.
X_train = np.copy(train_emb)
Y_train = np.copy(y_train.values.reshape(-1, 1))

In [8]:
# Gaussian-process regressor with a default RBF kernel, fitted on the embeddings.
kernel = RBF()
gpr = GaussianProcessRegressor(kernel=kernel, random_state=0)
gpr.fit(X_train, Y_train)  # fit() returns the estimator itself, so `gpr` is the fitted model

In [9]:
# Predict the posterior mean for the test embeddings; both names refer to the same array.
X_test = np.copy(test_emb)
Ymean_test = gpr.predict(X_test)
Y_test = Ymean_test

In [10]:
# The regressor was fitted on an (n, 1) target, so depending on the
# scikit-learn version predict() may hand back (n, 1) or (n,). Flatten so the
# submission column is always one-dimensional.
target_pred = np.asarray(Y_test).reshape(-1)
assert len(target_pred) == len(test), "expected exactly one prediction per test row"

df_sub = pd.DataFrame({"id": test["id"], "target": target_pred})
df_sub.to_csv("submission.csv", index=False)