In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn import metrics
from sklearn import preprocessing
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler, LabelEncoder
import math
%matplotlib inline
from glob import glob
import matplotlib.pyplot as plt
import json
from collections import defaultdict
import gc
gc.enable()
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, 
    SequentialSampler, RandomSampler
)
from transformers import AutoConfig
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_linear_schedule_with_warmup
)
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
from IPython.display import clear_output
from tqdm import tqdm, trange
from sklearn import model_selection
def create_folds(data, num_splits, seed=None):
    """Assign every row of ``data`` to one of ``num_splits`` shuffled K-folds.

    A ``kfold`` column is written into ``data`` in place: it is first filled
    with -1 and then each row receives the number of the fold in which it is
    part of the validation split.

    Args:
        data: pandas DataFrame to split; it is modified in place.
        num_splits: number of folds passed to ``KFold(n_splits=...)``.
        seed: ``random_state`` for the shuffled split. When ``None`` the
            module-level ``config_seed`` is used if it exists, otherwise 42.

    Returns:
        The same DataFrame object, now carrying the ``kfold`` column.
    """
    if seed is None:
        # The previous body read a global `config_seed` unconditionally and
        # raised NameError when it was not defined; keep honouring it when
        # present, but fall back to a fixed seed instead of crashing.
        seed = globals().get("config_seed", 42)

    data["kfold"] = -1
    kf = model_selection.KFold(n_splits=num_splits, shuffle=True, random_state=seed)
    kfold_col = data.columns.get_loc("kfold")
    for fold, (_, valid_pos) in enumerate(kf.split(X=data)):
        # KFold.split yields *positional* indices, so write through iloc;
        # label-based .loc is only equivalent for a default RangeIndex.
        data.iloc[valid_pos, kfold_col] = fold
    return data

In [None]:
import random
# Inference settings read by get_embeddings.
config = dict(
    batch_size=130,  # DataLoader batch size (original inline note: 128)
    max_len=256,     # tokenizer max_length used for padding/truncation (original inline note: 256)
)
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: integer seed applied to ``random``, ``numpy.random``, the
            PyTorch CPU generator and every CUDA device generator.

    Side effects:
        Sets the ``PYTHONHASHSEED`` environment variable (hash randomisation
        is fixed at interpreter start-up, so this only influences Python
        processes launched after this call) and switches cuDNN to
        deterministic mode with auto-tuning disabled.
    """
    random.seed(seed)
    # The variable Python actually reads is PYTHONHASHSEED; the earlier
    # spelling 'PYTHONASSEED' was silently ignored.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers all GPUs, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN pick kernels by timing them, which can change
    # from run to run and defeats deterministic=True.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)
class JFDataset(Dataset):
    """Map-style dataset that tokenizes the ``excerpt`` column on demand.

    This is a data container, not a network layer, so it derives from
    ``torch.utils.data.Dataset`` rather than ``nn.Module`` (the previous base
    class was never initialised and none of its features were used).

    Args:
        df: DataFrame providing an ``excerpt`` column with the texts.
        tokenizer: callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``.
        max_len: length every text is padded / truncated to.
    """

    def __init__(self,df,tokenizer,max_len=128):
        # Snapshot the texts as a NumPy array so items are fetched by position.
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self,idx):
        """Return the tokenizer output for the text at position ``idx``."""
        # NOTE(review): with return_tensors='pt' on a single text each tensor
        # presumably carries a leading dimension of 1; get_embeddings
        # flattens it after batching - confirm against the tokenizer in use.
        return self.tokenizer(self.excerpt[idx],
                              return_tensors='pt',
                              max_length=self.max_len,
                              padding='max_length',
                              truncation=True)

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.excerpt)
def get_embeddings(df,path, plot_losses=True, verbose=True):
    """Embed every text in ``df['excerpt']`` with a pretrained transformer.

    The model and tokenizer are loaded from ``path``, the texts are fed
    through the model in order (no shuffling, no dropped batch) and, for each
    text, the first-token vector of the model's first output is kept.

    Args:
        df: DataFrame with an ``excerpt`` column (consumed by ``JFDataset``).
        path: directory or model identifier passed to
            ``AutoModel.from_pretrained`` / ``AutoTokenizer.from_pretrained``.
        plot_losses: unused; retained so existing calls keep working.
        verbose: when true (default) print the selected device and show a
            progress bar.

    Returns:
        2-D ``numpy.ndarray`` with one row per input text, in input order.
        For an empty ``df`` an array with zero rows is returned.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()
    # Needed only for the empty-input result; read it before the model is freed.
    hidden_size = getattr(model.config, "hidden_size", 0)

    ds = JFDataset(df,tokenizer,config['max_len'])
    dl = DataLoader(ds,
                    batch_size = config["batch_size"],
                    shuffle=False,
                    num_workers = 4,
                    # Pinned host memory only helps host-to-GPU copies.
                    pin_memory = (device.type == "cuda"),
                    drop_last = False
                    )

    batches = []
    batch = inputs = hidden = None
    with torch.no_grad():
        for batch in tqdm(dl, total=len(dl), disable=not verbose):
            # Collapse each tensor to (batch, -1) before moving it to the device.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in batch.items()}
            hidden = model(**inputs)[0]
            # Keep the vector at sequence position 0 for every text.
            batches.append(hidden[:,0].detach().cpu().numpy())

    # Release everything that may hold device memory *before* emptying the
    # cache. The earlier `del outputs` raised UnboundLocalError when the
    # loader produced no batch.
    del model, tokenizer, ds, dl
    batch = inputs = hidden = None
    gc.collect()
    torch.cuda.empty_cache()

    if not batches:
        return np.empty((0, hidden_size), dtype=np.float32)
    return np.concatenate(batches, axis=0)

In [None]:
# X = pd.read_csv("../input/tabular-playground-series-may-2022/train.csv")
# X = X.drop("f_27", axis = 1)
# train_f27 = pd.read_pickle("../input/f27-file/f27_X.pkl")
# train_f27 = get_embeddings(train_f27, '../input/myrobertabase')
# train_f27 = pd.DataFrame(train_f27)
# X = pd.concat([X, train_f27], axis = 1)
# X.to_pickle('tps05_X.pkl')
# del X

In [None]:
# Build the test feature table: the tabular columns plus embeddings of f_27.
test = pd.read_csv("../input/tabular-playground-series-may-2022/test.csv")

# Split f_27 off under the column name JFDataset reads ('excerpt').
test_f27 = test[['f_27']].rename(columns={'f_27': 'excerpt'})
test = test.drop(columns="f_27")

# Replace the raw strings by their embedding vectors, one row per test row.
test_f27 = pd.DataFrame(get_embeddings(test_f27, '../input/myrobertabase'))

# Append the embedding columns and persist the result for later use.
test = pd.concat([test, test_f27], axis=1)
test.to_pickle('tps05_to_test.pkl')