# XLM-Roberta Large tokenize dataset

This kernel tokenizes the whole dataset ahead of time and saves the resulting token IDs as `.npy` files, so that training kernels can load them directly instead of re-tokenizing on every run.

Based on [abhishek's](https://www.kaggle.com/abhishek/bert-multi-lingual-tpu-training-8-cores-w-valid) and [xhlulu's](https://www.kaggle.com/xhlulu/jigsaw-tpu-xlm-roberta) kernels.

In [None]:
import os
import torch
import pandas as pd
from scipy import stats
import numpy as np

from tqdm import tqdm
from collections import OrderedDict, namedtuple
import torch.nn as nn
from torch.optim import lr_scheduler
import joblib

import logging
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
import sys
from sklearn import metrics, model_selection

import warnings

warnings.filterwarnings("ignore")

In [None]:
# Tokenizer for the 'xlm-roberta-large' checkpoint; used for every split below.
# NOTE(review): do_lower_case looks like a BERT-style option -- confirm that
# XLMRobertaTokenizer actually honours it before relying on lower-casing.
tokenizer = transformers.XLMRobertaTokenizer.from_pretrained(
    'xlm-roberta-large',
    do_lower_case=True,
)

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Encode a batch of texts and return only their token IDs.

    Args:
        texts: Iterable of strings handed unchanged to
            ``tokenizer.batch_encode_plus``.
        tokenizer: Object exposing ``batch_encode_plus`` (here a Hugging Face
            tokenizer).
        maxlen: Value forwarded as ``max_length``; padding to that length is
            requested via ``pad_to_max_length=True``.

    Returns:
        ``np.array`` built from the ``'input_ids'`` entry of the encoding.
        Presumably of shape ``(len(texts), maxlen)`` when the tokenizer pads
        and truncates every sequence to ``maxlen`` -- TODO confirm truncation
        behaviour for the installed transformers version.
    """
    # Attention masks and token-type ids are switched off because only the
    # input ids are kept.
    # NOTE(review): these keyword names follow the tokenizer API this notebook
    # was written against; verify them if the transformers version changes.
    encode_options = {
        "return_attention_masks": False,
        "return_token_type_ids": False,
        "pad_to_max_length": True,
        "max_length": maxlen,
    }
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    token_ids = encoded['input_ids']
    return np.array(token_ids)

In [None]:
# Validation and test sets are read as-is.
# NOTE(review): unlike the two training files below, no fillna is applied to
# these frames -- confirm they contain no missing comments.
df_valid = pd.read_csv(
    '../input/jigsaw-multilingual-toxic-comment-classification/validation.csv',
    usecols=["comment_text", "toxic"],
)
df_test = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/test.csv")

# Two training sources; only the text and label columns are loaded, and any
# missing value is replaced by the string "none".
df_train1 = pd.read_csv(
    "../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv",
    usecols=["comment_text", "toxic"],
).fillna("none")
df_train2 = pd.read_csv(
    "../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv",
    usecols=["comment_text", "toxic"],
).fillna("none")

# Round the second source's labels and cast them to int
# (presumably they are fractional scores in the raw file -- verify).
df_train2["toxic"] = df_train2["toxic"].round().astype(int)

# Final training set: all of train1, every toxic row of train2, and a
# reproducible (random_state=0) sample of 100000 non-toxic rows of train2.
df_train = pd.concat(
    [
        df_train1[['comment_text', 'toxic']],
        df_train2[['comment_text', 'toxic']].query('toxic==1'),
        df_train2[['comment_text', 'toxic']]
        .query('toxic==0')
        .sample(n=100000, random_state=0),
    ]
)

In [None]:
%%time
# Tokenize the train / validation / test texts in that order, each with
# max_length 192. Note the test set keeps its text in `content`, not
# `comment_text`.
x_train, x_valid, x_test = (
    regular_encode(split_texts, tokenizer, maxlen=192)
    for split_texts in (
        df_train.comment_text.values,
        df_valid.comment_text.values,
        df_test.content.values,
    )
)

In [None]:
# Persist the token-ID arrays for each split (np.save writes NumPy's binary
# .npy format).
np.save(file='x_train', arr=x_train)
np.save(file='x_valid', arr=x_valid)
np.save(file='x_test', arr=x_test)

In [None]:
# Persist the `toxic` labels of the training and validation frames, in the
# same row order as the frames the texts were encoded from.
np.save(file='df_train_toxic', arr=df_train["toxic"].values)
np.save(file='df_valid_toxic', arr=df_valid["toxic"].values)

In [None]:
# Persist the test-set `id` column so saved test rows can be matched back to
# their ids later.
np.save(file='test_df_ids', arr=df_test["id"].values)