You can use translations to augment the training data and get extra power from your model. Variations of this method were used in the [Jigsaw Multilingual Toxic Comment Classification](https://www.kaggle.com/c/jigsaw-multilingual-toxic-comment-classification) and other competitions.

I'll demo the method on this [excellent notebook](https://www.kaggle.com/xhlulu/contradictory-watson-concise-keras-xlm-r-on-tpu), which scores 0.774. I'll translate non-English text to English, and English text to a randomly chosen other language.

In [None]:
!pip install git+https://github.com/ssut/py-googletrans.git

In [None]:
import numpy as np
import pandas as pd
from googletrans import Translator
from dask import bag, diagnostics


Here's the training data with a quick check of the language distribution.

In [None]:
# Load the training set, using the `id` column as the index.
train_csv = '../input/contradictory-my-dear-watson/train.csv'
train = pd.read_csv(train_csv, index_col=['id'])

# Show the frame together with the number of rows per language code.
lang_counts = train.lang_abv.value_counts()
display(train, lang_counts)

Googletrans is fairly fast. Even so, I highly recommend you use multiprocessing of some sort to speed up translation.

In [None]:
def translate(words, dest=None):
    """Translate ``words`` with googletrans and return the translated text.

    Args:
        words: Text to translate.
        dest: Target language code (for example ``'en'``). Any falsy value
            (``None`` or ``''``) selects one of the candidate languages
            below at random.

    Returns:
        The translated string. Empty or whitespace-only strings are returned
        unchanged without contacting the translation service.
    """
    # Candidate target languages used when the caller does not pick one.
    dest_choices = (
        'zh-cn', 'ar', 'fr', 'sw', 'ur', 'vi', 'ru',
        'hi', 'el', 'th', 'es', 'de', 'tr', 'bg',
    )

    # Nothing to translate: skip the network round trip entirely.
    if isinstance(words, str) and not words.strip():
        return words

    if not dest:
        # str() turns the numpy string scalar into a plain Python string.
        dest = str(np.random.choice(dest_choices))

    # A new Translator is created on every call, so no client object is
    # shared between the parallel tasks that map this function.
    translator = Translator()
    return translator.translate(words, dest=dest).text


#TODO: use a dask dataframe instead of all this
def trans_parallel(df, dest):
    """Translate the ``premise`` and ``hypothesis`` columns of ``df`` in parallel.

    Every text is passed through ``translate`` via a dask bag while a
    progress bar is displayed.

    Args:
        df: DataFrame with ``premise`` and ``hypothesis`` columns. It is
            modified in place, so pass a copy to keep the original intact.
        dest: Target language forwarded to ``translate`` for every text.

    Returns:
        The same ``df`` object, with both text columns overwritten by their
        translations.
    """
    # A frame without rows has nothing to translate; skip scheduling work.
    if len(df) == 0:
        return df

    premise_bag = bag.from_sequence(df.premise.tolist()).map(translate, dest)
    hypo_bag = bag.from_sequence(df.hypothesis.tolist()).map(translate, dest)
    with diagnostics.ProgressBar():
        premises = premise_bag.compute()
        hypos = hypo_bag.compute()

    # Assigned positionally: this relies on compute() returning the results
    # in the same order as the input rows, as the original code did.
    df['premise'] = premises
    df['hypothesis'] = hypos
    return df

    
# English rows are translated into a randomly chosen language (dest=None).
eng = (
    train.loc[train.lang_abv == "en"]
    .copy()
    .pipe(trans_parallel, dest=None)
)

# Every other row is translated into English.
non_eng = (
    train.loc[train.lang_abv != "en"]
    .copy()
    .pipe(trans_parallel, dest='en')
)

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
# The index is kept, so each translated row shares the `id` of its source row.
train = pd.concat([train, eng, non_eng])

train.shape

In [None]:
# Load the test set and the submission template, both indexed by `id`.
test_csv = '/kaggle/input/contradictory-my-dear-watson/test.csv'
submission_csv = '/kaggle/input/contradictory-my-dear-watson/sample_submission.csv'

test = pd.read_csv(test_csv, index_col=['id'])
submission = pd.read_csv(submission_csv, index_col=['id'])

Now we build a model, tune it, and predict as in the original notebook. In addition to using the multilingual model, you can translate everything to a single language and use a model for that language.

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
import transformers
from transformers import TFAutoModel, AutoTokenizer

In [None]:
# Detect hardware, return appropriate distribution strategy
# After this cell `strategy` holds either a TPU strategy or TensorFlow's
# default strategy; later cells read `strategy.num_replicas_in_sync` and
# build the model inside `strategy.scope()`.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): newer TensorFlow releases presumably expose this as
    # tf.distribute.TPUStrategy -- confirm against the installed version.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print('Running on TPU ', tpu.master())
except ValueError:
    # Only a ValueError from the TPU setup above triggers the fallback; any
    # other exception propagates.
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# Number of replicas the chosen strategy keeps in sync.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Name of the pretrained checkpoint; used for both the tokenizer and the encoder.
model_name = 'jplu/tf-xlm-roberta-large'
# Number of epochs for the main training run.
n_epochs = 8
# Token length passed to the tokenizer as max_length and used as the model's input width.
max_len = 80

# Our batch size will depend on number of replicas
# (16 examples per replica, multiplied by the replica count of `strategy`).
batch_size = 16 * strategy.num_replicas_in_sync

In [None]:
# First load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert the text so that we can feed it to `batch_encode_plus`
train_text = train[['premise', 'hypothesis']].values.tolist()
test_text = test[['premise', 'hypothesis']].values.tolist()

# Now, we use the tokenizer we loaded to encode the text
# NOTE(review): newer transformers releases presumably prefer
# padding='max_length' / truncation=True over pad_to_max_length -- confirm
# against the installed version before changing.
train_encoded = tokenizer.batch_encode_plus(
    train_text,
    pad_to_max_length=True,
    max_length=max_len
)

test_encoded = tokenizer.batch_encode_plus(
    test_text,
    pad_to_max_length=True,
    max_length=max_len
)

# Split by sample id, not by row. After augmentation a translated copy shares
# the `id` of its source row; a row-wise split would put near-duplicates of
# training examples into the validation set and inflate validation accuracy.
# int32 matches the dtype of the model's input layer.
all_input_ids = np.asarray(train_encoded['input_ids'], dtype=np.int32)
all_labels = train.label.values

unique_ids = train.index.unique().values
fit_ids, _ = train_test_split(unique_ids, test_size=0.2, random_state=2020)
in_fit = train.index.isin(fit_ids)

# Masking keeps the rows in frame order (originals first, translations last),
# so the training rows are shuffled explicitly with a fixed seed.
train_rows = np.random.RandomState(2020).permutation(np.flatnonzero(in_fit))
valid_rows = np.flatnonzero(~in_fit)

x_train, y_train = all_input_ids[train_rows], all_labels[train_rows]
x_valid, y_valid = all_input_ids[valid_rows], all_labels[valid_rows]

x_test = test_encoded['input_ids']

In [None]:
auto = tf.data.experimental.AUTOTUNE

# Training stream: repeats endlessly, so the fit call has to bound each epoch
# with steps_per_epoch. Shuffled through a 2048-element buffer, then batched.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(batch_size).prefetch(auto)

# Validation stream: batched once and cached so later passes reuse the batches.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(batch_size).cache().prefetch(auto)

# Test stream: inputs only, in file order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(batch_size)

In [None]:
# Build and compile the classifier inside the distribution strategy's scope.
with strategy.scope():
    # First load the transformer layer
    transformer_encoder = TFAutoModel.from_pretrained(model_name)

    # This will be the input tokens
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")

    # Now, we encode the text using the transformers we just loaded
    sequence_output = transformer_encoder(input_ids)[0]

    # Only extract the token used for classification, which is <s>
    cls_token = sequence_output[:, 0, :]

    # Finally, pass it through a 3-way softmax, since there's 3 possible labels
    out = Dense(3, activation='softmax')(cls_token)

    # It's time to build and compile the model
    model = Model(inputs=input_ids, outputs=out)
    model.compile(
        # `learning_rate` is the supported keyword; `lr` is a deprecated
        # alias that newer Keras versions no longer accept.
        Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

In [None]:
# train_dataset repeats forever, so one epoch is defined as the number of
# full batches that fit into the training split.
n_steps = len(x_train) // batch_size

# Main training run.
train_history = model.fit(
    x=train_dataset,
    epochs=n_epochs,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
)

In [None]:
# Train the already-fitted model for one more epoch with the same data and
# step count. This rebinds `train_history`, so the history object returned by
# the earlier fit call is no longer reachable through that name.
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=1
)

In [None]:
# Predict class probabilities for the test set.
test_preds = model.predict(test_dataset, verbose=1)

# The predicted label is the index of the highest-probability class.
predicted_labels = np.argmax(test_preds, axis=1)
submission['prediction'] = predicted_labels

# Write the submission file; the `id` index is written as the first column.
submission.to_csv("submission.csv")