# Data Augmentation with Translation

Using translation to augment the data and increase the size of the training set. Code taken from John Miller's [Augmenting Data with Translation](https://www.kaggle.com/jpmiller/augmenting-data-with-translations) kernel.

XLM-Roberta performs better when provided with translations. Here, Google Translate is used to do the following:
* Translate non-English sentences to English.
* Translate English sentences to a randomly chosen language.

In [None]:
# Install googletrans (quietly, via -q); it provides the Translator client imported below.
!pip -q install googletrans

In [None]:
import os
import gc
import numpy as np
import pandas as pd
import random
from googletrans import Translator
from dask import bag, diagnostics

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Single seed shared by every random number generator the notebook uses.
SEED = 42

# Seed NumPy's global generator and the stdlib one.
np.random.seed(SEED)
random.seed(SEED)

# NOTE: the interpreter reads PYTHONHASHSEED at start-up, so this assignment
# is only visible to processes launched after this point.
os.environ["PYTHONHASHSEED"] = f"{SEED}"

## Basic Exploration

In [None]:
# Load the competition's training split as-is.
train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")

# Load the test split and give every row the sentinel label -1, which the
# later cells use to tell test rows apart from training rows.
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv").assign(label=-1)

In [None]:
# Stack both splits into one frame so they can be plotted side by side.
df = pd.concat([train, test])

# Tag each row with its origin: label -1 is the sentinel for test rows,
# every other label value marks a training row.
df["type"] = np.where(df["label"] != -1, "train", "test")

In [None]:
# Per-language row counts, split by train/test origin.
plt.figure(figsize=(12, 8))
_ = sns.countplot(data=df, x="language", hue="type")
_ = plt.gca().set_title("Language Distribution")

* The majority of the training and test samples are in English; every other language is in the minority.
* The number of samples per language in the training data appears to be similar.
* Languages appear in similar ratios in the training and test datasets.

In [None]:
# Row counts per label in the training split.
plt.figure(figsize=(6, 4))
_ = sns.countplot(data=train, x="label")
_ = plt.gca().set_title("Label Distribution")

In [None]:
# Drop the combined exploration frame and force a garbage-collection pass
# before the translation step.
del df
gc.collect()

## Translation

In [None]:
def translate(words, dest):
    """Translate ``words`` with googletrans and return the translated text.

    Args:
        words: Text to translate.
        dest: Target language code passed to googletrans. Any falsy value
            (e.g. ``None``) means "pick one of the candidate languages below
            at random" using NumPy's global random generator.

    Returns:
        The ``.text`` attribute of the googletrans translation result.
    """
    # Candidate target languages used when the caller does not choose one.
    dest_choices = [
        'zh-cn', 'ar', 'fr', 'sw', 'ur', 'vi', 'ru',
        'hi', 'el', 'th', 'es', 'de', 'tr', 'bg',
    ]
    target = dest if dest else np.random.choice(dest_choices)

    # A new client is built on every call (presumably so that parallel
    # workers never share one -- TODO confirm).
    return Translator().translate(words, dest=target).text


# TODO: use a dask dataframe instead of all this
def trans_parallel(df, dest):
    """Translate the ``premise`` and ``hypothesis`` columns of ``df`` in parallel.

    Every value of both columns is sent through ``translate`` via a dask bag,
    and the translated texts are written back over the original columns.

    Args:
        df: Frame with ``premise`` and ``hypothesis`` columns. It is modified
            in place, so pass a copy if the original must be kept.
        dest: Target language code forwarded to ``translate``. A falsy value
            makes ``translate`` pick a language at random on every call, so
            the premise and hypothesis of one row may end up in different
            languages.

    Returns:
        The same ``df`` object, with both text columns replaced by their
        translations. No other column is touched.
    """
    # Nothing to translate (e.g. a split with no rows for the selected
    # language): skip the dask round-trip and return the frame unchanged.
    if df.empty:
        return df

    premise_bag = bag.from_sequence(df.premise.tolist()).map(translate, dest)
    hypo_bag = bag.from_sequence(df.hypothesis.tolist()).map(translate, dest)
    with diagnostics.ProgressBar():
        premises = premise_bag.compute()
        hypos = hypo_bag.compute()

    # Assign column by column instead of through a list of (premise, hypo)
    # tuples: the result is identical for non-empty input and does not rely
    # on pandas inferring a 2-D shape from the list.
    df['premise'] = premises
    df['hypothesis'] = hypos
    return df

Training Data Augmentation

In [None]:
# English rows are translated into randomly chosen languages (dest=None);
# all other rows are translated into English.
# NOTE(review): trans_parallel only rewrites premise/hypothesis, so the
# lang_abv/language columns of the translated copies still hold the
# source-language values -- confirm this is intended.
eng = train.loc[train.lang_abv == "en"].copy().pipe(trans_parallel, dest=None)
non_eng = train.loc[train.lang_abv != "en"].copy().pipe(trans_parallel, dest='en')

# Append the translated copies to the original rows. DataFrame.append was
# removed in pandas 2.0; pd.concat produces the same stacked frame.
train = pd.concat([train, eng, non_eng])
train.to_csv("train_augmented.csv", index=False)

Testing Data Augmentation

In [None]:
# English rows are translated into randomly chosen languages (dest=None);
# all other rows are translated into English.
# NOTE(review): trans_parallel only rewrites premise/hypothesis, so the
# lang_abv/language columns of the translated copies still hold the
# source-language values -- confirm this is intended.
eng = test.loc[test.lang_abv == "en"].copy().pipe(trans_parallel, dest=None)
non_eng = test.loc[test.lang_abv != "en"].copy().pipe(trans_parallel, dest='en')

# Append the translated copies to the original rows. DataFrame.append was
# removed in pandas 2.0; pd.concat produces the same stacked frame.
test = pd.concat([test, eng, non_eng])
test.to_csv("test_augmented.csv", index=False)

In [None]:
# Report the (rows, columns) shape of both augmented frames.
# The second f-string was missing its closing quote, which is a SyntaxError.
print(f"Augmented Training Data: {train.shape}")
print(f"Augmented Testing Data: {test.shape}")

In [None]:
# Stack the augmented splits into one frame so they can be plotted together.
df = pd.concat([train, test])

# Tag each row with its origin: label -1 is the sentinel for test rows,
# every other label value marks a training row.
df["type"] = np.where(df["label"] != -1, "train", "test")

In [None]:
# Per-language row counts after augmentation, split by train/test origin.
plt.figure(figsize=(12, 8))
_ = sns.countplot(data=df, x="language", hue="type")
_ = plt.gca().set_title("Language Distribution")

Since English sentences were translated into another, randomly chosen language, the number of sentences belonging to each language is quite different from the original setup.

In [None]:
# Row counts per label in the augmented training split.
plt.figure(figsize=(6, 4))
_ = sns.countplot(data=train, x="label")
_ = plt.gca().set_title("Label Distribution")

The label distribution is the same as in the original dataset, as expected.