In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# (This cell does not list them; `os` is only used below to set an environment variable.)

import os

# Placeholder value, not a real credential: it is set only so that the
# Weights & Biases integration does not emit its missing-key warning.
os.environ["WANDB_API_KEY"] = "0" ## to silence warning

# nlp augmentation: google_trans_new provides the translator used by the
# translate / back_translate helpers defined later in this notebook.
# NOTE(review): the install is unpinned, so the resolved version may change between runs.
!pip install --quiet google_trans_new
from google_trans_new import google_translator  

# NOTE(review): BertTokenizer, TFBertModel and plt are not referenced in the
# cells visible here — presumably used further on or left over; confirm.
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import tensorflow as tf

# Dask bags run the per-sentence translation calls in parallel;
# diagnostics supplies the progress bar shown while they compute.
from dask import bag, diagnostics

In [None]:
# Select the tf.distribute strategy: use a TPU when one can be resolved,
# otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # A ValueError from the TPU setup is treated as "no TPU available".
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count for whichever strategy was selected. This used to
# sit inside the except branch, so it was never printed when a TPU was found.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Read the competition's training split into a DataFrame.
train_csv_path = "../input/contradictory-my-dear-watson/train.csv"
train = pd.read_csv(train_csv_path)

# Show the first rows as the cell output for a quick sanity check.
train.head()

In [None]:
# Number of intermediate languages back_translate() picks for each text
# before translating it back to its original language.
BACKTRSL_DEPTH = 1

# NOTE(review): not referenced by any cell visible here — presumably the number
# of back-translated copies to generate per sample; confirm or remove.
BACKTRSL_TIMES = 1

In [None]:
def back_translate(sequence):
    """Paraphrase ``sequence`` by translating it away from and back to its language.

    The language of ``sequence`` is detected first. If it is one of the
    supported languages, the text is translated through ``BACKTRSL_DEPTH``
    randomly chosen intermediate languages (each hop feeds the next) and
    finally back into the detected language.

    Parameters
    ----------
    sequence : str
        Text to augment.

    Returns
    -------
    str
        The back-translated text, or ``sequence`` unchanged when the detected
        language is not in the supported list or ``BACKTRSL_DEPTH`` is below 1.
    """
    languages = ['en', 'fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar', 'zh-cn', 'hi',
                 'sw', 'vi', 'es', 'el']

    translator = google_translator()

    # The first element of the detection result is used as the language code.
    org_lang = translator.detect(sequence)[0]

    # Nothing to do for unsupported languages or when no hop is configured
    # (the latter previously crashed on an unbound local variable).
    if org_lang not in languages or BACKTRSL_DEPTH < 1:
        return sequence

    # Randomly choose the chain of intermediate languages. Membership must be
    # tested with `not in`; the former `is not` compared a string to the list
    # itself, was always true, and so never excluded already-used languages.
    used_languages = [org_lang]
    for _ in range(BACKTRSL_DEPTH):
        candidates = [lang for lang in languages if lang not in used_languages]
        if not candidates:
            # Every supported language is already in the chain; stop early.
            break
        used_languages.append(np.random.choice(candidates))

    # Walk the chain, feeding each translation into the next hop so that the
    # text really is in `src` when it is translated to `tgt`.
    translated = sequence
    for src, tgt in zip(used_languages, used_languages[1:]):
        translated = translator.translate(translated, lang_tgt = tgt, lang_src = src)

    # Translate back to the original language.
    return translator.translate(translated, lang_tgt = org_lang, lang_src = used_languages[-1])

In [None]:
def translate(sequence, lang = 'vi'):
    """Translate ``sequence`` into the language ``lang``.

    The source language is detected from the text itself and passed to the
    translator together with the requested target language.

    Parameters
    ----------
    sequence : str
        Text to translate.
    lang : str, optional
        Target language code (defaults to ``'vi'``).

    Returns
    -------
    The value returned by the translator's ``translate`` call.
    """
    client = google_translator()

    # The first element of the detection result is used as the source language.
    detected = client.detect(sequence)
    source_lang = detected[0]

    return client.translate(sequence, lang_tgt = lang, lang_src = source_lang)

In [None]:
#applies the translation functions defined above in parallel with Dask
def upsampling_parallel(dataset, language):
    """Return a copy of ``dataset`` with translated premises and hypotheses.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``'premise'`` and ``'hypothesis'`` columns.
    language : str
        ``'backtrsl'`` to run ``back_translate`` on every text; any other
        value is passed to ``translate`` as the target language code.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``'premise'`` and ``'hypothesis'`` columns hold the
        translated texts; all other columns are carried over unchanged. The
        input frame is left untouched, so the same source frame can safely be
        augmented several times.
    """
    if language == 'backtrsl':
        transform = back_translate
    else:
        # Bind the requested target language for every element.
        transform = lambda x: translate(x, lang = language)

    prem_bag = bag.from_sequence(dataset['premise'].tolist()).map(transform)
    hyp_bag = bag.from_sequence(dataset['hypothesis'].tolist()).map(transform)

    with diagnostics.ProgressBar():
        prems = prem_bag.compute()
        hyps = hyp_bag.compute()

    # Write the results into a copy: assigning into `dataset` itself would
    # silently overwrite the caller's original texts.
    augmented = dataset.copy()
    augmented['premise'] = prems
    augmented['hypothesis'] = hyps

    return augmented

In [None]:
# Disabled cell: the code below is wrapped in a bare string literal, so none of
# it executes. Kept for reference — it would translate the training set to
# Bulgarian, relabel the language columns and save the result as a CSV.
"""train_bg = upsampling_parallel(train, "bg")
train_bg[['lang_abv', 'language']] = ['bg', 'Bulgarian']
train_bg.head()

train_bg.to_csv("train_translated_bg.csv", index = False)"""

In [None]:
# Back-translate the training split and save it. The input must be `train`
# (loaded from train.csv earlier in the notebook): `test` is never defined on
# a fresh kernel, so passing it here raised a NameError.
train_backtrsl = upsampling_parallel(train, 'backtrsl')

train_backtrsl.to_csv("train_backtrsl.csv", index = False)

In [None]:
# Disabled cell: the code below is wrapped in a bare string literal, so none of
# it executes — in particular `test` is NOT defined by this cell. Kept for
# reference: it would load the test split, back-translate it three times and
# write each result to its own CSV.
"""test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")

test_backtrsl1 = upsampling_parallel(test, 'backtrsl')
test_backtrsl2 = upsampling_parallel(test, 'backtrsl')
test_backtrsl3 = upsampling_parallel(test, 'backtrsl')

test_backtrsl1.to_csv("test_backtrsl1.csv", index = False)
test_backtrsl2.to_csv("test_backtrsl2.csv", index = False)
test_backtrsl3.to_csv("test_backtrsl3.csv", index = False)"""