# Translating Contradictory, My Dear Watson data using googletrans and Spanish as an intermediate language

This kernel translates and saves all of the non-English hypotheses and premises in the Contradictory, My Dear Watson training and testing data (rows that are already in English are left out of the output). Translating everything, as you can probably guess, takes a very long time, so I thought that I could help some folks out by doing it myself and making the output available.

All non-English premises and hypotheses are first translated to Spanish and then to English. Spanish is used as the intermediate language because it is the most common language that also uses the Latin alphabet. `googletrans` appears to have issues with sentences that mix Latin and non-Latin characters (such as Thai or Simplified Chinese).

In [None]:
pip install googletrans

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import random

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tqdm

import threading

from transformers import BertTokenizer, TFBertModel

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from googletrans import Translator

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def translate_sentences(translator, sentences, src_lang):
    """Translate ``sentences`` to English by way of Spanish.

    Args:
        translator: Object whose ``translate(text, src=..., dest=...)`` call
            returns something with a ``.text`` attribute (a googletrans
            ``Translator`` in this notebook).
        sentences: List of sentences written in ``src_lang``.
        src_lang: Language code of ``sentences``.

    Returns:
        ``sentences`` itself, untouched, when ``src_lang`` is ``'en'``;
        otherwise a new list with each sentence translated from ``src_lang``
        to ``'es'`` and then from ``'es'`` to ``'en'``.
    """
    # English input needs no work: hand the caller's list straight back.
    if src_lang == 'en':
        return sentences

    def run_pass(texts, source, target):
        # One translate call per sentence, wrapped in tqdm for a progress bar.
        translated = []
        for text in tqdm.tqdm(texts):
            translated.append(translator.translate(text, src=source, dest=target).text)
        return translated

    print(f'...translating from {src_lang} to es')
    spanish_sentences = run_pass(sentences, src_lang, 'es')

    print('...translating from es to en')
    return run_pass(spanish_sentences, 'es', 'en')

def translate_data(data_path):
    """Translate the non-English rows of a dataset CSV into English.

    The CSV is read with pandas and must provide ``premise``, ``hypothesis``
    and ``lang_abv`` columns. Rows are grouped by ``lang_abv``; for every
    group other than ``'en'`` both text columns are run through
    ``translate_sentences`` and the translated group is shown with
    ``display``.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        A DataFrame made of the translated copies of every non-English
        language group, concatenated in the order ``groupby`` yields them.
        English rows are not part of the result. When the file contains no
        non-English rows, an empty DataFrame with the input's columns is
        returned (previously ``pd.concat`` raised ``ValueError`` here).
    """
    df = pd.read_csv(data_path)

    # Rows tagged 'zh' are re-tagged 'zh-cn' before being handed to the
    # translator -- presumably the code googletrans expects; TODO confirm.
    df['lang_abv'] = df['lang_abv'].replace(['zh'], 'zh-cn')

    tr_lang_dfs = []
    # Use a distinct loop name so the frame read above is not shadowed.
    for lang, lang_df in df.groupby(by='lang_abv'):
        if lang == 'en':
            continue

        tr_df = lang_df.copy()

        print(f'Translating {lang} dataset')

        # A fresh Translator is created for each column, as in the original.
        tr_df['premise'] = translate_sentences(Translator(), lang_df['premise'].tolist(), lang)
        tr_df['hypothesis'] = translate_sentences(Translator(), lang_df['hypothesis'].tolist(), lang)

        tr_lang_dfs.append(tr_df)

        # NOTE(review): `display` is the notebook built-in; it is not defined
        # when this runs as a plain Python script.
        display(tr_df)

    if not tr_lang_dfs:
        # pd.concat rejects an empty list, so return a zero-row frame that
        # keeps the input's columns instead of raising.
        return df.iloc[:0].copy()

    return pd.concat(tr_lang_dfs)

In [None]:
# Translate the training split and write it out, then do the same for the test split.
translated_train = translate_data('/kaggle/input/contradictory-my-dear-watson/train.csv')
translated_train.to_csv('cmdw_tr-train.csv', index=False)

translated_test = translate_data('/kaggle/input/contradictory-my-dear-watson/test.csv')
translated_test.to_csv('cmdw_tr-test.csv', index=False)