# Download and Unzip the datasets, if these steps haven't been done yet. 

In [1]:
import kaggle as k
import zipfile as Zip
import os 

directory_name = 'contradictory-my-dear-watson'
if not directory_name in os.listdir():
    !kaggle competitions download -c contradictory-my-dear-watson
    with Zip.ZipFile('.'.join([directory_name, 'zip']), 'r') as zip_ref:
        zip_ref.extractall(directory_name)
    
if '.'.join([directory_name, 'zip']) in os.listdir():
    os.remove('.'.join([directory_name, 'zip']))
    
assert directory_name in os.listdir()

# Review our datasets and load them into pandas DataFrames

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
# Load every competition file by its explicit name rather than unpacking the
# result of os.listdir(): the listing order is arbitrary, and the folder gains
# a fourth file (trans_train.csv) once the translated training set is saved,
# which would make a three-way unpack raise ValueError on the next run.
files = os.listdir(directory_name)
expected_files = ('sample_submission.csv', 'test.csv', 'train.csv')
missing_files = [name for name in expected_files if name not in files]
assert not missing_files, f'Missing files in {directory_name}: {missing_files}'

sub_sample = pd.read_csv(os.path.join(directory_name, 'sample_submission.csv'))
test = pd.read_csv(os.path.join(directory_name, 'test.csv'))
train = pd.read_csv(os.path.join(directory_name, 'train.csv'))

print(f' Submission Sample has {sub_sample.shape[0]} rows, {sub_sample.shape[1]} columns')
print(f' Test Dataset has {test.shape[0]} rows, {test.shape[1]} columns')
print(f' Train Dataset has {train.shape[0]} rows, {train.shape[1]} columns')

 Submission Sample has 5195 rows, 2 columns
 Test Dataset has 5195 rows, 5 columns
 Train Dataset has 12120 rows, 6 columns


In [4]:
# Print how many distinct values each column holds. A plain loop is used
# because a list comprehension run only for its print() side effects builds a
# throwaway list of None values.
for col in sub_sample.columns:
    print(f'Submission Sample has {len(sub_sample[col].unique())} unique values for {col} column')

Submission Sample has 5195 unique values for id column
Submission Sample has 1 unique values for prediction column


In [5]:
# Print how many distinct values each column holds. A plain loop is used
# because a list comprehension run only for its print() side effects builds a
# throwaway list of None values.
for col in train.columns:
    print(f'Train has {len(train[col].unique())} unique values for {col} column')

Train has 12120 unique values for id column
Train has 8209 unique values for premise column
Train has 12119 unique values for hypothesis column
Train has 15 unique values for lang_abv column
Train has 15 unique values for language column
Train has 3 unique values for label column


In [6]:
# Print how many distinct values each column holds. A plain loop is used
# because a list comprehension run only for its print() side effects builds a
# throwaway list of None values.
for col in test.columns:
    print(f'Test has {len(test[col].unique())} unique values for {col} column')

Test has 5195 unique values for id column
Test has 4336 unique values for premise column
Test has 5195 unique values for hypothesis column
Test has 15 unique values for lang_abv column
Test has 15 unique values for language column


# Train and Test contain text in many languages (15 distinct values of `lang_abv`); to spare the model from having to handle several languages at once, all non-English observations will be translated to English.

# Translating the rows one by one takes a long time (circa 30 minutes per column, per dataset), so we'll run this step in a multiprocessing pool.

In [7]:
%%writefile EngTranslator.py

from time import sleep
from random import randint
from googletrans import Translator, constants
tr = Translator()

def toEng(statement = '', lan = ''):
    """Return `statement` in English.

    Parameters
    ----------
    statement : text to translate.
    lan : language code of `statement`; 'en' means no translation is needed.

    Returns
    -------
    `statement` itself when `lan` is 'en', otherwise the `.text` of the result
    of the module-level translator's `translate(..., dest='en')` call.
    """
    # English input passes straight through without touching the translator.
    if lan == 'en':
        return statement

    # Pause for a random whole number of seconds (1 to 5) before the request;
    # presumably this spreads out calls to the translation service.
    sleep(randint(1, 5))
    translated = tr.translate(statement, dest = 'en')
    return translated.text

Overwriting EngTranslator.py


In [8]:
import multiprocessing
from EngTranslator import toEng

# Pull out the two text columns plus the language code that toEng needs.
data2translate = train.loc[:, ['premise', 'hypothesis', 'lang_abv']].values

if __name__ == '__main__':
    # Use the pool as a context manager so its 50 worker processes are shut
    # down when the block exits; without it the workers are never released.
    # starmap blocks until every result is back, so nothing is lost on exit.
    with multiprocessing.Pool(processes = 50) as pool:
        r_premise = pool.starmap(toEng, tqdm([[p,l] for p, _, l in data2translate]))
        r_hypothesis = pool.starmap(toEng, tqdm([[h,l] for _, h, l in data2translate]))

# starmap returns results in input order, so they line up with the rows.
train['eng_premise'] = r_premise
train['eng_hypothesis'] = r_hypothesis

100%|███████████████████████████████████| 12120/12120 [04:50<00:00, 41.79it/s]
100%|███████████████████████████████████| 12120/12120 [04:41<00:00, 42.98it/s]


In [11]:
# Write the train frame to trans_train.csv inside the dataset folder.
translated_train_path = '/'.join([directory_name, 'trans_train.csv'])
train.to_csv(translated_train_path)

In [10]:
# data2translate = test.loc[:, ['premise', 'hypothesis', 'lang_abv']].values
 
# if __name__ == '__main__':
#     pool = multiprocessing.Pool(processes = 50)
#     r_premise = pool.starmap(toEng, tqdm([[p,l] for p, _, l in data2translate]))
#     r_hypothesis = pool.starmap(toEng, tqdm([[h,l] for _, h, l in data2translate]))
    
# test['eng_premise'] = r_premise
# test['eng_hypothesis'] = r_hypothesis

In [None]:
# from googletrans import Translator, constants
# tr = Translator()
# tr.translate('Hello', dest = 'en').text