# Download and Unzip the datasets, if these steps haven't been done yet. 

In [1]:
import kaggle as k
import zipfile as Zip
import os 

directory = 'contradictory-my-dear-watson'
if not directory in os.listdir():
    !kaggle competitions download -c contradictory-my-dear-watson
    with Zip.ZipFile('.'.join([directory, 'zip']), 'r') as zip_ref:
        zip_ref.extractall(directory)
    
if '.'.join([directory, 'zip']) in os.listdir():
    os.remove('.'.join([directory, 'zip']))
    
assert directory in os.listdir()

# Review our datasets and load them into pandas DataFrames

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

from IPython.display import display

In [3]:
# os.listdir() returns entries in arbitrary order, so the CSV names are sorted
# before unpacking; otherwise the three frames could be bound to the wrong names.
files = os.listdir(directory)
csv_files = sorted(file for file in files if os.path.splitext(file)[1] == '.csv')

# NOTE(review): assumes exactly three CSVs whose sorted order is
# submission sample, test, train -- confirm against the folder contents.
sub_sample, test, train = [pd.read_csv('/'.join([directory, file])) for file in csv_files]

print(f' Submission Sample has {sub_sample.shape[0]} rows, {sub_sample.shape[1]} columns')
print(f' Test Dataset has {test.shape[0]} rows, {test.shape[1]} columns')
print(f' Train Dataset has {train.shape[0]} rows, {train.shape[1]} columns')

 Submission Sample has 5195 rows, 2 columns
 Test Dataset has 5195 rows, 5 columns
 Train Dataset has 12120 rows, 6 columns


In [4]:
# Report how many distinct values each column of the submission sample holds.
for col in sub_sample.columns:
    distinct_values = len(sub_sample[col].unique())
    print(f'Submission Sample has {distinct_values} unique values for {col} column')

Submission Sample has 5195 unique values for id column
Submission Sample has 1 unique values for prediction column


In [5]:
# Report how many distinct values each column of the training set holds.
for col in train.columns:
    print(f'Train has {len(train[col].unique())} unique values for {col} column')

# Iterate the value_counts() Series directly instead of looking counts up by
# column name in the DataFrame: that column is called 'label' on older pandas
# but 'count' on pandas >= 2.0, where the lookup by 'label' raises KeyError.
label_frequencies = train['label'].value_counts()
label_counts = pd.DataFrame(label_frequencies)

for i, n_observations in label_frequencies.items():
    print(f"\tTrain has {n_observations} observations for label {i}")

Train has 12120 unique values for id column
Train has 8209 unique values for premise column
Train has 12119 unique values for hypothesis column
Train has 15 unique values for lang_abv column
Train has 15 unique values for language column
Train has 3 unique values for label column
	Train has 4176 observations for label 0
	Train has 4064 observations for label 2
	Train has 3880 observations for label 1


In [6]:
# Report how many distinct values each column of the test set holds.
for col in test.columns:
    distinct_values = len(test[col].unique())
    print(f'Test has {distinct_values} unique values for {col} column')

Test has 5195 unique values for id column
Test has 4336 unique values for premise column
Test has 5195 unique values for hypothesis column
Test has 15 unique values for lang_abv column
Test has 15 unique values for language column


# Train and Test contain many languages; so that the model does not also have to cope with language differences, all observations will be translated to English.

# Translating the observations one by one takes a long time (circa 30 minutes per column, per dataset), so we'll run this step in a multiprocessing pool.

In [7]:
# Folder for helper modules; created only if the working directory has no entry with this name.
scripts_directory = 'scripts'
scripts_missing = scripts_directory not in os.listdir()
if scripts_missing:
    os.mkdir(scripts_directory)

In [8]:
%%writefile scripts/EngTranslator.py

from time import sleep
from random import randint
from googletrans import Translator, constants
# Module-level googletrans Translator instance, created once when this module is imported.
tr = Translator()

def toEng(statement = 'text to translate', lan = 'language abbreviation'):
    """Return `statement` in English.

    Parameters
    ----------
    statement : text to translate.
    lan : language abbreviation of `statement`.

    Text whose abbreviation is 'en' is handed back untouched. Anything else is
    sent to the module-level translator `tr` after a random pause of 1 to 10
    seconds, and the translated text is returned.
    """
    if lan == 'en':
        return statement

    # Random delay before each request (presumably to space out calls to the
    # translation service -- confirm).
    pause_seconds = randint(1, 10)
    sleep(pause_seconds)

    translation = tr.translate(statement, dest = 'en')
    return translation.text

Overwriting scripts/EngTranslator.py


In [9]:
import multiprocessing
from scripts.EngTranslator import toEng

translation_directory = 'trans_datasets'
tr_train_file = 'trans_train.csv'

# Create the cache folder on first run. The original passed a misspelled name
# (`transaltion_directory`) to os.mkdir, which raised NameError at this point.
if translation_directory not in os.listdir():
    os.mkdir(translation_directory)

if tr_train_file not in os.listdir(translation_directory):
    # No cached translation yet: translate both text columns and cache the result.

    data2translate = train.loc[:, ['premise', 'hypothesis', 'lang_abv']].values

    if __name__ == '__main__':
        # The context manager shuts the worker processes down after both
        # blocking starmap calls have returned, so no workers are left behind.
        with multiprocessing.Pool(processes = 50) as pool:
            r_premise = pool.starmap(toEng, tqdm([[p,l] for p, _, l in data2translate]))
            r_hypothesis = pool.starmap(toEng, tqdm([[h,l] for _, h, l in data2translate]))

    train['eng_premise'] = r_premise
    train['eng_hypothesis'] = r_hypothesis

    train.to_csv('/'.join([translation_directory, tr_train_file]), index = False)

else:
    # Cached translation found: load it and print a per-column summary.
    train = pd.read_csv('/'.join([translation_directory, tr_train_file]))
    for col in train.columns:
        print(f'Train has {len(train[col].unique())} unique values for {col} column')

    # Iterate the value_counts() Series directly: the DataFrame column is named
    # 'label' on older pandas but 'count' on pandas >= 2.0, so looking it up by
    # 'label' raises KeyError there.
    label_frequencies = train['label'].value_counts()
    label_counts = pd.DataFrame(label_frequencies)

    for i, n_observations in label_frequencies.items():
        print(f"\tTrain has {n_observations} observations for label {i}")

Train has 12120 unique values for Unnamed: 0 column
Train has 12120 unique values for id column
Train has 8209 unique values for premise column
Train has 12119 unique values for hypothesis column
Train has 15 unique values for lang_abv column
Train has 15 unique values for language column
Train has 3 unique values for label column
Train has 8146 unique values for eng_premise column
Train has 12119 unique values for eng_hypothesis column
	Train has 4176 observations for label 0
	Train has 4064 observations for label 2
	Train has 3880 observations for label 1


In [10]:
tr_test_file = 'trans_test.csv'

if tr_test_file not in os.listdir(translation_directory):
    # No cached translation yet: translate both text columns and cache the result.

    data2translate = test.loc[:, ['premise', 'hypothesis', 'lang_abv']].values

    if __name__ == '__main__':
        # The context manager shuts the worker processes down after both
        # blocking starmap calls have returned; the original never closed the pool.
        with multiprocessing.Pool(processes = 50) as pool:
            r_premise = pool.starmap(toEng, tqdm([[p,l] for p, _, l in data2translate]))
            r_hypothesis = pool.starmap(toEng, tqdm([[h,l] for _, h, l in data2translate]))

    test['eng_premise'] = r_premise
    test['eng_hypothesis'] = r_hypothesis

    test.to_csv('/'.join([translation_directory, tr_test_file]), index = False)

else:
    # Cached translation found: load it and print a per-column summary.
    test = pd.read_csv('/'.join([translation_directory, tr_test_file]))
    for col in test.columns:
        print(f'Test has {len(test[col].unique())} unique values for {col} column')

Test has 5195 unique values for Unnamed: 0 column
Test has 5195 unique values for id column
Test has 4336 unique values for premise column
Test has 5195 unique values for hypothesis column
Test has 15 unique values for lang_abv column
Test has 15 unique values for language column
Test has 4329 unique values for eng_premise column
Test has 5195 unique values for eng_hypothesis column


In [31]:

cols = ['premise', 'hypothesis', 'eng_premise', 'eng_hypothesis']

# A single loop over both datasets replaces two copy-pasted loops, and joining
# the column directly avoids a separate .loc lookup for every row.
for dataset_name, dataset in (('Train', train), ('Test', test)):
    print('*'*10, dataset_name, '*'*10)
    for col in cols:
        # Concatenate the whole column into one string and split it on whitespace.
        string = ' '.join(dataset[col])
        df = pd.DataFrame(Counter(string.split()).most_common(), columns = ['word', 'count'])
        # Row count = distinct tokens; summed 'count' column = total tokens.
        print(f"{col} has {df.shape[0]} unique words in a {df['count'].sum()} corpus")

********** Train **********
premise has 50291 unique words in a 218041 corpus
hypothesis has 33810 unique words in a 111447 corpus
eng_premise has 25008 unique words in a 225365 corpus
eng_hypothesis has 18220 unique words in a 115934 corpus
********** Test **********
premise has 30007 unique words in a 93621 corpus
hypothesis has 18523 unique words in a 47790 corpus
eng_premise has 17458 unique words in a 96522 corpus
eng_hypothesis has 11254 unique words in a 49687 corpus
