In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the CSV paths used later in this
# notebook live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import random

import torch

# Fix the RNG seeds up front: the notebook draws random subsets of SNLI with
# `random`, and without a seed every "Restart & Run All" yields different data.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Release cached, unused GPU memory left over from a previous run in this
# runtime. Harmless when nothing has been allocated yet.
torch.cuda.empty_cache()

# Downloading Hugging Face Transformers

In [None]:
! pip install git+https://github.com/huggingface/transformers.git
!pip install datasets
!pip install transformers torch
!pip install sentencepiece

# Importing required libraries

In [None]:
from datasets import list_datasets, load_dataset, DatasetDict
from collections import Counter
from typing import List, Dict, Union, Callable, Any
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pprint import pprint
import torch

In [None]:
# Prefer the GPU when the runtime exposes one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

cuda


# Flag 1: Downloading the datasets from Hugging Face

In [None]:
# Look up SNLI's hub metadata, then download the dataset itself.
# The hub listing is fetched once. Previously it was fetched twice and the
# index found in the first response was applied to the second, which costs an
# extra network round-trip and picks the wrong entry if the two listings are
# not ordered identically.
dataset_infos = list_datasets(with_details=True)
datasets = [info.id for info in dataset_infos]  # plain dataset ids
snli_dataset = dataset_infos[datasets.index('snli')]  # ValueError if 'snli' is not listed
# pprint(snli_dataset.__dict__)  # uncomment to inspect the hub metadata
snli_ds = load_dataset('snli')  # indexed by split name ('train', 'validation', 'test') below
print(snli_ds)

In [None]:
# Quick look at what was loaded.
# NOTE(review): snli_ds is indexed by split name below, so len() here
# presumably counts splits rather than rows - confirm.
container_len = len(snli_ds)
print(f"👉 Dataset len(dataset): {container_len}")
print("\n👉 First item 'dataset[0]':")
first_train_row = snli_ds['train'][0]
pprint(first_train_row)

In [None]:
def _sample_rows(split, k):
  """Return up to ``k`` distinct rows of ``split``, chosen uniformly at random.

  ``random.choices`` draws *with* replacement, so the same example could land
  in a subset several times. Sampling row indices with ``random.sample``
  guarantees every selected row is unique.

  Args:
    split: Any object supporting ``len()`` and integer indexing.
    k: Requested number of rows; capped at ``len(split)``.

  Returns:
    A list of the selected rows, in the randomly drawn order.
  """
  n_rows = len(split)
  chosen = random.sample(range(n_rows), min(k, n_rows))
  return [split[i] for i in chosen]

train_data = _sample_rows(snli_ds['train'], k=20000)
val_data = _sample_rows(snli_ds['validation'], k=4000)
test_data = _sample_rows(snli_ds['test'], k=4000)

In [None]:
# Sizes of the sampled train / validation / test subsets
# (the test size was previously missing: val_data was reported twice).
len(train_data), len(val_data), len(test_data)

# Flag 2: English -> French translations

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the pretrained "t5-small" tokenizer and model.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
# Place the model on the device selected earlier instead of a hard-coded
# "cuda": this keeps the notebook runnable on CPU-only runtimes and puts the
# model on the same device as the input tensors, which are built on `device`.
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

In [None]:
from torch.utils.data import DataLoader, Dataset

class SNLI_dataset(Dataset):
  """Torch dataset that tokenises premise/hypothesis pairs for a prefixed T5 task.

  Each item is ``(hypothesis_ids, premise_ids, label)``. Both texts get the
  task prefix prepended and are encoded to exactly ``max_length`` token ids
  (padded or truncated); the label is returned exactly as stored in the row.

  Args:
    data: Indexable collection of rows exposing the 'premise', 'hypothesis'
      and 'label' keys.
    tokenizer: Object with an ``encode(text, max_length=..., padding=...,
      truncation=...)`` method returning a list of token ids.
    max_length: Length every encoded sequence is padded/truncated to.
    task_prefix: Text prepended to each sentence before encoding.
    tensor_device: Device the id tensors are created on. ``None`` (default)
      uses the notebook-level ``device`` variable, as before.
  """

  def __init__(self, data, tokenizer, max_length=300,
               task_prefix="translate English to French: ", tensor_device=None):
    self.data = data
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.task_prefix = task_prefix
    # Fall back to the notebook-level `device` only when no device was given.
    self.tensor_device = device if tensor_device is None else tensor_device

  def __len__(self):
    return len(self.data)

  def _encode(self, text):
    """Prefix ``text``, encode it to ``max_length`` ids and return them as a tensor."""
    ids = self.tokenizer.encode(
      self.task_prefix + text,
      max_length=self.max_length,
      padding='max_length',
      truncation=True,
    )
    # NOTE(review): tensors are created directly on the target device inside
    # __getitem__; that works with the single-process DataLoader used in this
    # notebook but is likely to break with num_workers > 0 on CUDA - confirm
    # before enabling workers.
    return torch.tensor(ids, device=self.tensor_device)

  def __getitem__(self, idx):
    row = self.data[idx]
    premise_ids = self._encode(row['premise'])
    hypothesis_ids = self._encode(row['hypothesis'])
    # Order matters: consumers unpack batches as (hypothesis, premise, label).
    return hypothesis_ids, premise_ids, row['label']

# Wrap each sampled split in the tokenising dataset.
train_dataset, val_dataset, test_dataset = (
  SNLI_dataset(split_rows, tokenizer)
  for split_rows in (train_data, val_data, test_data)
)

# Batched loaders. shuffle=False keeps rows in dataset order, so outputs
# collected batch by batch line up with the sampled rows.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=False)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)


In [None]:
# Pull the first batch from the test loader as a smoke test of the
# dataset/collation pipeline; the batch is displayed as the cell output.
next(iter(test_dataloader))

In [None]:
def translation_French_English(loader):
  """Generate and decode model output for every batch yielded by ``loader``.

  Each batch is unpacked as ``(hyp, pre, labels)``. The hypothesis ids and
  then the premise ids are passed to the notebook-level ``model.generate``
  (4 beams, ``max_length=300``, early stopping); every generated sequence is
  decoded with the notebook-level ``tokenizer``, dropping special tokens.
  Labels are collected as Python scalars via ``.item()``.

  NOTE(review): the name says French -> English, but the function itself is
  direction-agnostic. The notebook's SNLI_dataset prepends
  "translate English to French: ", so the decoded text is presumably French -
  confirm before relying on the name.

  Args:
    loader: Iterable of ``(hyp, pre, labels)`` batches.

  Returns:
    Tuple of three lists in loader order: decoded hypotheses, decoded
    premises, labels.
  """
  translated_hyp, translated_pre, collected_labels = [], [], []

  def _generate_and_decode(batch_ids):
    # Beam-search generation followed by per-sequence decoding.
    generated = model.generate(input_ids=batch_ids, max_length=300, num_beams=4, early_stopping=True)
    return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated]

  for hyp_batch, pre_batch, label_batch in loader:
    translated_hyp.extend(_generate_and_decode(hyp_batch))
    translated_pre.extend(_generate_and_decode(pre_batch))
    collected_labels.extend(label_batch[pos].item() for pos in range(len(label_batch)))

  return translated_hyp, translated_pre, collected_labels

In [None]:
# Translate every split (slow: beam-search generation over each batch).
train_hyp, train_pre, train_label = translation_French_English(train_dataloader)
print("train translated")
val_hyp, val_pre, val_label = translation_French_English(val_dataloader)
print("val translated")
test_hyp, test_pre, test_label = translation_French_English(test_dataloader)
print("test translated")
# Show only a short preview: echoing the three complete lists floods the cell
# output and bloats the saved notebook.
train_hyp[:5], train_pre[:5], train_label[:5]

# Converting the translations to .csv files


In [None]:
# Destination CSV files for the three translated splits (on the mounted Drive).
filepath_train, filepath_val, filepath_test = (
  "/content/drive/MyDrive/snli/train.csv",
  "/content/drive/MyDrive/snli/val.csv",
  "/content/drive/MyDrive/snli/test.csv",
)

In [None]:
def _translations_frame(hyp, pre, label):
  """Assemble one split's translations into a DataFrame.

  Building from a dict (instead of ``zip``) makes pandas raise if the three
  lists differ in length, rather than silently truncating to the shortest.

  Args:
    hyp: Translated hypotheses.
    pre: Translated premises.
    label: Labels aligned with ``hyp`` / ``pre``.

  Returns:
    DataFrame with columns 'hypothesis', 'premise', 'label' (in that order).
  """
  return pd.DataFrame({'hypothesis': hyp, 'premise': pre, 'label': label})

# Each split now uses its OWN premises and labels. Previously val_df was built
# with train_label, and test_df with val_pre and train_label, so the saved
# validation/test rows were mismatched.
train_df = _translations_frame(train_hyp, train_pre, train_label)
val_df = _translations_frame(val_hyp, val_pre, val_label)
test_df = _translations_frame(test_hyp, test_pre, test_label)

In [None]:
import os

def _save_csv(frame, path):
  """Write ``frame`` to ``path`` as CSV (no index), creating the parent folder if needed."""
  parent = os.path.dirname(path)
  if parent:  # a bare filename has no parent directory to create
    os.makedirs(parent, exist_ok=True)
  frame.to_csv(path, index=False)

# Persist the three translated splits; the target folder is created on first
# use so the write does not fail on a Drive that lacks it.
_save_csv(train_df, filepath_train)
_save_csv(val_df, filepath_val)
_save_csv(test_df, filepath_test)

In [None]:
# Reload the saved splits from disk, in train / val / test order.
train_ds, val_ds, test_ds = (
  pd.read_csv(csv_path)
  for csv_path in (filepath_train, filepath_val, filepath_test)
)