In [1]:
import torch
import pandas as pd
import numpy as np
import sentencepiece as spm
import ast
from datasets import load_dataset
import tarfile
import requests
import zipfile
import os
import re
import pickle
from tqdm import tqdm
import csv
import itertools
from sklearn.model_selection import train_test_split

In [16]:
class NLITRReader(torch.utils.data.Dataset): # from the original NLI-TR repository
    def __init__(self, dataset_name, main_dataset, split_name, max_example_num=-1):
        self.dataset_tr = load_dataset("nli_tr", dataset_name, split=split_name, trust_remote_code=True)
        self.dataset_en = load_dataset(main_dataset, split=split_name)
        self.max_example_num = max_example_num

    def read(self):
        count = 0
        for example_tr, example_en in zip(self.dataset_tr, self.dataset_en):
            if example_tr["label"] == -1:  # skip examples having no gold value.
                continue
            count += 1
            if self.max_example_num > 0 and count >= self.max_example_num:
                break
            yield example_tr, example_en

In [17]:
examples = []

for i in ["train", "validation", "test"]:
    try:
        reader = NLITRReader("snli_tr", "snli", i)
        ex = list(reader.read())
        examples.extend(ex)
    except Exception as e:
        print(f"Failed for {i}: {e}")

In [18]:
len(examples)

569033

In [19]:
ds = load_dataset("canbingol/translate_dataset")
len(ds["train"]) + len(ds["validation"])

Repo card metadata block was not found. Setting CardData to empty.


99999

In [20]:
class NLITRReader2(torch.utils.data.Dataset): # from the original NLI-TR repository
    def __init__(self, main_dataset, split_name, file_path, max_example_num=-1):
        self.dataset_tr = pd.read_json(file_path)
        self.dataset_en = load_dataset(main_dataset, split=split_name)
        self.max_example_num = max_example_num

        self.en_dict = {example['pairID']: example for example in self.dataset_en}

    def read(self):
        for example_tr in self.dataset_tr.itertuples():
            pair_id_tr = example_tr.pairID
            
            if pair_id_tr in self.en_dict:
                example_en = self.en_dict[pair_id_tr]

                yield example_tr.sentence1, example_en["premise"]
                yield example_tr.sentence2, example_en["hypothesis"]

In [21]:
examples2 = []

for j in [("validation_matched", "multinli_tr/multinli_tr_1.1_dev_matched.json"), ("validation_mismatched", "multinli_tr/multinli_tr_1.1_dev_mismatched.json"), ("train", "multinli_tr/multinli_tr_1.1_train.json")]:
    try:
        reader = NLITRReader2("nyu-mll/multi_nli", j[0], j[1])
        ex = list(reader.read())
        examples2.extend(ex)
    except Exception as e:
        print(f"Failed for ({j}): {e}")

In [22]:
len(examples2)

824698

In [23]:
MIN_WORDS = 1

data_label_pairs = []

sentences = {}
with tarfile.open("tatoeba/sentences.tar.bz2", "r:bz2") as tar:
    csv_name = [m.name for m in tar.getmembers() if m.isfile()][0]
    with tar.extractfile(csv_name) as f:
        for line in f:
            parts = line.decode("utf-8").strip().split('\t')
            if len(parts) >= 3 and parts[1] in {"eng", "tur"}:
                sentences[parts[0]] = {"text": parts[2], "lang": parts[1]}



pairs = []
with tarfile.open("tatoeba/links.tar.bz2", "r:bz2") as tar:
    csv_name = [m.name for m in tar.getmembers() if m.isfile()][0]
    with tar.extractfile(csv_name) as f:
        for line in f:
            id1, id2 = line.decode("utf-8").strip().split("\t")
            if id1 in sentences and id2 in sentences:
                s1 = sentences[id1]
                s2 = sentences[id2]
                
                if {s1["lang"], s2["lang"]} == {"eng", "tur"}:
                    words1 = len(s1["text"].split())
                    words2 = len(s2["text"].split())
                    
                    if words1 >= MIN_WORDS:
                        if s1["lang"] == "eng":
                            pairs.append((s1["text"], s2["text"]))
                        else:
                            pairs.append((s2["text"], s1["text"]))

In [24]:
len(pairs)

1424304

In [25]:
url = "https://object.pouta.csc.fi/OPUS-Wikipedia/v1.0/moses/en-tr.txt.zip"
zip_path = "opus-wikipedia/en-tr.txt.zip"

if not os.path.exists(zip_path):
    print("Downloading...")
    response = requests.get(url, stream=True)
    with open(zip_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print("Download complete!")
else:
    print("File already exists - using cached version")

File already exists - using cached version


In [26]:
wiki_pairs = []

with zipfile.ZipFile(zip_path, 'r') as zip_file:
    with zip_file.open("Wikipedia.en-tr.en") as en_file, zip_file.open("Wikipedia.en-tr.tr") as tr_file:
        for en_line, tr_line in zip(en_file, tr_file):
            try:
                en_text = en_line.decode("utf-8").strip()
                tr_text = tr_line.decode("utf-8").strip()

                wiki_pairs.append((en_text, tr_text))
                
            except UnicodeDecodeError:
                continue  # skip malformed lines

len(wiki_pairs)

159979

In [27]:
url = "https://object.pouta.csc.fi/OPUS-MultiHPLT/v2/moses/en-tr.txt.zip"
zip_path = "opus-multihplt/en-tr.txt.zip"

if not os.path.exists(zip_path):
    print("Downloading...")
    response = requests.get(url, stream=True)
    with open(zip_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print("Download complete!")
else:
    print("File already exists - using cached version")

File already exists - using cached version


In [28]:
multihplt_pairs = []

with zipfile.ZipFile(zip_path, 'r') as zip_file:
    with zip_file.open("MultiHPLT.en-tr.en") as en_file, zip_file.open("MultiHPLT.en-tr.tr") as tr_file:
        for en_line, tr_line in zip(en_file, tr_file):
            try:
                en_text = en_line.decode("utf-8").strip()
                tr_text = tr_line.decode("utf-8").strip()

                en_text = re.sub(r'<[^>]+>', '', en_text)  # remove HTML tags
                en_text = re.sub(r'\s+', ' ', en_text) # normalize whitespace
                
                tr_text = re.sub(r'<[^>]+>', '', tr_text)
                tr_text = re.sub(r'\s+', ' ', tr_text)
                
                # Apply length filter
                if len(en_text.split()) >= MIN_WORDS and len(tr_text.split()) >= MIN_WORDS:
                    multihplt_pairs.append((en_text, tr_text))
                    
            except UnicodeDecodeError:
                continue  # Skip malformed lines

len(multihplt_pairs)

21616652

In [29]:
hand_pairs = []

with open("hand_translated/hand_translated.csv", "r", encoding="utf-8") as f:
    for line in f:
        try:
            parts = line.strip().split(",")
            if len(parts) == 2:
                eng = parts[0].strip()
                tur = parts[1].strip()

                hand_pairs.append([eng, tur])

                # Capitalized (first letter of each word)
                eng_cap = " ".join([w[0].upper() + w[1:] if w else "" for w in eng.split()])
                tur_cap = " ".join([w[0].upper() + w[1:] if w else "" for w in tur.split()])

                if eng_cap != eng or tur_cap != tur:
                    hand_pairs.append([eng_cap, tur_cap])
        except Exception as e:
            print("Error reading line:", e)
            continue

len(hand_pairs)

5049

In [31]:
for i in np.random.randint(0, 999, 1):
    print(examples[i])
    print(examples2[i])
    print(ds["train"][i:i+1])
    print(pairs[i])
    print(wiki_pairs[i])
    print(multihplt_pairs[i])
    print(hand_pairs[i])

({'idx': 551, 'premise': 'Beyaz bir bisiklet sokak tabelasına bağlanır.', 'hypothesis': 'bisiklet bir işaretti bağlı', 'label': 0}, {'premise': 'A white bike is tied to a street sign.', 'hypothesis': 'the bike is tied to a sign', 'label': 0})
('Bunu söylediğimi unutabileceğini sanmıyorum.', "'I don't suppose you could forget I ever said that?'")
{'id': [550], 'translation': [{'en': 'Evangeline Lilly is Canadian.', 'tr': 'Evangeline Lilly  Kanadalıdır.'}]}
("I'm undressing.", 'Ben soyunuyorum.')
('Aşağıkurudere is a village in the District of Emirdağ, Afyonkarahisar Province, Turkey.', 'Aşağıkurudere (eski adı Petera), Afyonkarahisar ilinin Emirdağ ilçesine bağlı bir köydür.')
('SHORTSEAPROMOTION CENTER TURKEY ACTION PLAN', 'KISA MESAFE DENİZ TAŞIMACILITANITIM MERKEZİ – TÜRKİYE EYLEM PLANI')
['during', 'sırasında']


In [None]:
data_label_pairs = []

for example_tr, example_en in examples:
    input_text_premise = example_en["premise"] # pair the premises and hypotheses
    label_text_premise = example_tr["premise"]
    data_label_pairs.append((input_text_premise, label_text_premise))
    
    input_text_hypothesis = example_en['hypothesis']
    label_text_hypothesis = example_tr['hypothesis']
    data_label_pairs.append((input_text_hypothesis, label_text_hypothesis))

for example_tr, example_en in examples2:
    data_label_pairs.append((example_en, example_tr))

for split in ["train", "validation"]:    
    for entry in ds["train"]:
        en_tr = entry["translation"]["en"]
        tr_tr = entry["translation"]["tr"]
        data_label_pairs.append((en_tr, tr_tr))

#for en_text, tr_text in pairs:
#    data_label_pairs.append((en_text, tr_text))
data_label_pairs += pairs

#for en_text, tr_text in wiki_pairs:
#    data_label_pairs.append((en_text, tr_text))
data_label_pairs += wiki_pairs
data_label_pairs += multihplt_pairs
data_label_pairs += hand_pairs

In [33]:
df = pd.DataFrame(data_label_pairs, columns=["input_text", "label_text"])
df = df.drop_duplicates().reset_index(drop=True) # remove duplicates since the same premise is repeated multiple times

In [34]:
pd.set_option("display.max_colwidth", None)
print(df.head())
print(len(df))

                                               input_text  \
0  A person on a horse jumps over a broken down airplane.   
1       A person is training his horse for a competition.   
2           A person is at a diner, ordering an omelette.   
3                       A person is outdoors, on a horse.   
4                   Children smiling and waving at camera   

                                             label_text  
0    Attaki bir kişi, bozuk bir uçağın üzerinden atlar.  
1                 Bir kişi atını yarışma için eğitiyor.  
2          Bir kişi bir lokantada omlet sipariş ediyor.  
3                    Bir kişi açık havada, at üzerinde.  
4  Fotoğraf makinesinde gülümseyen ve sallayan çocuklar  
23700223


In [35]:
print(df[-5:])

           input_text   label_text
23700218  Don't Worry  Endişelenme
23700219    worry not  endişelenme
23700220    Worry Not  Endişelenme
23700221      big boy  büyük çocuk
23700222      Big Boy  Büyük Çocuk


In [36]:
df.to_csv("data_sum.csv", index=False)

In [2]:
df = pd.read_csv("data_sum.csv")

In [3]:
df = df.astype(str)
len(df) # 23695835

23700223

In [4]:
df["input_text"].to_csv("english_corpus.txt", index=False, header=False)
df["label_text"].to_csv("turkish_corpus.txt", index=False, header=False) # save the English and Turkish corpus for training the SentencePiece model

In [None]:
# Training the sentencepiece model. We use byte-pair encoding with 50000 tokens for both languages.
# The character coverage is set to 0.9999 to ensure that most characters are included in the vocabulary, while hopefully filtering out some junk.
# These are hyperparameters that can be tuned.

#spm.SentencePieceTrainer.train(input='english_corpus.txt', model_prefix='en_spm', vocab_size=50000, character_coverage=0.9999, model_type="bpe", pad_id=0, unk_id=1, bos_id=2, eos_id=3)
#spm.SentencePieceTrainer.train(input='turkish_corpus.txt', model_prefix='tr_spm', vocab_size=50000, character_coverage=0.9999, model_type="bpe", pad_id=0, unk_id=1, bos_id=2, eos_id=3)

# commenting out so i dont accidentally overwrite the tokenizer models

In [5]:
en_tokenizer = spm.SentencePieceProcessor()
tr_tokenizer = spm.SentencePieceProcessor()

en_tokenizer.load("en_spm.model")
tr_tokenizer.load("tr_spm.model")

True

In [6]:
def process_column_memory_efficient(df, col_name, tokenizer, output_file, chunk_size=1000):
    if os.path.exists(output_file):
        return
    
    total_rows = len(df)
    
    with open(output_file, 'wb') as f:
        for start_idx in tqdm(range(0, total_rows, chunk_size), 
                          desc=f"Processing {col_name}"):
            end_idx = min(start_idx + chunk_size, total_rows)
            chunk = df[col_name].iloc[start_idx:end_idx]
            
            chunk_results = []
            for text in chunk:
                ids = tokenizer.encode(text, out_type=int)
                chunk_results.append([tokenizer.bos_id()] + ids + [tokenizer.eos_id()])
            
            pickle.dump(chunk_results, f)

process_column_memory_efficient(df, "input_text", en_tokenizer, "input_ids.pkl")
process_column_memory_efficient(df, "label_text", tr_tokenizer, "label_ids.pkl")

Processing input_text: 100%|██████████| 23701/23701 [20:43<00:00, 19.06it/s]
Processing label_text: 100%|██████████| 23701/23701 [22:38<00:00, 17.44it/s]


In [7]:
def save_to_readable_csv(input_pkl, label_pkl, output_csv):
    with open(input_pkl, 'rb') as f1, open(label_pkl, 'rb') as f2:
        def input_generator():
            while True:
                try:
                    yield from pickle.load(f1)
                except EOFError:
                    break
                    
        def label_generator():
            while True:
                try:
                    yield from pickle.load(f2)
                except EOFError:
                    break

        with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["input_ids", "label_ids"])
            
            f1.seek(0, 2)
            f2.seek(0, 2)
            total_size = max(f1.tell(), f2.tell())
            f1.seek(0)
            f2.seek(0)
            
            with tqdm(total=total_size, unit='B', unit_scale=True, desc="Writing CSV") as pbar:
                for inp, lbl in zip(input_generator(), label_generator()):
                    writer.writerow([
                        ' '.join(map(str, inp)),
                        ' '.join(map(str, lbl))
                    ])
                    pbar.update(f1.tell() - pbar.n)

save_to_readable_csv("input_ids.pkl", "label_ids.pkl", "tokenized_data.csv")

Writing CSV:  96%|█████████▌| 1.85G/1.93G [06:57<00:18, 4.44MB/s]


In [9]:
df.head()

Unnamed: 0,input_text,label_text
0,A person on a horse jumps over a broken down a...,"Attaki bir kişi, bozuk bir uçağın üzerinden at..."
1,A person is training his horse for a competition.,Bir kişi atını yarışma için eğitiyor.
2,"A person is at a diner, ordering an omelette.",Bir kişi bir lokantada omlet sipariş ediyor.
3,"A person is outdoors, on a horse.","Bir kişi açık havada, at üzerinde."
4,Children smiling and waving at camera,Fotoğraf makinesinde gülümseyen ve sallayan ço...


In [10]:
df = pd.read_csv("tokenized_data.csv")[1000005:1000010]

def str_to_ids(s):
    return [int(x) for x in s.split()]

df["input_ids"] = df["input_ids"].apply(str_to_ids)
df["label_ids"] = df["label_ids"].apply(str_to_ids)

n = 5
df["in"] = df["input_ids"].apply(lambda ids: en_tokenizer.decode(ids))
df["out"] = df["label_ids"].apply(lambda ids: tr_tokenizer.decode(ids))

pd.set_option("display.max_colwidth", None)
print(df[["in", "input_ids", "out", "label_ids"]].head(n))

                                                                                                       in  \
1000005  Designing the wooden terrace was an architectural nightmare because of how high up it had to be.   
1000006                                           The conversation was about repainting her kitchen blue.   
1000007                                Four bronze horses were placed by the basilica over 100 years ago.   
1000008                                                                   I favor the second explanation.   
1000009                                                                  The first explanation is better.   

                                                                                                        input_ids  \
1000005  [2, 36928, 11, 6960, 4082, 297, 122, 11878, 40581, 1350, 35, 677, 550, 320, 149, 1035, 39, 93, 49548, 3]   
1000006                                      [2, 129, 8126, 297, 367, 1130, 171, 585, 1032, 2633, 3467, 49548, 

In [2]:
df = pd.read_csv("tokenized_data.csv")

df_train, df_val = train_test_split(df, test_size=0.02) # more than enough with the number of samples we have.
df_val.to_csv("tokenized_data_val.csv", index=False)
df_train.to_csv("tokenized_data_train.csv", index=False)

In [3]:
print(len(df_val), len(df_train))

474005 23226218


In [2]:
def shuffle_csv(input_path, output_path, seed=42):
    print(f"Reading the CSV from {input_path}...")
    df = pd.read_csv(input_path)

    print("Shuffling...")
    df = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)

    print(f"Saving shuffled CSV to {output_path}...")
    df.to_csv(output_path, index=False)

    print("Done!")

shuffle_csv("tokenized_data_train.csv", "tokenized_data_train.csv")

Reading the CSV from tokenized_data_train.csv...
Shuffling...
Saving shuffled CSV to tokenized_data_train.csv...
Done!


In [3]:
df = pd.read_csv("tokenized_data_train.csv")
len(df)

23226218