https://www.kaggle.com/datasets/parthplc/ms-marco-dataset

In [1]:
!pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l- \ | done
[?25hCollecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.11.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore=

In [2]:
import os
import pandas as pd
from tqdm import tqdm
from googletrans import Translator
import pandas as pd
import time
# Turn on tqdm's pandas integration. NOTE(review): none of the visible cells
# call a pandas progress_* method, so this may be unnecessary — confirm.
tqdm.pandas()

# Single googletrans client, created once and used by translate_batch.
translator = Translator()


def translate_batch(texts, src='en', dest='hr', wait_time=5, max_retries=None):
    """Translate every item of ``texts`` with the module-level ``translator``.

    Items are translated one at a time, in input order. When a request fails,
    the error is reported and the same item is tried again after sleeping
    ``wait_time`` seconds.

    Args:
        texts: Iterable of strings to translate. Items that are not strings
            (for example ``None`` or a float NaN standing in for a missing
            cell) and blank strings are returned unchanged without calling
            the translator.
        src: Source language code passed to ``translator.translate``.
        dest: Destination language code passed to ``translator.translate``.
        wait_time: Seconds to sleep before retrying a failed item.
        max_retries: Maximum number of retries per item. ``None`` (the
            default) keeps retrying indefinitely.

    Returns:
        list: One entry per input item, in input order.

    Raises:
        Exception: The last translation error, re-raised once an item has
            used up ``max_retries`` retries (only when ``max_retries`` is
            not ``None``).
    """
    def translate_one(text):
        # There is nothing to translate in a missing or blank value, and
        # sending it to the translator would only fail and be retried forever.
        if not isinstance(text, str) or not text.strip():
            return text

        retries = 0
        while True:
            try:
                return translator.translate(text, src=src, dest=dest).text
            except Exception as err:
                # Give up once the optional retry budget is exhausted.
                if max_retries is not None and retries >= max_retries:
                    raise
                retries += 1
                print(f"Warning: Translation failed ({err!r}). Retry {retries} in {wait_time} seconds...")
                time.sleep(wait_time)

    return [translate_one(t) for t in texts]

# Function to process a column in batches
def _load_cached_batch(batch_file, expected_len):
    """Return the cached translations in ``batch_file``, or ``None`` if unusable.

    A cache file is unusable when it cannot be parsed, has no ``translated``
    column, or does not hold exactly ``expected_len`` rows.
    """
    try:
        # dtype=str keeps numeric-looking text as text. Only an empty cell is
        # read as missing, so translations such as "NA" or "null" survive.
        cached = pd.read_csv(
            batch_file, dtype=str, keep_default_na=False, na_values=['']
        )['translated'].tolist()
    except (KeyError, pd.errors.EmptyDataError, pd.errors.ParserError):
        return None
    return cached if len(cached) == expected_len else None


def translate_column_in_batches(df, col, batch_size=50, output_dir="translations"):
    """Translate ``df[col]`` batch by batch, caching each batch as a CSV file.

    Every batch is stored as ``<output_dir>/<col>_batch_<n>.csv`` with the
    columns ``original`` and ``translated``. A batch whose file already
    exists and holds the expected number of rows is loaded instead of being
    translated again, so an interrupted run can be resumed.

    Args:
        df: DataFrame that contains the column to translate.
        col: Name of the column to translate.
        batch_size: Number of rows per batch; must be at least 1.
        output_dir: Directory for the per-batch cache files. It is created
            if it does not exist.

    Returns:
        list: Translated values, one per row of ``df``, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    os.makedirs(output_dir, exist_ok=True)
    translated_data = []

    # Ceiling division: a trailing partial batch still counts as a batch.
    num_batches = (len(df) + batch_size - 1) // batch_size
    for batch_num in tqdm(range(num_batches), desc=f"Translating {col}"):
        start_idx = batch_num * batch_size
        # iloc clips the slice at the end of the frame for the last batch.
        batch = df[col].iloc[start_idx:start_idx + batch_size].tolist()

        batch_file = os.path.join(output_dir, f"{col}_batch_{batch_num}.csv")

        batch_translated = None
        if os.path.exists(batch_file):
            print(f"Loading batch {batch_num} from file: {batch_file}")
            batch_translated = _load_cached_batch(batch_file, len(batch))
            if batch_translated is None:
                print(f"Warning: cached batch {batch_num} is unusable; translating it again.")

        if batch_translated is None:
            batch_translated = translate_batch(batch)
            # Write to a temporary name and rename afterwards, so that an
            # interrupted run never leaves a half-written cache file behind
            # under the final name.
            tmp_file = batch_file + ".tmp"
            pd.DataFrame({'original': batch, 'translated': batch_translated}).to_csv(tmp_file, index=False)
            os.replace(tmp_file, batch_file)

        translated_data.extend(batch_translated)

    return translated_data

In [3]:
input_path = "/kaggle/input/ms-marco/train.csv"
output_path = "/kaggle/working/ms-marco-translated.csv"
batch_size = 10
# The cache directory name embeds batch_size, so runs with a different batch
# size write their batch files to a different directory.
output_tmp_dir = f"/kaggle/working/translations_{batch_size}"
# Number of leading rows of the input file to translate.
max_rows = 10000

# nrows stops parsing after max_rows rows instead of loading the whole file
# and slicing it afterwards.
df = pd.read_csv(input_path, nrows=max_rows)
df

Unnamed: 0,answers,query,finalpassage
0,"Kids who are bipolar, in their manic stages, v...",why do children get aggressive,"At the same time, despite claiming the review ..."
1,"Equifax, transunion and experian.",which credit bureau is used the most for auto ...,Best Answer: both of those answers are wrong. ...
2,"Women eat at least 1,200 calories daily and me...",what is the minimum healthy calorie intake,Safe Intakes. If you’re not supervised by a me...
3,Because Caffeine increases the stress hormone ...,why is coffee making gain weight,Is coffee making you fat? If you are overweigh...
4,Kent County,"what county is grand rapids, mi in","Located in Grand Rapids, Michigan, the 61st Di..."
...,...,...,...
9995,"Responsible organizers, driven to create and e...",what is a istj,A person with an ISTJ personality is more Intr...
9996,Saturn,which planet is colder saturn or neptune,"Saturn: Due to its distance from the Sun, Satu..."
9997,"A peace treaty between the five nations, Franc...",what were the terms of the treaty of versailles,"The Versailles Treaty, signed on June 28, 1919..."
9998,"The average weather temperature in big creek, ...","average weather temperature in big creek, ca?",Average Weather and Climate in November in Big...


In [4]:
# Columns to translate; each one gets a companion "<name>_cro" column.
columns_to_translate = ['finalpassage', 'query']

for col in columns_to_translate:
    print(f"Processing column: {col}")
    translated_values = translate_column_in_batches(
        df,
        col,
        batch_size=batch_size,
        output_dir=output_tmp_dir,
    )
    df[f'{col}_cro'] = translated_values

# Write the frame, including the new translated columns, without the index.
df.to_csv(output_path, index=False)
print("Translation completed and saved.")


Processing column: finalpassage


Translating finalpassage: 100%|██████████| 1000/1000 [3:29:53<00:00, 12.59s/it]


Processing column: query


Translating query: 100%|██████████| 1000/1000 [3:23:29<00:00, 12.21s/it]


Translation completed and saved.
