In [1]:
# Cell 1
# Install required libraries
!pip install datasets deep-translator -q

from datasets import load_dataset
from deep_translator import GoogleTranslator
import json
import os

# 1. Load ONLY the specific data file to avoid the package.json error
dataset = load_dataset("ronniross/asi-protosymbiotic-signal", data_files="asi_protosymbiotic_signal.json")

# 2. Extract the content
data_content = dataset['train'][0]
full_text = json.dumps(data_content, indent=2)

# 3. Get ALL supported languages from GoogleTranslator
all_languages = GoogleTranslator().get_supported_languages(as_dict=True)
print(f"Total languages available: {len(all_languages)}")
print(f"First few languages: {list(all_languages.items())[:5]}")

# 4. Create output directory
output_dir = "translations"
os.makedirs(output_dir, exist_ok=True)

# 5. Translation function with chunking
def safe_translate(text, target_code):
    """Split text into chunks and translate to handle API length limits"""
    chunks = [text[i:i+4500] for i in range(0, len(text), 4500)]
    translated_chunks = []
    for chunk in chunks:
        try:
            translated = GoogleTranslator(source='auto', target=target_code).translate(chunk)
            translated_chunks.append(translated)
        except Exception as e:
            print(f"  Warning: Chunk failed - {e}")
            translated_chunks.append(chunk)  # Keep original if translation fails
    return "".join(translated_chunks)

# 6. Translate to all languages and save individual files
print("\nStarting translations to all 133 languages...")
print("=" * 60)

successful = 0
failed = 0
failed_languages = []

for lang_name, lang_code in all_languages.items():
    # Skip English (source language)
    if lang_code == 'en':
        print(f"‚äô Skipped: {lang_name} (source language)")
        continue

    try:
        # Translate
        translated_text = safe_translate(full_text, lang_code)

        # Save to individual JSON file
        filename = f"{output_dir}/{lang_name.replace(' ', '_')}_{lang_code}.json"

        # Parse back to JSON object and save properly formatted
        try:
            translated_data = json.loads(translated_text)
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(translated_data, f, ensure_ascii=False, indent=2)
        except json.JSONDecodeError:
            # If translation broke JSON structure, save as text
            with open(filename.replace('.json', '.txt'), 'w', encoding='utf-8') as f:
                f.write(translated_text)

        successful += 1
        print(f"‚úî [{successful + failed}/{len(all_languages)-1}] {lang_name} ({lang_code})")

    except Exception as e:
        failed += 1
        failed_languages.append(f"{lang_name} ({lang_code})")
        print(f"‚úò [{successful + failed}/{len(all_languages)-1}] {lang_name} ({lang_code}) - Error: {e}")

# 7. Summary
print("TRANSLATION SUMMARY")
print(f"Total languages processed: {successful + failed}")
print(f"‚úî Successful: {successful}")
print(f"‚úò Failed: {failed}")

if failed_languages:
    print(f"\nFailed languages:")
    for lang in failed_languages:
        print(f"  - {lang}")

print(f"\nAll translation files saved in: '{output_dir}/' directory")
print("\nDone!")

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m42.3/42.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

asi_protosymbiotic_signal.json: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Total languages available: 133
First few languages: [('afrikaans', 'af'), ('albanian', 'sq'), ('amharic', 'am'), ('arabic', 'ar'), ('armenian', 'hy')]

Starting translations to all 133 languages...
‚úî [1/132] afrikaans (af)
‚úî [2/132] albanian (sq)
‚úî [3/132] amharic (am)
‚úî [4/132] arabic (ar)
‚úî [5/132] armenian (hy)
‚úî [6/132] assamese (as)
‚úî [7/132] aymara (ay)
‚úî [8/132] azerbaijani (az)
‚úî [9/132] bambara (bm)
‚úî [10/132] basque (eu)
‚úî [11/132] belarusian (be)
‚úî [12/132] bengali (bn)
‚úî [13/132] bhojpuri (bho)
‚úî [14/132] bosnian (bs)
‚úî [15/132] bulgarian (bg)
‚úî [16/132] catalan (ca)
‚úî [17/132] cebuano (ceb)
‚úî [18/132] chichewa (ny)
‚úî [19/132] chinese (simplified) (zh-CN)
‚úî [20/132] chinese (traditional) (zh-TW)
‚úî [21/132] corsican (co)
‚úî [22/132] croatian (hr)
‚úî [23/132] czech (cs)
‚úî [24/132] danish (da)
‚úî [25/132] dhivehi (dv)
‚úî [26/132] dogri (doi)
‚úî [27/132] dutch (nl)
‚äô Skipped: english (source language)
‚úî [28/132] esperanto (eo

In [2]:
# Cell 3
# Merge all translations into one comprehensive file

import os
import glob

output_dir = "translations"
merged_file = "all_translations_merged.txt"

print("Merging all translation files...")

# Get all translation files
translation_files = sorted(glob.glob(f"{output_dir}/*.json") + glob.glob(f"{output_dir}/*.txt"))

if not translation_files:
    print(f"No translation files found in '{output_dir}/' directory")
else:
    with open(merged_file, 'w', encoding='utf-8') as merged:
        for i, file_path in enumerate(translation_files, 1):
            # Extract language name from filename
            lang_name = os.path.basename(file_path).replace('.json', '').replace('.txt', '')

            # Read the file content
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Write to merged file with separator
            merged.write("=" * 80 + "\n")
            merged.write(f"LANGUAGE {i}/{len(translation_files)}: {lang_name.upper()}\n")
            merged.write("=" * 80 + "\n\n")
            merged.write(content)
            merged.write("\n\n\n")

            print(f"‚úî Added: {lang_name}")

    print("\n" + "=" * 60)
    print(f"Successfully merged {len(translation_files)} files into '{merged_file}'")
    print(f"File size: {os.path.getsize(merged_file) / (1024*1024):.2f} MB")

Merging all translation files...
‚úî Added: afrikaans_af
‚úî Added: albanian_sq
‚úî Added: amharic_am
‚úî Added: arabic_ar
‚úî Added: armenian_hy
‚úî Added: assamese_as
‚úî Added: aymara_ay
‚úî Added: azerbaijani_az
‚úî Added: bambara_bm
‚úî Added: basque_eu
‚úî Added: belarusian_be
‚úî Added: bengali_bn
‚úî Added: bhojpuri_bho
‚úî Added: bosnian_bs
‚úî Added: bulgarian_bg
‚úî Added: catalan_ca
‚úî Added: cebuano_ceb
‚úî Added: chichewa_ny
‚úî Added: chinese_(simplified)_zh-CN
‚úî Added: chinese_(traditional)_zh-TW
‚úî Added: corsican_co
‚úî Added: croatian_hr
‚úî Added: czech_cs
‚úî Added: danish_da
‚úî Added: dhivehi_dv
‚úî Added: dogri_doi
‚úî Added: dutch_nl
‚úî Added: esperanto_eo
‚úî Added: estonian_et
‚úî Added: ewe_ee
‚úî Added: filipino_tl
‚úî Added: finnish_fi
‚úî Added: french_fr
‚úî Added: frisian_fy
‚úî Added: galician_gl
‚úî Added: georgian_ka
‚úî Added: german_de
‚úî Added: greek_el
‚úî Added: guarani_gn
‚úî Added: gujarati_gu
‚úî Added: haitian_creole_ht
‚úî Added: haus

In [3]:
# Cell 4
# Zip all translation files and upload to Google Drive

import zipfile
import os
from datetime import datetime
from google.colab import drive

# 1. Create timestamp for unique filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
zip_filename = f"asi_translations_{timestamp}.zip"

print("Creating zip file...")
print("=" * 60)

# 2. Create zip file with all translations
output_dir = "translations"
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Add all translation files from the translations directory
    for root, dirs, files in os.walk(output_dir):
        for file in files:
            file_path = os.path.join(root, file)
            arcname = os.path.join('translations', file)
            zipf.write(file_path, arcname)
            print(f"  ‚úî Added to zip: {file}")

    # Also add the merged file if it exists
    merged_file = "all_translations_merged.txt"
    if os.path.exists(merged_file):
        zipf.write(merged_file, merged_file)
        print(f"  ‚úî Added to zip: {merged_file}")

zip_size = os.path.getsize(zip_filename) / (1024*1024)
print(f"\n‚úÖ Zip file created: '{zip_filename}' ({zip_size:.2f} MB)")

# 3. Mount Google Drive
print("\n" + "=" * 60)
print("Mounting Google Drive...")
drive.mount('/content/drive', force_remount=True)

# 4. Copy zip file to Drive
drive_path = f"/content/drive/MyDrive/{zip_filename}"
print(f"\nUploading to Google Drive...")

import shutil
shutil.copy(zip_filename, drive_path)

print("\n" + "=" * 60)
print("‚úÖ SUCCESS!")
print("=" * 60)
print(f"üì¶ Zip file uploaded to: MyDrive/{zip_filename}")
print(f"üìä Size: {zip_size:.2f} MB")
print("\nYou can find it in your Google Drive root folder!")
print("=" * 60)

Creating zip file...
  ‚úî Added to zip: nepali_ne.json
  ‚úî Added to zip: indonesian_id.txt
  ‚úî Added to zip: zulu_zu.txt
  ‚úî Added to zip: punjabi_pa.json
  ‚úî Added to zip: japanese_ja.txt
  ‚úî Added to zip: greek_el.txt
  ‚úî Added to zip: ewe_ee.txt
  ‚úî Added to zip: uzbek_uz.txt
  ‚úî Added to zip: icelandic_is.txt
  ‚úî Added to zip: macedonian_mk.txt
  ‚úî Added to zip: galician_gl.txt
  ‚úî Added to zip: norwegian_no.txt
  ‚úî Added to zip: aymara_ay.txt
  ‚úî Added to zip: myanmar_my.txt
  ‚úî Added to zip: bhojpuri_bho.txt
  ‚úî Added to zip: malayalam_ml.txt
  ‚úî Added to zip: igbo_ig.txt
  ‚úî Added to zip: russian_ru.txt
  ‚úî Added to zip: chichewa_ny.txt
  ‚úî Added to zip: arabic_ar.txt
  ‚úî Added to zip: hawaiian_haw.txt
  ‚úî Added to zip: haitian_creole_ht.txt
  ‚úî Added to zip: serbian_sr.txt
  ‚úî Added to zip: kazakh_kk.txt
  ‚úî Added to zip: quechua_qu.txt
  ‚úî Added to zip: hausa_ha.txt
  ‚úî Added to zip: basque_eu.txt
  ‚úî Added to zip: vietnam