In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install the required packages if they are not already installed.
!pip install datasets transformers


Mounted at /content/drive
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fssp

In [None]:
# --- Prerequisites ---
# 1. Ensure you are using a GPU runtime in Colab (recommended for faster detection).
# 2. Ensure your .jsonl files are in the specified folder on Google Drive.
# 3. Set the folder path and the file index range in Step 3 below.

# --- Step 0: Install necessary packages ---
!pip install langdetect transformers torch accelerate datasets sentencepiece -q
print("Required packages installed.")

# --- Step 1: Import required libraries ---
from collections import Counter
import os
import io
from google.colab import drive
import datetime
import torch
import gc
import logging
import json # For manual JSON parsing

# Added for Hugging Face Transformers & Datasets
import transformers
import datasets
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset, load_dataset

# --- Configure Logging ---
# Set the "transformers" logger to WARNING so lower-severity messages are not shown.
logging.getLogger("transformers").setLevel(logging.WARNING)

# Report the library versions used for this run.
print("Libraries imported.")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Datasets version: {datasets.__version__}")

# --- Step 2: Mount Google Drive ---
print("\nMounting Google Drive...")
try:
    # force_remount=True is passed so the mount is redone even if Drive is already mounted.
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted successfully.")
except Exception as e:
    # Report the failure, then re-raise: the input folder configured below lives under /content/drive.
    print(f"Error mounting Google Drive: {e}")
    raise

# --- Step 3: Configuration ---
jsonl_folder_path = '/content/drive/MyDrive/unzipped_longeval_abstracts' # <--- SET FOLDER PATH
# Inclusive range of file indices to process. Each index is zero-padded to six digits and
# placed between filename_prefix and filename_suffix (e.g. documents_000021.jsonl).
start_file_index = 21
end_file_index = 21
filename_prefix = "documents_"
filename_suffix = ".jsonl"

# --- Parameters ---
# Rows handed to the detection function per batched Dataset.map call.
map_batch_size = 128
# batch_size passed to the language-ID pipeline.
lang_id_pipeline_batch_size = 128
# max_length passed to the language-ID pipeline together with truncation=True.
max_length_langid = 512

# --- Step 4: Language Detection Model/Pipeline Setup (Load Once) ---
print("\n--- Setting up Language Detection Model (Loading Once) ---")
start_time_setup_langid = datetime.datetime.now()

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"GPU detected. Using device: {torch.cuda.get_device_name(0)}")
    # Start from a clean slate before the model is loaded.
    torch.cuda.empty_cache()
    gc.collect()
else:
    print("Warning: GPU not detected. Using CPU (might be slow).")

lang_id_model_name = "papluca/xlm-roberta-base-language-detection"
# Pre-declare the three objects so the names exist even if loading fails part-way.
lang_id_pipeline = lang_id_model = lang_id_tokenizer = None
try:
    print(f"Loading LangID model '{lang_id_model_name}'...")
    lang_id_tokenizer = AutoTokenizer.from_pretrained(lang_id_model_name)
    lang_id_model = AutoModelForSequenceClassification.from_pretrained(lang_id_model_name).to(device)
    lang_id_pipeline = pipeline(
        "text-classification",
        model=lang_id_model,
        tokenizer=lang_id_tokenizer,
        # The pipeline is given device index 0 for CUDA and -1 for CPU.
        device=-1 if device.type != 'cuda' else 0,
    )
    print("LangID Pipeline loaded successfully.")
except Exception as e:
    # Nothing downstream can run without the model, so report and abort.
    print(f"Error loading LangID model: {e}")
    raise

end_time_setup_langid = datetime.datetime.now()
print(f"Time taken for LangID setup: {end_time_setup_langid - start_time_setup_langid}")


# --- Step 5: Loop Through Files and Process ---

print(f"\n--- Starting Processing Loop for Files {start_file_index} to {end_file_index} ---")

for i in range(start_file_index, end_file_index + 1):
    # Input files are named <prefix><index zero-padded to 6 digits><suffix>.
    current_filename = f"{filename_prefix}{i:06d}{filename_suffix}"
    current_file_path = os.path.join(jsonl_folder_path, current_filename)

    print(f"\n===== Processing File: {current_filename} =====")

    # --- 5.1: Load Data (Manual Validation for 'abstract', skips None/non-string) ---
    print(f"\n--- Attempting Manual JSONL Validation for 'abstract' in: {current_filename} ---")

    valid_lines_data = [] # Store dicts like {'abstract': '...'}
    error_line_num = -1 # 1-based number of the first malformed JSON line; -1 means none found
    json_error_detail = None
    line_content_on_error = ""
    skipped_null_abstract_count = 0 # null, empty, or whitespace-only abstracts
    skipped_non_string_abstract_count = 0
    skipped_missing_abstract_count = 0 # no 'abstract' key, including JSON values that are not objects
    ds = None

    try:
        if not os.path.exists(current_file_path):
            raise FileNotFoundError(f"File not found at path: {current_file_path}")

        print(f"   Opening file: {current_file_path} with encoding='utf-8'")
        with open(current_file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, start=1):
                stripped_line = line.strip()
                if not stripped_line:
                    continue # Skip empty lines

                try:
                    json_obj = json.loads(stripped_line)
                except json.JSONDecodeError as e:
                    # Still treat JSON errors as fatal for the file
                    error_line_num = line_num
                    json_error_detail = e
                    line_content_on_error = stripped_line[:200] + "..."
                    print(f"   !!! Invalid JSON detected on line {error_line_num}: {e} !!!")
                    print(f"       Content preview: {line_content_on_error}")
                    break # Stop validation on first JSON error

                # --- Handle 'abstract' field ---
                # Valid JSON that is not an object (null, a number, a list, ...) cannot hold an
                # 'abstract' key. Checking the type first keeps such a line from raising a
                # TypeError, which the generic handler below would turn into skipping the
                # whole file. These lines are counted with the missing-key records.
                if not isinstance(json_obj, dict) or 'abstract' not in json_obj:
                    skipped_missing_abstract_count += 1
                    continue

                abstract_value = json_obj['abstract']
                if abstract_value is None:
                    skipped_null_abstract_count += 1
                    continue
                if not isinstance(abstract_value, str):
                    skipped_non_string_abstract_count += 1
                    continue
                if not abstract_value.strip():
                    # Empty or whitespace-only abstracts are counted with the nulls
                    skipped_null_abstract_count += 1
                    continue

                # If abstract is a valid, non-empty string, store it
                valid_lines_data.append({'abstract': abstract_value})

    except FileNotFoundError:
        print(f"!!! File Not Found: {current_file_path}. Skipping. !!!")
        del valid_lines_data[:]
        gc.collect()
        continue
    except Exception as e:
        print(f"!!! Error reading file {current_file_path}: {e}. Skipping. !!!")
        del valid_lines_data[:]
        gc.collect()
        continue

    # --- Print skip counts ---
    print(f"   Skipped {skipped_missing_abstract_count} lines due to missing 'abstract' key.")
    print(f"   Skipped {skipped_null_abstract_count} lines due to null/empty/whitespace 'abstract'.")
    print(f"   Skipped {skipped_non_string_abstract_count} lines due to non-string 'abstract'.")

    # --- Check results of manual validation ---
    if error_line_num != -1:
        print(f"--- Manual validation failed for {current_filename} due to JSON error on line {error_line_num} ({json_error_detail}). Cannot proceed with this file. ---")
        del valid_lines_data[:]
        gc.collect()
        continue

    if not valid_lines_data:
        # Handles case where file ONLY contained invalid abstracts or was empty after skipping
        print(f"--- No valid abstracts found to process in {current_filename} after skipping invalid lines. Skipping file processing. ---")
        gc.collect()
        continue

    # --- Validation found valid abstracts, so create the Dataset object ---
    print(f"--- Manual validation completed for {current_filename}. Found {len(valid_lines_data)} valid abstracts to process. ---")
    start_time_load = datetime.datetime.now()
    try:
        ds = Dataset.from_list(valid_lines_data)
        print(f"   Dataset object created successfully from validated abstracts for '{current_filename}'.")
        if 'abstract' not in ds.column_names:
            raise ValueError("'abstract' column unexpectedly missing after Dataset.from_list")
    except Exception as e:
        print(f"!!! Error creating Dataset object from validated abstracts for '{current_filename}': {e}. Skipping. !!!")
        ds = None # Drop any partially built dataset before moving on
        del valid_lines_data[:]
        gc.collect()
        continue

    end_time_load = datetime.datetime.now()
    # The raw records are no longer needed once they live in the Dataset
    del valid_lines_data[:]
    gc.collect()

    print(ds)
    print(f"Time taken for creating Dataset object for '{current_filename}': {end_time_load - start_time_load}\n")
    # === If we reach here, 'ds' is valid and contains the 'abstract' column ===

    # --- 5.2: Language Detection ---
    print(f"\n--- Running Language Detection on Abstracts for {current_filename} ---")

    def detect_language_batch(batch):
        """Add a 'detected_lang' column to a batch of abstracts.

        Entries of batch["abstract"] that are not strings, or that are blank,
        are labelled "empty" without being sent to the model. The remaining
        texts go through lang_id_pipeline in one call and receive the "label"
        of their prediction; if that call (or reading its output) raises, all
        of them are labelled "error" and the problem is printed.

        Returns the same batch object with the new column attached.
        """
        abstracts = batch["abstract"]
        labels = []
        kept_positions = []
        kept_texts = []
        # Validation upstream should already have removed unusable rows; this is a second check.
        for position, value in enumerate(abstracts):
            if isinstance(value, str) and value.strip() != "":
                labels.append(None)
                kept_positions.append(position)
                kept_texts.append(value)
            else:
                labels.append("empty")

        if kept_texts:
            try:
                outputs = lang_id_pipeline(
                    kept_texts,
                    batch_size=lang_id_pipeline_batch_size,
                    truncation=True,
                    padding=True,
                    max_length=max_length_langid,
                )
                for position, output in zip(kept_positions, outputs):
                    labels[position] = output["label"]
            except Exception as e_pipe:
                print(f"  Error in lang_id_pipeline during batch for {current_filename}: {e_pipe}")
                # A failure invalidates every text that was part of the call.
                for position in kept_positions:
                    labels[position] = "error"

        batch["detected_lang"] = labels
        return batch

    # Run detection over the whole dataset in batches of map_batch_size rows.
    start_time_map_langid = datetime.datetime.now()
    detected_ds = None
    try:
        detected_ds = ds.map(detect_language_batch, batched=True, batch_size=map_batch_size, num_proc=1)
        end_time_map_langid = datetime.datetime.now()
        print(f"Language detection for '{current_filename}' completed in {end_time_map_langid - start_time_map_langid}")
    except Exception as e:
        # A failed map is not fatal for the loop: statistics are skipped and cleanup still runs.
        print(f"!!! Error during language detection map for '{current_filename}': {e}. Skipping statistics for this file. !!!")
        detected_ds = None

    # --- 5.3: Display Language Statistics ---
    print(f"\n--- Language Statistics for 'abstract' field in: {current_filename} ---")
    if detected_ds is not None and 'detected_lang' in detected_ds.column_names:
        try:
            # Count how many rows received each label.
            language_counts = Counter(detected_ds["detected_lang"])
            total_items = len(detected_ds) # This count now only includes lines with valid abstracts
            print(f"Total valid abstracts processed in {current_filename}: {total_items}") # Updated label
            print("--- Language Distribution ---")
            if total_items > 0:
                # Labels are listed from most to least frequent.
                for lang, count in language_counts.most_common():
                    percentage = (count / total_items) * 100
                    print(f"  {lang}: {count} ({percentage:.2f}%)")
            else:
                print("  No valid abstracts were processed.")

            # Summary Stats reflect only the processed items
            en_count = language_counts.get('en', 0)
            # 'empty' is the label detect_language_batch assigns to rows that are None, non-string,
            # or blank; those rows never reach the pipeline. Should be zero if validation worked.
            # The value is used in the subtraction below but is not printed on its own.
            empty_count = language_counts.get('empty', 0)
            error_count = language_counts.get('error', 0)
            non_english_count_detected = total_items - en_count - empty_count - error_count

            print("\n--- Summary (based on processed abstracts) ---")
            print(f"English ('en'): {en_count}")
            print(f"Non-English (excluding empty/error): {non_english_count_detected}")
            print(f"Pipeline Processing Errors ('error'): {error_count}")
            # Optional: Could add the skipped counts here for full context
            # print(f"(Skipped lines: Missing={skipped_missing_abstract_count}, Null/Empty={skipped_null_abstract_count}, Non-String={skipped_non_string_abstract_count})")

        except Exception as e:
            print(f"!!! Error calculating statistics for '{current_filename}': {e} !!!")
    else:
        print(f"Could not calculate statistics for '{current_filename}' - detection may have failed or no valid abstracts were found.")

    # --- 5.4: Clean up memory for the current file ---
    # Drop this file's datasets before the next iteration loads another file.
    print(f"--- Cleaning up memory after processing {current_filename} ---")
    if 'ds' in locals() and ds is not None: del ds
    if 'detected_ds' in locals() and detected_ds is not None: del detected_ds
    gc.collect()

# --- End of Loop ---
print("\n===== Finished Processing All Files =====")


# --- Step 6: Final Cleanup (Model) ---
print("\n--- Final Cleanup (Language Detection Model) ---")
# Drop the references to the pipeline, model and tokenizer, then run the garbage collector.
del lang_id_pipeline
del lang_id_model
del lang_id_tokenizer
gc.collect()
if device.type == 'cuda':
    # Only meaningful when the model was placed on the GPU.
    torch.cuda.empty_cache()
print("Cleanup complete.")

print("\n--- Script Finished ---")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m655.4/981.5 kB[0m [31m19.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m111.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Device set to use cuda:0


LangID Pipeline loaded successfully.
Time taken for LangID setup: 0:00:09.520831

--- Starting Processing Loop for Files 21 to 21 ---

===== Processing File: documents_000021.jsonl =====

--- Attempting Manual JSONL Validation for 'abstract' in: documents_000021.jsonl ---
   Opening file: /content/drive/MyDrive/unzipped_longeval_abstracts/documents_000021.jsonl with encoding='utf-8'
   Skipped 0 lines due to missing 'abstract' key.
   Skipped 2617 lines due to null/empty/whitespace 'abstract'.
   Skipped 0 lines due to non-string 'abstract'.
--- Manual validation completed for documents_000021.jsonl. Found 11648 valid abstracts to process. ---
   Dataset object created successfully from validated abstracts for 'documents_000021.jsonl'.
Dataset({
    features: ['abstract'],
    num_rows: 11648
})
Time taken for creating Dataset object for 'documents_000021.jsonl': 0:00:00.132599


--- Running Language Detection on Abstracts for documents_000021.jsonl ---


Map:   0%|          | 0/11648 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Language detection for 'documents_000021.jsonl' completed in 0:02:45.989820

--- Language Statistics for 'abstract' field in: documents_000021.jsonl ---
Total valid abstracts processed in documents_000021.jsonl: 11648
--- Language Distribution ---
  en: 9607 (82.48%)
  es: 496 (4.26%)
  pt: 369 (3.17%)
  nl: 216 (1.85%)
  ru: 161 (1.38%)
  fr: 148 (1.27%)
  vi: 120 (1.03%)
  sw: 99 (0.85%)
  it: 80 (0.69%)
  ur: 58 (0.50%)
  tr: 55 (0.47%)
  ja: 42 (0.36%)
  de: 36 (0.31%)
  th: 34 (0.29%)
  el: 31 (0.27%)
  zh: 29 (0.25%)
  hi: 19 (0.16%)
  pl: 19 (0.16%)
  ar: 17 (0.15%)
  bg: 12 (0.10%)

--- Summary (based on processed abstracts) ---
English ('en'): 9607
Non-English (excluding empty/error): 2041
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000021.jsonl ---

===== Finished Processing All Files =====

--- Final Cleanup (Language Detection Model) ---
Cleanup complete.

--- Script Finished ---


In [None]:
# --- Prerequisites ---
# 1. Ensure you are using a GPU runtime in Colab.
# 2. Ensure your .jsonl files are already extracted into a folder on Google Drive.
# 3. Set the folder path and specific filename in Step 3 below.

# --- Step 0: Install necessary packages ---
!pip install langdetect transformers torch accelerate datasets sentencepiece -q
print("Required packages installed.")

# --- Step 1: Import required libraries ---
import json
from collections import Counter, defaultdict
import os
import io
from google.colab import drive
# *** ADDED files FOR DOWNLOAD ***
from google.colab import files
import datetime
import torch
import gc # Garbage Collector interface
import logging # For cleaner logs

# Added for Hugging Face Transformers & Datasets
import transformers
import datasets
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset, load_dataset

# --- Configure Logging ---
# Set the "transformers" logger to WARNING so lower-severity messages are not shown.
logging.getLogger("transformers").setLevel(logging.WARNING)

# Report the library versions used for this run.
print("Libraries imported.")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Datasets version: {datasets.__version__}")

# --- Step 2: Mount Google Drive ---
print("\nMounting Google Drive...")
# No try/except here: an exception raised by the mount propagates and stops the cell.
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

# --- Step 3: Configuration ---
# !!! 1. Set this path to the FOLDER containing the extracted .jsonl files !!!
jsonl_folder_path = '/content/drive/MyDrive/unzipped_longeval_abstracts' # <--- SET FOLDER PATH

# !!! 2. Set the EXACT FILENAME of the single .jsonl file you want to process !!!
specific_filename_to_process = "documents_000021.jsonl" # <--- SET EXACT FILENAME

# --- Parameters ---
# map_batch_size: rows handed to each batched Dataset.map call (detection and translation).
# lang_id_pipeline_batch_size: batch_size passed to the language-ID pipeline.
# NOTE(review): detection receives at most map_batch_size (128) texts per call, so the larger
# pipeline batch size of 256 presumably has no additional effect — confirm.
map_batch_size = 128
lang_id_pipeline_batch_size = 256
translation_pipeline_batch_size = 128 # Keep or adjust based on VRAM

# --- Step 4: Language Detection Model/Pipeline Setup ---
print("\n--- Setting up Language Detection ---")
start_time_setup_langid = datetime.datetime.now()

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"GPU detected. Using device: {torch.cuda.get_device_name(0)}")
else:
    print("Warning: GPU not detected. Using CPU.")

lang_id_model_name = "papluca/xlm-roberta-base-language-detection"
print(f"Loading LangID model '{lang_id_model_name}'...")
lang_id_pipeline = None
try:
    lang_id_tokenizer = AutoTokenizer.from_pretrained(lang_id_model_name)
    lang_id_model = AutoModelForSequenceClassification.from_pretrained(lang_id_model_name).to(device)
    lang_id_pipeline = pipeline(
        "text-classification",
        model=lang_id_model,
        tokenizer=lang_id_tokenizer,
        # The pipeline is given device index 0 for CUDA and -1 for CPU.
        device=-1 if device.type != 'cuda' else 0,
    )
    print("LangID Pipeline loaded successfully.")
except Exception as e:
    # Nothing downstream can run without the model, so report and abort.
    print(f"Error loading LangID model: {e}")
    raise

end_time_setup_langid = datetime.datetime.now()
print(f"    Time taken for LangID setup: {end_time_setup_langid - start_time_setup_langid}")

# --- Step 5: Load Dataset ---
print("\n--- Loading Dataset ---")
# Full path of the single input file selected in the configuration step.
data_files = os.path.join(jsonl_folder_path, specific_filename_to_process)

# Fail early with a clear message when the file is not there.
if not os.path.exists(data_files):
     raise FileNotFoundError(f"Input file not found: {data_files}")

start_time_load = datetime.datetime.now()
# Load 'id' and 'title' columns specifically
try:
    ds = load_dataset("json", data_files=data_files, split="train", features=datasets.Features({
        'id': datasets.Value('string'), # Keep ID if needed
        'title': datasets.Value('string') # <<<--- Load 'title'
        # Add other needed columns here, or comment out 'features' to load all
    }))
    print("Dataset loaded with specific features ('id', 'title').")
except Exception as e:
    # Fallback: retry without a feature spec so the loader infers every column itself.
    # NOTE(review): in this path 'title' is not forced to string — confirm downstream code copes.
    print(f"Error loading dataset with specific features, trying to load all: {e}")
    ds = load_dataset("json", data_files=data_files, split="train")
    print("Dataset loaded with all features.")


end_time_load = datetime.datetime.now()
# Check if 'title' column exists after loading
if 'title' not in ds.column_names:
    raise ValueError("The loaded dataset does not contain a 'title' column. Please check the JSONL structure or feature loading.")
print(ds)
print(f"Time taken for loading: {end_time_load - start_time_load}\n")

# --- Step 6: Language Detection using .map (on Titles) ---
print("\n--- Running Language Detection on Titles ---")
# Define the detection function
def detect_language_batch(batch):
    """Add a 'detected_lang' column to a batch, based on its 'title' column.

    Args:
        batch: mapping of column name to a list of values, as passed by a
            batched Dataset.map call.

    Returns:
        The same batch with batch["detected_lang"] set to one label per row:
          * "error_missing_title" for every row when the batch has no 'title' column,
          * "empty" when the title is None, not a string, or blank,
          * the "label" of the pipeline prediction otherwise,
          * "error" for every non-empty title when the pipeline call raised.
    """
    if "title" not in batch:
        print("Error: 'title' column not found in batch!")
        # Size the error column from any existing column; a batch without columns has no rows.
        first_key = next(iter(batch), None)
        row_count = len(batch[first_key]) if first_key is not None else 0
        batch["detected_lang"] = ["error_missing_title"] * row_count
        return batch

    texts = batch["title"] # <<<--- Use 'title' column
    detected_lang = [None] * len(texts)
    non_empty_texts = []
    non_empty_indices = []
    for i, text in enumerate(texts):
        # Titles might be None, blank, or (when all features were loaded untyped) not a
        # string at all; none of those can be sent to the model.
        if not isinstance(text, str) or text.strip() == "":
            detected_lang[i] = "empty"
        else:
            non_empty_texts.append(text)
            non_empty_indices.append(i)

    if non_empty_texts:
        try:
            # Titles are shorter, max_length 512 is likely excessive but safe
            predictions = lang_id_pipeline(
                non_empty_texts,
                batch_size=lang_id_pipeline_batch_size,
                truncation=True, padding=True, max_length=512
            )
            for idx, pred in zip(non_empty_indices, predictions):
                detected_lang[idx] = pred["label"]
        except Exception as e:
            print(f"Error in lang_id_pipeline within map: {e}")
            # A failure invalidates every title that was part of the call.
            for idx in non_empty_indices:
                detected_lang[idx] = "error"

    batch["detected_lang"] = detected_lang
    return batch

# Apply detection to the whole dataset in batches of map_batch_size rows.
# Note that `ds` is rebound to the mapped dataset, which carries the extra 'detected_lang' column.
start_time_map_langid = datetime.datetime.now()
try:
    ds = ds.map(detect_language_batch, batched=True, batch_size=map_batch_size, num_proc=1)
    end_time_map_langid = datetime.datetime.now()
    print(f"Language detection completed in {end_time_map_langid - start_time_map_langid}")
except Exception as e:
    # Later steps depend on 'detected_lang', so a failure here stops the run.
    print(f"Error during language detection map: {e}")
    raise

# --- Step 7: Filter Non-English Titles ---
print("\n--- Filtering Non-English Titles ---")
start_time_filter = datetime.datetime.now()
# Labels that must never reach translation: English, blank titles, and every failure
# marker the detection step can emit (including the one used when 'title' is missing).
exclude_langs = ['en', 'empty', 'error', 'error_missing_title']
try:
    # Ensure 'detected_lang' column exists before filtering
    if 'detected_lang' not in ds.column_names:
        raise ValueError("'detected_lang' column not found after mapping.")
    non_en_ds = ds.filter(lambda example: example['detected_lang'] not in exclude_langs)
    num_non_english = len(non_en_ds)
except Exception as e:
    # Filtering failure is treated as "nothing to translate": later steps check both values.
    print(f"Error during filtering: {e}.")
    num_non_english = 0
    non_en_ds = None

end_time_filter = datetime.datetime.now()
print(f"Found {num_non_english} non-English titles to translate.")
print(f"Time taken for filtering: {end_time_filter - start_time_filter}")

# --- Step 8: Load Translation Model/Pipeline ---
translation_pipeline = None
# The translation model is only loaded when filtering found rows to translate.
if num_non_english > 0:
    print("\n--- Setting up Translation Model ---")
    start_time_setup_trans = datetime.datetime.now()
    trans_model_name = "facebook/m2m100_418M"
    print(f"Loading translation model '{trans_model_name}'...")
    try:
        trans_tokenizer = AutoTokenizer.from_pretrained(trans_model_name)
        trans_model = AutoModelForSeq2SeqLM.from_pretrained(trans_model_name).to(device)
        # The pipeline is given device index 0 for CUDA and -1 for CPU.
        translation_pipeline = pipeline("translation",
                                        model=trans_model,
                                        tokenizer=trans_tokenizer,
                                        device=0 if device.type == 'cuda' else -1)
        print("Translation Pipeline loaded successfully.")
    except Exception as e:
        # Unlike the LangID setup, a load failure is not re-raised: the pipeline stays None
        # and the translation step checks for that before running.
        print(f"Error loading translation model: {e}")
        translation_pipeline = None
    end_time_setup_trans = datetime.datetime.now()
    print(f"    Time taken for translation setup: {end_time_setup_trans - start_time_setup_trans}")

# --- Step 9: Translate Non-English Titles & DOWNLOAD ---
translated_ds = None # Initialize translated_ds
# Translate only when there are rows to translate, the pipeline loaded, and filtering succeeded.
if num_non_english > 0 and translation_pipeline and non_en_ds is not None:
    print("\n--- Running Translation on Titles ---")

    # Define the OPTIMIZED translation function for map (groups by language)
    def translate_batch(batch):
        """Translates titles in a batch to English using M2M100, grouping by source language.

        Reads the 'title' and 'detected_lang' columns and adds a 'translated_title'
        column of the same length. Each entry is one of:
          * the pipeline's 'translation_text' for that title,
          * "Error: Translation failed (<lang>)" when the pipeline call for that
            language group raised,
          * "Error: Skipped" when the title is not a non-blank string or the row has
            no language label,
          * "Error: Missing input column" (for every row) when either input column
            is absent.

        Titles that share a detected language are sent to the pipeline together,
        because the source language is passed once per call (src_lang).

        Returns the same batch object with the new column attached.
        """
        # Check if 'title' and 'detected_lang' columns exist
        if "title" not in batch or "detected_lang" not in batch:
            print("Error: Missing 'title' or 'detected_lang' column in batch for translation!")
            # Size the error column from any existing column; a batch without columns has no rows.
            first_key = next(iter(batch), None)
            row_count = len(batch[first_key]) if first_key is not None else 0
            batch["translated_title"] = ["Error: Missing input column"] * row_count
            return batch

        titles = batch["title"] # <<<--- Use 'title' column
        texts_by_lang = defaultdict(list)
        indices_by_lang = defaultdict(list)
        for i, (text, lang) in enumerate(zip(titles, batch["detected_lang"])):
            # isinstance guards against non-string titles, on which .strip() would raise
            if isinstance(text, str) and text.strip() and lang:
                texts_by_lang[lang].append(text)
                indices_by_lang[lang].append(i)

        # Every row starts out as skipped; rows that reach the pipeline are overwritten below.
        translations = ["Error: Skipped"] * len(titles)

        for lang_code, texts_list in texts_by_lang.items():
            original_indices = indices_by_lang[lang_code]
            try:
                # Titles are shorter, max_length 256 is likely okay
                pipeline_output = translation_pipeline(
                    texts_list,
                    src_lang=lang_code,
                    tgt_lang='en',
                    batch_size=translation_pipeline_batch_size,
                    truncation=True,
                    max_length=256 # Adjust if needed
                )
                translated_texts = [p['translation_text'] for p in pipeline_output]
                for original_idx, translated_text in zip(original_indices, translated_texts):
                    translations[original_idx] = translated_text
            except Exception as e:
                # One failing language group must not lose the other groups in the batch.
                print(f"        ERROR translating group for lang '{lang_code}': {e}")
                for original_idx in original_indices:
                    translations[original_idx] = f"Error: Translation failed ({lang_code})"

        batch["translated_title"] = translations
        gc.collect()
        return batch

    start_time_map_trans = datetime.datetime.now()
    print(f"Applying translation map (batch size: {map_batch_size})...")
    # Everything below (map, file export, download, examples) shares this one try block,
    # so an exception in any of those parts is reported by the final handler as a map error.
    try:
        # Apply to the non_en_ds which contains 'title' and 'detected_lang'
        translated_ds = non_en_ds.map(translate_batch, batched=True, batch_size=map_batch_size, num_proc=1)
        end_time_map_trans = datetime.datetime.now()
        print(f"Translation completed in {end_time_map_trans - start_time_map_trans}")

        # --- *** NEW: Prepare and Download JSONL File *** ---
        if translated_ds and len(translated_ds) > 0:
            print("\nPreparing JSONL file for download...")
            output_filename = "translated_titles.jsonl"
            # Use Colab's temporary storage
            output_filepath = f"/content/{output_filename}"

            try:
                with open(output_filepath, 'w', encoding='utf-8') as f_out:
                    for record in translated_ds:
                        # Create a dictionary with desired fields
                        # (.get returns None for a column that is absent, e.g. 'id')
                        output_record = {
                            "id": record.get("id"),
                            "original_title": record.get("title"),
                            "detected_language": record.get("detected_lang"),
                            "translated_title": record.get("translated_title")
                        }
                        # Write the dictionary as a JSON line
                        # (ensure_ascii=False keeps non-ASCII titles readable in the output file)
                        json_line = json.dumps(output_record, ensure_ascii=False)
                        f_out.write(json_line + '\n')

                print(f"JSONL file created at {output_filepath}. Triggering download...")
                # Trigger browser download
                files.download(output_filepath)
                print("Download initiated.")

            except Exception as e:
                # Export problems are reported but do not stop the examples below from printing.
                print(f"Error creating or downloading JSONL file: {e}")
        else:
             print("\nNo translated data to download.")
        # --- *** END of Download Section *** ---

        # Display some examples (Optional, can be commented out if download is primary goal)
        print("\n--- Translation Examples (Titles) ---")
        num_examples = 5
        for i in range(min(num_examples, len(translated_ds))):
            print(f"Original Lang: {translated_ds[i]['detected_lang']}")
            print(f"Original Title: {translated_ds[i]['title']}")
            print(f"Translated Title: {translated_ds[i]['translated_title']}")
            print("-" * 20)

    except Exception as e:
        print(f"Error during translation map: {e}")


# Reached when there were titles to translate but the pipeline (or the filtered dataset) is unavailable.
elif num_non_english > 0:
    print("\n--- Translation Skipped (Pipeline failed to load) ---")
# Reached when num_non_english is 0, which is also the value set when filtering itself failed.
else:
    print("\n--- Translation Skipped (No non-English titles found) ---")


# --- Step 10: Final Language Counts (Original Detection on Titles) ---
print("\n--- Final Language Counts (from initial detection on Titles) ---")
# Counting is only possible when 'ds' exists and carries a 'detected_lang' column;
# the failure message is handled first so the reporting path reads straight down.
if not ('ds' in locals() and hasattr(ds, 'column_names') and 'detected_lang' in ds.column_names):
    print("Could not calculate final counts - initial dataset 'ds' or 'detected_lang' column may be missing or failed.")
else:
    language_counts = Counter(ds["detected_lang"])
    total_items = len(ds)
    print(f"Total titles processed initially: {total_items}")
    print("--- Aggregated Language Distribution ---")
    # most_common() yields (label, count) pairs ordered from most to least frequent.
    for lang, count in language_counts.most_common():
        print(f"  {lang}: {count}")

    print("\n--- Final Summary (based on initial detection) ---")
    # Every label other than English and the 'empty'/'error' placeholders is
    # counted as a non-English title.
    non_english_count_detected = sum(
        tally
        for label, tally in language_counts.items()
        if label not in ('en', 'empty', 'error')
    )
    print(f"Total number of non-English titles identified initially (excluding errors/empty): {non_english_count_detected}")


print("\n--- Script Finished ---")

Required packages installed.
Libraries imported.
PyTorch version: 2.6.0+cu124
Transformers version: 4.51.1
Datasets version: 3.5.0

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.

--- Setting up Language Detection ---
GPU detected. Using device: NVIDIA L4
Loading LangID model 'papluca/xlm-roberta-base-language-detection'...


Device set to use cuda:0


LangID Pipeline loaded successfully.
    Time taken for LangID setup: 0:00:02.316979

--- Loading Dataset ---


Generating train split: 0 examples [00:00, ? examples/s]

Error loading dataset with specific features, trying to load all: An error occurred while generating the dataset
Dataset loaded with all features.
Dataset({
    features: ['id', 'title', 'abstract', 'authors', 'createdDate', 'doi', 'arxivId', 'pubmedId', 'magId', 'oaiIds', 'links', 'publishedDate', 'updatedDate'],
    num_rows: 14265
})
Time taken for loading: 0:00:00.987081


--- Running Language Detection on Titles ---
Language detection completed in 0:00:04.534110

--- Filtering Non-English Titles ---
Found 3891 non-English titles to translate.
Time taken for filtering: 0:00:00.003045

--- Setting up Translation Model ---
Loading translation model 'facebook/m2m100_418M'...


Device set to use cuda:0


Translation Pipeline loaded successfully.
    Time taken for translation setup: 0:00:04.376145

--- Running Translation on Titles ---
Applying translation map (batch size: 128)...


Map:   0%|          | 0/3891 [00:00<?, ? examples/s]

Translation completed in 0:08:15.675633

Preparing JSONL file for download...
JSONL file created at /content/translated_titles.jsonl. Triggering download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download initiated.

--- Translation Examples (Titles) ---
Original Lang: pt
Original Title: Complejidad descriptiva y computacional en maquinas de Turing pequenas
Translated Title: Descriptive and computing complexity in small Turing machines
--------------------
Original Lang: it
Original Title: Rotating Membranes in AdS_4xM^{1,1,1}
Translated Title: It’s a good idea for you to be able to do it.
--------------------
Original Lang: it
Original Title: Anosov branches of dynamo spectra in one dimensional plasmas
Translated Title: Anosov branches of dynamo spectrum in one-dimensional plasmas
--------------------
Original Lang: it
Original Title: Chirp Control of Sinusoidal Lattice Modes in Bose-Einstein Condensate
Translated Title: Chirp Control of Sinusoidal Milk Modes in Bose-Einstein Condensate
--------------------
Original Lang: pt
Original Title: Derived Algebraic Geometry VI: E_k Algebras
Translated Title: Derived Algebraic Geometry VI: E_k
--------------------

--- Final Language 

In [None]:
# List the contents of the Drive folder that holds the extracted .jsonl files.
!ls /content/drive/MyDrive/unzipped_longeval_abstracts

documents_000019.jsonl	documents_000020.jsonl	documents_000021.jsonl


In [2]:
# --- Prerequisites ---
# 1. Ensure you are using a GPU runtime in Colab (recommended for faster detection).
# 2. Ensure your .jsonl files (documents_000019.jsonl, etc.) are in the specified folder on Google Drive.
# 3. Set the folder path and the file index range in Step 3 below.

# --- Step 0: Install necessary packages ---
# `-q` keeps pip's output short. No versions are pinned, so the versions pip
# resolves may differ from one session to the next.
# NOTE(review): langdetect is installed here, but the import block of this cell
# does not import it - confirm whether it is still needed.
!pip install langdetect transformers torch accelerate datasets sentencepiece -q
print("Required packages installed.")

# --- Step 1: Import required libraries ---
from collections import Counter
import os
import io
from google.colab import drive
import datetime
import torch
import gc # Garbage Collector interface
import logging
import json # <--- Added back for manual JSON parsing

# Added for Hugging Face Transformers & Datasets
import transformers
import datasets
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset, load_dataset # Keep load_dataset in case manual parse fails fallback is added later, but not used in current logic

# --- Configure Logging ---
# Only WARNING and above from the transformers logger is shown.
logging.getLogger("transformers").setLevel(logging.WARNING)

# Import confirmation followed by the versions of the core libraries,
# emitted as one newline-separated print call.
print(
    "Libraries imported.",
    f"PyTorch version: {torch.__version__}",
    f"Transformers version: {transformers.__version__}",
    f"Datasets version: {datasets.__version__}",
    sep="\n",
)

# --- Step 2: Mount Google Drive ---
print("\nMounting Google Drive...")
try:
    drive.mount('/content/drive', force_remount=True)
except Exception as e:
    # Report the failure, then re-raise so the cell stops here.
    print(f"Error mounting Google Drive: {e}")
    raise
else:
    print("Google Drive mounted successfully.")

# --- Step 3: Configuration ---
# !!! 1. Set this path to the FOLDER containing the extracted .jsonl files !!!
jsonl_folder_path = '/content/drive/MyDrive/unzipped_longeval_abstracts' # <--- SET FOLDER PATH

# !!! 2. Set the START and END index for the files to process !!!
# Both bounds are inclusive. With the values below the loop visits
# documents_000001.jsonl through documents_000021.jsonl (the index is
# zero-padded to six digits when each filename is built). Files in that range
# that do not exist are reported and skipped.
start_file_index = 1
end_file_index = 21
filename_prefix = "documents_" # Common prefix for filenames
filename_suffix = ".jsonl"   # Common suffix for filenames

# --- Parameters ---
# Each mapped batch is handed to the pipeline in a single call, so the pipeline
# never receives more than map_batch_size texts at once, even though its own
# batch size is set higher.
map_batch_size = 128             # Batch size for dataset mapping operations (language detection)
lang_id_pipeline_batch_size = 256 # Batch size for language detection pipeline itself

# --- Step 4: Language Detection Model/Pipeline Setup (Load Once) ---
# The tokenizer, model and pipeline are created a single time here and reused
# for every file handled by the processing loop further down.
print("\n--- Setting up Language Detection Model (Loading Once) ---")
start_time_setup_langid = datetime.datetime.now()
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU detected. Using device: {torch.cuda.get_device_name(0)}")
    # Collect garbage first: memory still held by unreachable Python objects
    # must go back to PyTorch's allocator before empty_cache() can release it.
    # This is the same order the final cleanup at the end of this cell uses.
    gc.collect()
    torch.cuda.empty_cache()
else:
    device = torch.device("cpu")
    print("Warning: GPU not detected. Using CPU (might be slow).")

lang_id_model_name = "papluca/xlm-roberta-base-language-detection"
# Pre-declare the handles so the names exist even if loading fails part-way.
lang_id_pipeline = None
lang_id_model = None
lang_id_tokenizer = None
try:
    print(f"Loading LangID model '{lang_id_model_name}'...")
    lang_id_tokenizer = AutoTokenizer.from_pretrained(lang_id_model_name)
    lang_id_model = AutoModelForSequenceClassification.from_pretrained(lang_id_model_name).to(device)
    # The pipeline takes a device index: 0 for the first GPU, -1 for CPU.
    lang_id_pipeline = pipeline("text-classification", model=lang_id_model, tokenizer=lang_id_tokenizer, device=0 if device.type == 'cuda' else -1)
    print("LangID Pipeline loaded successfully.")
except Exception as e:
    print(f"Error loading LangID model: {e}")
    # Without the model nothing below can run, so re-raise and stop the cell.
    raise

end_time_setup_langid = datetime.datetime.now()
print(f"Time taken for LangID setup: {end_time_setup_langid - start_time_setup_langid}")


# --- Step 5: Loop Through Files and Process ---

print(f"\n--- Starting Processing Loop for Files {start_file_index} to {end_file_index} ---")

for i in range(start_file_index, end_file_index + 1):
    # Build the file name; the index is zero-padded to six digits (e.g., 000019).
    current_filename = f"{filename_prefix}{i:06d}{filename_suffix}"
    current_file_path = os.path.join(jsonl_folder_path, current_filename)

    print(f"\n===== Processing File: {current_filename} =====")

    # --- 5.1: Load Dataset (Replaced with Manual Validation) ---
    # The file is read line by line. The first line that is not valid JSON, is
    # not a JSON object, or lacks a string 'title' aborts validation for the
    # whole file; the check that follows this block then skips the file.
    print(f"\n--- Attempting Manual JSONL Validation for: {current_filename} ---")

    # Per-file validation state, reset on every iteration.
    valid_lines_data = []        # dicts of the form {'title': '...'}
    error_line_num = -1          # 1-based number of the first bad line; -1 means none
    json_error_detail = None     # reason (str or JSONDecodeError) for the first bad line
    line_content_on_error = ""   # truncated preview of the offending line
    ds = None                    # never reuse a Dataset left over from a previous file

    try:
        # --- Check file existence before attempting to open ---
        if not os.path.exists(current_file_path):
            raise FileNotFoundError(f"File not found at path: {current_file_path}")

        # --- Explicitly open with UTF-8 ---
        print(f"   Opening file: {current_file_path} with encoding='utf-8'")
        with open(current_file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, start=1):  # 1-based line numbers
                stripped_line = line.strip()
                if not stripped_line:
                    continue  # Skip empty or whitespace-only lines silently

                # Only the parse itself can raise JSONDecodeError, so the try
                # is kept as narrow as possible.
                try:
                    json_obj = json.loads(stripped_line)
                except json.JSONDecodeError as e:
                    error_line_num = line_num
                    json_error_detail = e
                    line_content_on_error = stripped_line[:200] + "..."
                    print(f"   !!! Invalid JSON detected on line {error_line_num}: {e} !!!")
                    print(f"       Content preview: {line_content_on_error}")
                    break  # Stop validation on first JSON error

                # json.loads accepts any JSON value, so a line holding a bare
                # list, string or number must be rejected before it is indexed
                # by key. Otherwise the lookup raises TypeError and the problem
                # is misreported as a file-reading error.
                if not isinstance(json_obj, dict):
                    json_error_detail = f"Line is not a JSON object (Type: {type(json_obj)})"
                    print(f"   !!! FATAL ERROR: Line {line_num} is not a JSON object (Type: {type(json_obj)}). Stopping validation for {current_filename}. !!!")
                elif 'title' not in json_obj:
                    json_error_detail = "Missing 'title' key"
                    print(f"   !!! FATAL ERROR: Line {line_num} is missing the 'title' key. Stopping validation for {current_filename}. !!!")
                elif json_obj['title'] is None:
                    json_error_detail = "'title' is null/None"
                    print(f"   !!! FATAL ERROR: 'title' on line {line_num} is null/None. Stopping validation for {current_filename}. !!!")
                elif not isinstance(json_obj['title'], str):
                    json_error_detail = f"'title' is not a string (Type: {type(json_obj['title'])})"
                    print(f"   !!! FATAL ERROR: 'title' on line {line_num} is not a string (Type: {type(json_obj['title'])}). Stopping validation for {current_filename}. !!!")
                else:
                    # All checks passed: keep only the title, which is all that is used later.
                    valid_lines_data.append({'title': json_obj['title']})
                    continue

                # Reaching this point means one of the fatal checks above fired.
                error_line_num = line_num
                line_content_on_error = stripped_line[:200] + "..."
                break

    except FileNotFoundError:
        print(f"!!! File Not Found: {current_file_path}. Skipping. !!!")
        # Ensure list is cleared if partially populated before error
        del valid_lines_data[:]
        gc.collect()
        continue  # Skip to next file in outer loop
    except Exception as e:
        # Catch other file reading errors (e.g., permission, other encoding issues)
        print(f"!!! Error reading file {current_file_path}: {e}. Skipping. !!!")
        # Ensure list is cleared
        del valid_lines_data[:]
        gc.collect()
        continue  # Skip to next file in outer loop

    # --- Check results of manual validation ---
    if error_line_num != -1:
        print(f"--- Manual validation failed for {current_filename} on line {error_line_num} ({json_error_detail}). Cannot proceed with this file. ---")
        del valid_lines_data[:] # Clean up list
        gc.collect()
        continue # Skip to next file in outer loop
    elif not valid_lines_data:
         print(f"--- No valid JSON objects with 'title' strings found in {current_filename}. Skipping. ---")
         # list is already empty or cleared
         gc.collect()
         continue # Skip to next file in outer loop
    else:
        # --- If validation succeeded, create Dataset object ---
        print(f"--- Manual validation successful for {current_filename}. Found {len(valid_lines_data)} valid lines with 'title' strings. ---")
        start_time_load = datetime.datetime.now()
        try:
            # Create dataset directly from the list of validated title dictionaries
            ds = Dataset.from_list(valid_lines_data)
            print(f"   Dataset object created successfully from validated titles for '{current_filename}'.")
             # Check title column exists (should always exist based on how we created it)
            if 'title' not in ds.column_names:
                 # This case should theoretically not happen with from_list [{'title':...}]
                 raise ValueError("'title' column unexpectedly missing after Dataset.from_list")

        except Exception as e:
            print(f"!!! Error creating Dataset object from validated titles for '{current_filename}': {e}. Skipping. !!!")
            # ds might be partially created, ensure cleanup
            if 'ds' in locals() and ds is not None: del ds
            # Also clear original list data
            del valid_lines_data[:]
            gc.collect()
            continue # Skip file if dataset creation fails

        end_time_load = datetime.datetime.now()
        # Cleanup the temporary list now data is in Dataset object
        del valid_lines_data[:]
        gc.collect()

        # Proceed if ds is created successfully
        if ds is None:
             # This case should also not happen if exception handling is correct
            print(f"!!! Dataset object is None after creation attempt for '{current_filename}'. Skipping processing. !!!")
            continue

        print(ds) # Print dataset info
        print(f"Time taken for creating Dataset object for '{current_filename}': {end_time_load - start_time_load}\n")

    # === If we reach here, 'ds' is valid and contains the 'title' column ===

    # --- 5.2: Language Detection ---
    print(f"\n--- Running Language Detection on Titles for {current_filename} ---")

    # Define the detection function (remains the same)
    def detect_language_batch(batch):
        """Add a 'detected_lang' entry to a mapped batch of titles.

        Args:
            batch: mapping with a "title" entry holding the batch's titles.

        Returns:
            The same ``batch`` object with ``batch["detected_lang"]`` set to one
            label per title: "empty" for None/blank titles, the pipeline's
            predicted label otherwise, or "error" for every non-blank title in
            the batch when the pipeline call raises.
        """
        titles = batch["title"]
        labels = [None] * len(titles)
        pending_positions = []
        pending_titles = []

        # Split the batch: blank titles are labelled right away, the rest are
        # queued for the pipeline together with their original positions.
        for position, title in enumerate(titles):
            if title is not None and title.strip() != "":
                pending_titles.append(title)
                pending_positions.append(position)
                continue
            labels[position] = "empty"

        if pending_titles:
            try:
                results = lang_id_pipeline(
                    pending_titles,
                    batch_size=lang_id_pipeline_batch_size,
                    truncation=True, padding=True, max_length=512  # cap tokenized length
                )
                for position, result in zip(pending_positions, results):
                    labels[position] = result["label"]
            except Exception as e_pipe:
                print(f"  Error in lang_id_pipeline during batch for {current_filename}: {e_pipe}")
                # One failure marks every queued title of this batch as 'error'.
                for position in pending_positions:
                    labels[position] = "error"

        batch["detected_lang"] = labels
        return batch

    start_time_map_langid = datetime.datetime.now()
    detected_ds = None  # stays None unless the map below succeeds
    try:
        # Run detection over 'ds' (built from the validated titles) in batches.
        detected_ds = ds.map(
            detect_language_batch,
            batched=True,
            batch_size=map_batch_size,
            num_proc=1,
        )
    except Exception as e:
        print(f"!!! Error during language detection map for '{current_filename}': {e}. Skipping statistics for this file. !!!")
        detected_ds = None  # make sure a failed run leaves no result behind
    else:
        end_time_map_langid = datetime.datetime.now()
        print(f"Language detection for '{current_filename}' completed in {end_time_map_langid - start_time_map_langid}")

    # --- 5.3: Display Language Statistics ---
    print(f"\n--- Language Statistics for 'title' field in: {current_filename} ---")
    if detected_ds is None or 'detected_lang' not in detected_ds.column_names:
        print(f"Could not calculate statistics for '{current_filename}' - detection may have failed or dataset was invalid.")
    else:
        try:
            language_counts = Counter(detected_ds["detected_lang"])
            total_items = len(detected_ds)
            print(f"Total titles processed in {current_filename}: {total_items}")
            print("--- Language Distribution ---")
            if total_items <= 0:
                print("  No titles found or processed.")
            else:
                # most_common() lists the labels from most to least frequent.
                for lang, count in language_counts.most_common():
                    percentage = (count / total_items) * 100
                    print(f"  {lang}: {count} ({percentage:.2f}%)")

            # Summary: whatever is not English, 'empty' or 'error' counts as non-English.
            en_count = language_counts.get('en', 0)
            empty_count = language_counts.get('empty', 0)
            error_count = language_counts.get('error', 0)  # set only when the pipeline call failed
            non_english_count_detected = total_items - en_count - empty_count - error_count

            print(
                "\n--- Summary ---",
                f"English ('en'): {en_count}",
                f"Non-English (excluding empty/error): {non_english_count_detected}",
                f"Empty/Invalid Titles ('empty'): {empty_count}",
                f"Pipeline Processing Errors ('error'): {error_count}",
                sep="\n",
            )
        except Exception as e:
            print(f"!!! Error calculating statistics for '{current_filename}': {e} !!!")

    # --- 5.4: Clean up memory for the current file ---
    print(f"--- Cleaning up memory after processing {current_filename} ---")
    # Drop both the source dataset and its mapped copy before the next file.
    if 'ds' in locals() and ds is not None:
        del ds
    if 'detected_ds' in locals() and detected_ds is not None:
        del detected_ds
    gc.collect()  # force a collection pass now

# --- End of Loop ---
print("\n===== Finished Processing All Files =====")


# --- Step 6: Final Cleanup (Model) ---
print("\n--- Final Cleanup (Language Detection Model) ---")
# Drop the pipeline, model and tokenizer names, then collect garbage before
# emptying the CUDA cache.
del lang_id_pipeline, lang_id_model, lang_id_tokenizer
gc.collect()
if device.type == 'cuda':
    torch.cuda.empty_cache()
print("Cleanup complete.", "\n--- Script Finished ---", sep="\n")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m109.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.9 MB/s[0m eta [

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Device set to use cuda:0


LangID Pipeline loaded successfully.
Time taken for LangID setup: 0:00:11.643276

--- Starting Processing Loop for Files 1 to 21 ---

===== Processing File: documents_000001.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000001.jsonl ---
   Opening file: /content/drive/MyDrive/unzipped_longeval_abstracts/documents_000001.jsonl with encoding='utf-8'
--- Manual validation successful for documents_000001.jsonl. Found 100000 valid lines with 'title' strings. ---
   Dataset object created successfully from validated titles for 'documents_000001.jsonl'.
Dataset({
    features: ['title'],
    num_rows: 100000
})
Time taken for creating Dataset object for 'documents_000001.jsonl': 0:00:00.108535


--- Running Language Detection on Titles for documents_000001.jsonl ---


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Language detection for 'documents_000001.jsonl' completed in 0:03:14.813677

--- Language Statistics for 'title' field in: documents_000001.jsonl ---
Total titles processed in documents_000001.jsonl: 100000
--- Language Distribution ---
  en: 73219 (73.22%)
  es: 4339 (4.34%)
  pt: 4026 (4.03%)
  it: 3971 (3.97%)
  nl: 3000 (3.00%)
  fr: 2136 (2.14%)
  de: 2021 (2.02%)
  ru: 1529 (1.53%)
  ur: 1324 (1.32%)
  ja: 809 (0.81%)
  tr: 703 (0.70%)
  sw: 612 (0.61%)
  pl: 598 (0.60%)
  hi: 506 (0.51%)
  bg: 389 (0.39%)
  th: 214 (0.21%)
  zh: 211 (0.21%)
  el: 211 (0.21%)
  ar: 129 (0.13%)
  vi: 53 (0.05%)

--- Summary ---
English ('en'): 73219
Non-English (excluding empty/error): 26781
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000001.jsonl ---

===== Processing File: documents_000002.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000002.jsonl ---
   Opening file: /content/drive/MyDrive/unzip

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000002.jsonl' completed in 0:03:18.112578

--- Language Statistics for 'title' field in: documents_000002.jsonl ---
Total titles processed in documents_000002.jsonl: 100000
--- Language Distribution ---
  en: 71158 (71.16%)
  es: 4886 (4.89%)
  pt: 4332 (4.33%)
  it: 3956 (3.96%)
  nl: 3363 (3.36%)
  fr: 2289 (2.29%)
  de: 2153 (2.15%)
  ru: 1593 (1.59%)
  ur: 1388 (1.39%)
  ja: 820 (0.82%)
  tr: 728 (0.73%)
  pl: 722 (0.72%)
  sw: 649 (0.65%)
  hi: 604 (0.60%)
  bg: 419 (0.42%)
  zh: 261 (0.26%)
  th: 243 (0.24%)
  el: 229 (0.23%)
  ar: 162 (0.16%)
  vi: 45 (0.04%)

--- Summary ---
English ('en'): 71158
Non-English (excluding empty/error): 28842
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000002.jsonl ---

===== Processing File: documents_000003.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000003.jsonl ---
   Opening file: /content/drive/MyDrive/unzip

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000003.jsonl' completed in 0:03:27.243568

--- Language Statistics for 'title' field in: documents_000003.jsonl ---
Total titles processed in documents_000003.jsonl: 100000
--- Language Distribution ---
  en: 69091 (69.09%)
  es: 5284 (5.28%)
  pt: 4942 (4.94%)
  it: 4225 (4.23%)
  nl: 3294 (3.29%)
  fr: 2354 (2.35%)
  de: 2214 (2.21%)
  ru: 1868 (1.87%)
  ur: 1493 (1.49%)
  ja: 928 (0.93%)
  tr: 832 (0.83%)
  pl: 804 (0.80%)
  sw: 635 (0.64%)
  hi: 552 (0.55%)
  bg: 487 (0.49%)
  zh: 274 (0.27%)
  th: 263 (0.26%)
  el: 234 (0.23%)
  ar: 154 (0.15%)
  vi: 72 (0.07%)

--- Summary ---
English ('en'): 69091
Non-English (excluding empty/error): 30909
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000003.jsonl ---

===== Processing File: documents_000004.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000004.jsonl ---
   Opening file: /content/drive/MyDrive/unzip

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000004.jsonl' completed in 0:03:28.674691

--- Language Statistics for 'title' field in: documents_000004.jsonl ---
Total titles processed in documents_000004.jsonl: 100000
--- Language Distribution ---
  en: 68529 (68.53%)
  es: 5275 (5.27%)
  pt: 4954 (4.95%)
  it: 4226 (4.23%)
  nl: 3434 (3.43%)
  fr: 2412 (2.41%)
  de: 2265 (2.27%)
  ru: 1952 (1.95%)
  ur: 1529 (1.53%)
  ja: 927 (0.93%)
  tr: 897 (0.90%)
  pl: 856 (0.86%)
  sw: 685 (0.69%)
  hi: 576 (0.58%)
  bg: 453 (0.45%)
  zh: 300 (0.30%)
  el: 259 (0.26%)
  th: 236 (0.24%)
  ar: 159 (0.16%)
  vi: 76 (0.08%)

--- Summary ---
English ('en'): 68529
Non-English (excluding empty/error): 31471
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000004.jsonl ---

===== Processing File: documents_000005.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000005.jsonl ---
   Opening file: /content/drive/MyDrive/unzip

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000005.jsonl' completed in 0:03:30.433227

--- Language Statistics for 'title' field in: documents_000005.jsonl ---
Total titles processed in documents_000005.jsonl: 100000
--- Language Distribution ---
  en: 66858 (66.86%)
  pt: 5646 (5.65%)
  es: 5473 (5.47%)
  it: 4318 (4.32%)
  nl: 3521 (3.52%)
  fr: 2899 (2.90%)
  de: 2254 (2.25%)
  ru: 1986 (1.99%)
  ur: 1469 (1.47%)
  ja: 1011 (1.01%)
  pl: 899 (0.90%)
  tr: 858 (0.86%)
  sw: 691 (0.69%)
  hi: 594 (0.59%)
  bg: 411 (0.41%)
  zh: 365 (0.36%)
  th: 296 (0.30%)
  el: 207 (0.21%)
  ar: 165 (0.17%)
  vi: 78 (0.08%)
  empty: 1 (0.00%)

--- Summary ---
English ('en'): 66858
Non-English (excluding empty/error): 33141
Empty/Invalid Titles ('empty'): 1
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000005.jsonl ---

===== Processing File: documents_000006.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000006.jsonl ---
   Opening file: /content

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000006.jsonl' completed in 0:03:27.853297

--- Language Statistics for 'title' field in: documents_000006.jsonl ---
Total titles processed in documents_000006.jsonl: 100000
--- Language Distribution ---
  en: 65752 (65.75%)
  pt: 5938 (5.94%)
  es: 5882 (5.88%)
  nl: 4136 (4.14%)
  it: 4084 (4.08%)
  fr: 2725 (2.73%)
  de: 2389 (2.39%)
  ru: 1874 (1.87%)
  ur: 1506 (1.51%)
  ja: 1032 (1.03%)
  tr: 914 (0.91%)
  pl: 809 (0.81%)
  sw: 717 (0.72%)
  hi: 633 (0.63%)
  bg: 449 (0.45%)
  th: 339 (0.34%)
  zh: 275 (0.27%)
  el: 257 (0.26%)
  ar: 197 (0.20%)
  vi: 92 (0.09%)

--- Summary ---
English ('en'): 65752
Non-English (excluding empty/error): 34248
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000006.jsonl ---

===== Processing File: documents_000007.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000007.jsonl ---
   Opening file: /content/drive/MyDrive/unzi

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000007.jsonl' completed in 0:03:31.465014

--- Language Statistics for 'title' field in: documents_000007.jsonl ---
Total titles processed in documents_000007.jsonl: 100000
--- Language Distribution ---
  en: 66738 (66.74%)
  es: 5787 (5.79%)
  pt: 5419 (5.42%)
  it: 4353 (4.35%)
  nl: 3321 (3.32%)
  fr: 2916 (2.92%)
  de: 2386 (2.39%)
  ru: 1846 (1.85%)
  ur: 1362 (1.36%)
  tr: 1075 (1.07%)
  ja: 1010 (1.01%)
  pl: 765 (0.77%)
  bg: 673 (0.67%)
  sw: 646 (0.65%)
  hi: 557 (0.56%)
  zh: 440 (0.44%)
  el: 254 (0.25%)
  th: 221 (0.22%)
  ar: 165 (0.17%)
  vi: 66 (0.07%)

--- Summary ---
English ('en'): 66738
Non-English (excluding empty/error): 33262
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000007.jsonl ---

===== Processing File: documents_000008.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000008.jsonl ---
   Opening file: /content/drive/MyDrive/unz

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000008.jsonl' completed in 0:03:25.955696

--- Language Statistics for 'title' field in: documents_000008.jsonl ---
Total titles processed in documents_000008.jsonl: 100000
--- Language Distribution ---
  en: 61850 (61.85%)
  es: 8053 (8.05%)
  pt: 5936 (5.94%)
  nl: 4225 (4.23%)
  it: 4089 (4.09%)
  fr: 3070 (3.07%)
  de: 2684 (2.68%)
  ru: 2275 (2.27%)
  ur: 1557 (1.56%)
  ja: 1134 (1.13%)
  pl: 1016 (1.02%)
  tr: 924 (0.92%)
  hi: 722 (0.72%)
  sw: 673 (0.67%)
  bg: 458 (0.46%)
  th: 358 (0.36%)
  zh: 336 (0.34%)
  el: 283 (0.28%)
  ar: 247 (0.25%)
  vi: 110 (0.11%)

--- Summary ---
English ('en'): 61850
Non-English (excluding empty/error): 38150
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000008.jsonl ---

===== Processing File: documents_000009.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000009.jsonl ---
   Opening file: /content/drive/MyDrive/un

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000009.jsonl' completed in 0:03:24.358568

--- Language Statistics for 'title' field in: documents_000009.jsonl ---
Total titles processed in documents_000009.jsonl: 100000
--- Language Distribution ---
  en: 69594 (69.59%)
  es: 4980 (4.98%)
  pt: 4917 (4.92%)
  it: 4073 (4.07%)
  nl: 3378 (3.38%)
  fr: 2857 (2.86%)
  de: 2396 (2.40%)
  ru: 1487 (1.49%)
  ur: 1367 (1.37%)
  ja: 949 (0.95%)
  pl: 786 (0.79%)
  tr: 703 (0.70%)
  sw: 687 (0.69%)
  hi: 577 (0.58%)
  bg: 310 (0.31%)
  zh: 256 (0.26%)
  th: 238 (0.24%)
  el: 224 (0.22%)
  ar: 150 (0.15%)
  vi: 71 (0.07%)

--- Summary ---
English ('en'): 69594
Non-English (excluding empty/error): 30406
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000009.jsonl ---

===== Processing File: documents_000010.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000010.jsonl ---
   Opening file: /content/drive/MyDrive/unzip

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000010.jsonl' completed in 0:03:21.556377

--- Language Statistics for 'title' field in: documents_000010.jsonl ---
Total titles processed in documents_000010.jsonl: 100000
--- Language Distribution ---
  en: 63921 (63.92%)
  es: 7635 (7.63%)
  pt: 6087 (6.09%)
  nl: 4153 (4.15%)
  it: 3830 (3.83%)
  de: 2676 (2.68%)
  ru: 2284 (2.28%)
  fr: 2159 (2.16%)
  ur: 1651 (1.65%)
  ja: 982 (0.98%)
  tr: 917 (0.92%)
  pl: 831 (0.83%)
  sw: 706 (0.71%)
  hi: 636 (0.64%)
  bg: 428 (0.43%)
  th: 340 (0.34%)
  zh: 315 (0.32%)
  el: 202 (0.20%)
  ar: 165 (0.17%)
  vi: 82 (0.08%)

--- Summary ---
English ('en'): 63921
Non-English (excluding empty/error): 36079
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000010.jsonl ---

===== Processing File: documents_000011.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000011.jsonl ---
   Opening file: /content/drive/MyDrive/unzip

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000011.jsonl' completed in 0:03:25.564446

--- Language Statistics for 'title' field in: documents_000011.jsonl ---
Total titles processed in documents_000011.jsonl: 100000
--- Language Distribution ---
  en: 63085 (63.09%)
  es: 7438 (7.44%)
  pt: 6016 (6.02%)
  nl: 4464 (4.46%)
  it: 3836 (3.84%)
  de: 2598 (2.60%)
  ru: 2560 (2.56%)
  fr: 2302 (2.30%)
  ur: 1623 (1.62%)
  ja: 1103 (1.10%)
  tr: 936 (0.94%)
  pl: 934 (0.93%)
  sw: 698 (0.70%)
  hi: 668 (0.67%)
  bg: 488 (0.49%)
  zh: 426 (0.43%)
  th: 353 (0.35%)
  el: 234 (0.23%)
  ar: 149 (0.15%)
  vi: 89 (0.09%)

--- Summary ---
English ('en'): 63085
Non-English (excluding empty/error): 36915
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000011.jsonl ---

===== Processing File: documents_000012.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000012.jsonl ---
   Opening file: /content/drive/MyDrive/unzi

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000012.jsonl' completed in 0:03:27.569393

--- Language Statistics for 'title' field in: documents_000012.jsonl ---
Total titles processed in documents_000012.jsonl: 100000
--- Language Distribution ---
  en: 61822 (61.82%)
  es: 6269 (6.27%)
  pt: 5896 (5.90%)
  nl: 4468 (4.47%)
  it: 3900 (3.90%)
  ru: 2490 (2.49%)
  de: 2469 (2.47%)
  fr: 2464 (2.46%)
  ja: 1900 (1.90%)
  ur: 1752 (1.75%)
  zh: 1543 (1.54%)
  tr: 987 (0.99%)
  pl: 979 (0.98%)
  hi: 742 (0.74%)
  sw: 719 (0.72%)
  bg: 480 (0.48%)
  th: 477 (0.48%)
  el: 260 (0.26%)
  ar: 236 (0.24%)
  vi: 145 (0.14%)
  empty: 2 (0.00%)

--- Summary ---
English ('en'): 61822
Non-English (excluding empty/error): 38176
Empty/Invalid Titles ('empty'): 2
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000012.jsonl ---

===== Processing File: documents_000013.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000013.jsonl ---
   Opening file: /conte

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000013.jsonl' completed in 0:03:30.249807

--- Language Statistics for 'title' field in: documents_000013.jsonl ---
Total titles processed in documents_000013.jsonl: 100000
--- Language Distribution ---
  en: 60886 (60.89%)
  es: 6967 (6.97%)
  pt: 6164 (6.16%)
  it: 4472 (4.47%)
  nl: 4185 (4.18%)
  fr: 3234 (3.23%)
  de: 2740 (2.74%)
  ru: 2603 (2.60%)
  ur: 1783 (1.78%)
  ja: 1503 (1.50%)
  pl: 1151 (1.15%)
  tr: 1007 (1.01%)
  hi: 751 (0.75%)
  sw: 664 (0.66%)
  bg: 573 (0.57%)
  zh: 402 (0.40%)
  el: 314 (0.31%)
  th: 300 (0.30%)
  ar: 211 (0.21%)
  vi: 89 (0.09%)
  empty: 1 (0.00%)

--- Summary ---
English ('en'): 60886
Non-English (excluding empty/error): 39113
Empty/Invalid Titles ('empty'): 1
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000013.jsonl ---

===== Processing File: documents_000014.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000014.jsonl ---
   Opening file: /conte

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000014.jsonl' completed in 0:03:33.044718

--- Language Statistics for 'title' field in: documents_000014.jsonl ---
Total titles processed in documents_000014.jsonl: 100000
--- Language Distribution ---
  en: 61802 (61.80%)
  es: 6811 (6.81%)
  pt: 6585 (6.59%)
  it: 4434 (4.43%)
  nl: 4037 (4.04%)
  fr: 3522 (3.52%)
  de: 2572 (2.57%)
  ru: 2265 (2.27%)
  ur: 1697 (1.70%)
  ja: 1289 (1.29%)
  pl: 961 (0.96%)
  tr: 922 (0.92%)
  sw: 728 (0.73%)
  hi: 649 (0.65%)
  bg: 457 (0.46%)
  zh: 381 (0.38%)
  th: 342 (0.34%)
  el: 262 (0.26%)
  ar: 198 (0.20%)
  vi: 85 (0.08%)
  empty: 1 (0.00%)

--- Summary ---
English ('en'): 61802
Non-English (excluding empty/error): 38197
Empty/Invalid Titles ('empty'): 1
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000014.jsonl ---

===== Processing File: documents_000015.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000015.jsonl ---
   Opening file: /content

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000015.jsonl' completed in 0:03:41.138926

--- Language Statistics for 'title' field in: documents_000015.jsonl ---
Total titles processed in documents_000015.jsonl: 100000
--- Language Distribution ---
  en: 59767 (59.77%)
  es: 8166 (8.17%)
  pt: 6533 (6.53%)
  it: 4439 (4.44%)
  nl: 4069 (4.07%)
  fr: 3291 (3.29%)
  de: 2778 (2.78%)
  ru: 2325 (2.33%)
  ur: 1707 (1.71%)
  ja: 1303 (1.30%)
  tr: 1170 (1.17%)
  pl: 1039 (1.04%)
  sw: 670 (0.67%)
  hi: 666 (0.67%)
  bg: 619 (0.62%)
  zh: 442 (0.44%)
  th: 361 (0.36%)
  el: 303 (0.30%)
  ar: 257 (0.26%)
  vi: 94 (0.09%)
  empty: 1 (0.00%)

--- Summary ---
English ('en'): 59767
Non-English (excluding empty/error): 40232
Empty/Invalid Titles ('empty'): 1
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000015.jsonl ---

===== Processing File: documents_000016.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000016.jsonl ---
   Opening file: /conte

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000016.jsonl' completed in 0:03:35.694445

--- Language Statistics for 'title' field in: documents_000016.jsonl ---
Total titles processed in documents_000016.jsonl: 100000
--- Language Distribution ---
  en: 60359 (60.36%)
  es: 7598 (7.60%)
  pt: 6553 (6.55%)
  nl: 4391 (4.39%)
  it: 4199 (4.20%)
  de: 3073 (3.07%)
  fr: 2952 (2.95%)
  ru: 2405 (2.40%)
  ur: 1899 (1.90%)
  ja: 1293 (1.29%)
  pl: 1083 (1.08%)
  tr: 912 (0.91%)
  hi: 732 (0.73%)
  sw: 727 (0.73%)
  bg: 498 (0.50%)
  th: 403 (0.40%)
  zh: 364 (0.36%)
  el: 258 (0.26%)
  ar: 194 (0.19%)
  vi: 107 (0.11%)

--- Summary ---
English ('en'): 60359
Non-English (excluding empty/error): 39641
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000016.jsonl ---

===== Processing File: documents_000017.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000017.jsonl ---
   Opening file: /content/drive/MyDrive/un

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000017.jsonl' completed in 0:03:37.193750

--- Language Statistics for 'title' field in: documents_000017.jsonl ---
Total titles processed in documents_000017.jsonl: 100000
--- Language Distribution ---
  en: 58548 (58.55%)
  es: 7732 (7.73%)
  pt: 6559 (6.56%)
  nl: 4478 (4.48%)
  it: 4085 (4.08%)
  fr: 2787 (2.79%)
  ru: 2663 (2.66%)
  de: 2624 (2.62%)
  ja: 2257 (2.26%)
  ur: 1825 (1.82%)
  zh: 1163 (1.16%)
  pl: 1113 (1.11%)
  tr: 1087 (1.09%)
  sw: 741 (0.74%)
  hi: 740 (0.74%)
  bg: 522 (0.52%)
  th: 399 (0.40%)
  el: 282 (0.28%)
  ar: 242 (0.24%)
  vi: 152 (0.15%)
  empty: 1 (0.00%)

--- Summary ---
English ('en'): 58548
Non-English (excluding empty/error): 41451
Empty/Invalid Titles ('empty'): 1
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000017.jsonl ---

===== Processing File: documents_000018.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000018.jsonl ---
   Opening file: /con

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000018.jsonl' completed in 0:03:32.412416

--- Language Statistics for 'title' field in: documents_000018.jsonl ---
Total titles processed in documents_000018.jsonl: 100000
--- Language Distribution ---
  en: 60630 (60.63%)
  es: 7081 (7.08%)
  pt: 6522 (6.52%)
  it: 4419 (4.42%)
  nl: 4081 (4.08%)
  fr: 3344 (3.34%)
  de: 2772 (2.77%)
  ru: 2573 (2.57%)
  ur: 1770 (1.77%)
  ja: 1508 (1.51%)
  pl: 1094 (1.09%)
  tr: 936 (0.94%)
  sw: 719 (0.72%)
  hi: 695 (0.69%)
  bg: 526 (0.53%)
  zh: 407 (0.41%)
  th: 362 (0.36%)
  el: 269 (0.27%)
  ar: 198 (0.20%)
  vi: 93 (0.09%)
  empty: 1 (0.00%)

--- Summary ---
English ('en'): 60630
Non-English (excluding empty/error): 39369
Empty/Invalid Titles ('empty'): 1
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000018.jsonl ---

===== Processing File: documents_000019.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000019.jsonl ---
   Opening file: /conten

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000019.jsonl' completed in 0:03:34.440645

--- Language Statistics for 'title' field in: documents_000019.jsonl ---
Total titles processed in documents_000019.jsonl: 100000
--- Language Distribution ---
  en: 58545 (58.55%)
  es: 7835 (7.83%)
  pt: 6800 (6.80%)
  it: 4351 (4.35%)
  fr: 4017 (4.02%)
  nl: 4002 (4.00%)
  de: 3180 (3.18%)
  ru: 2405 (2.40%)
  ur: 1819 (1.82%)
  ja: 1412 (1.41%)
  tr: 1166 (1.17%)
  pl: 1158 (1.16%)
  sw: 678 (0.68%)
  hi: 635 (0.64%)
  bg: 630 (0.63%)
  zh: 420 (0.42%)
  th: 337 (0.34%)
  el: 317 (0.32%)
  ar: 200 (0.20%)
  vi: 93 (0.09%)

--- Summary ---
English ('en'): 58545
Non-English (excluding empty/error): 41455
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000019.jsonl ---

===== Processing File: documents_000020.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000020.jsonl ---
   Opening file: /content/drive/MyDrive/un

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Language detection for 'documents_000020.jsonl' completed in 0:03:29.716524

--- Language Statistics for 'title' field in: documents_000020.jsonl ---
Total titles processed in documents_000020.jsonl: 100000
--- Language Distribution ---
  en: 61653 (61.65%)
  es: 7413 (7.41%)
  pt: 6166 (6.17%)
  nl: 4258 (4.26%)
  it: 3995 (4.00%)
  ru: 2476 (2.48%)
  de: 2399 (2.40%)
  fr: 2340 (2.34%)
  ja: 1805 (1.80%)
  ur: 1724 (1.72%)
  pl: 1062 (1.06%)
  tr: 967 (0.97%)
  zh: 870 (0.87%)
  sw: 711 (0.71%)
  hi: 706 (0.71%)
  bg: 442 (0.44%)
  th: 395 (0.40%)
  el: 274 (0.27%)
  ar: 218 (0.22%)
  vi: 126 (0.13%)

--- Summary ---
English ('en'): 61653
Non-English (excluding empty/error): 38347
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000020.jsonl ---

===== Processing File: documents_000021.jsonl =====

--- Attempting Manual JSONL Validation for: documents_000021.jsonl ---
   Opening file: /content/drive/MyDrive/un

Map:   0%|          | 0/14265 [00:00<?, ? examples/s]

Language detection for 'documents_000021.jsonl' completed in 0:00:30.798329

--- Language Statistics for 'title' field in: documents_000021.jsonl ---
Total titles processed in documents_000021.jsonl: 14265
--- Language Distribution ---
  en: 10374 (72.72%)
  nl: 677 (4.75%)
  pt: 672 (4.71%)
  it: 553 (3.88%)
  es: 515 (3.61%)
  fr: 304 (2.13%)
  ru: 233 (1.63%)
  ur: 139 (0.97%)
  hi: 125 (0.88%)
  de: 124 (0.87%)
  sw: 119 (0.83%)
  ja: 110 (0.77%)
  tr: 101 (0.71%)
  pl: 57 (0.40%)
  th: 45 (0.32%)
  zh: 41 (0.29%)
  el: 28 (0.20%)
  bg: 26 (0.18%)
  ar: 17 (0.12%)
  vi: 5 (0.04%)

--- Summary ---
English ('en'): 10374
Non-English (excluding empty/error): 3891
Empty/Invalid Titles ('empty'): 0
Pipeline Processing Errors ('error'): 0
--- Cleaning up memory after processing documents_000021.jsonl ---

===== Finished Processing All Files =====

--- Final Cleanup (Language Detection Model) ---
Cleanup complete.

--- Script Finished ---
