In [15]:
import json, csv

Looking at the user request, they want to add a conversion rule to handle coordinate formatting where "36 ° 5 ′ 30 '' N " should become "36°5′30  N " (removing spaces around degree, minute, and second symbols).



In [30]:
def detokenize_text(text):
    """
    Fixes excessive tokenization in text by restoring normal spacing.

    Args:
        text (str): The tokenized text with spacing issues

    Returns:
        str: The detokenized text with normal spacing
    """
    # Replace literal \n with actual newlines
    text = text.replace('\\n', '\n')

    # Fix spaces before punctuation
    text = text.replace(' .', '.')
    text = text.replace(' ,', ',')
    text = text.replace(' :', ':')
    text = text.replace(' ;', ';')
    text = text.replace(' !', '!')
    text = text.replace(' ?', '?')
    text = text.replace(' )', ')')
    text = text.replace('( ', '(')

    # Fix spaces around hyphens and dashes
    text = text.replace(' - ', '-')
    text = text.replace(' -- ', ' -- ')  # Keep spaces around em-dashes

    # Fix possessives
    text = text.replace(" 's", "'s")
    text = text.replace(" n't", "n't")

    # Fix quotes - convert `` and '' to regular quotes
    # text = text.replace('``', '"')
    # text = text.replace("''", '"')
    # text = text.replace(' "', ' "')
    # text = text.replace('" ', '" ')

    import re
    text = text.replace(" ' ", "' ")
    text = re.sub(r"`` (.*?) ''", r'"\1"', text)
    # Fix numbers with colons (like ratios)
    text = re.sub(r'(\d+)\s+:\s+(\d+)\s+:\s+(\d+)', r'\1:\2:\3', text)

    # Fix coordinate formatting (degrees, minutes, seconds)
    text = re.sub(r'([\d\.]+)\s*°\s*([\d\.]+)\s*′\s*([\d\.]+)\s*\'\'\s*([NSEW])\s*', r'\1°\2′\3″\4 ', text)
    text = re.sub(r'([\d\.]+)\s*°\s*\s*([NSEW])\s*', r'\1°\2″ ', text)

    # Remove extra spaces
    text = re.sub(r' +', ' ', text)

    # Remove trailing/leading spaces on each line
    lines = [line.strip() for line in text.split('\n')]
    text = '\n'.join(lines)

    return text


# Example usage:
tokenized = "Sonoma diet : A diet based on portion control and centered around consuming `` power foods '' and Hans 's  door . 36 ° 5 ′ 30 '' N / 36.09167 ° N 115.17472 ° W "
detokenized = detokenize_text(tokenized)
print(detokenized)
# Output: Sonoma diet: A diet based on portion control and centered around consuming "power foods"


Sonoma diet: A diet based on portion control and centered around consuming "power foods" and Hans's door. 36°5′30″N / 36.09167°N″ 115.17472°W″


Looking at the user request, they want to:
1. Read a TSV file
2. Apply the `detokenize_text` function to columns 2 and 3 (indices 1 and 2)
3. Save the result as a JSON file using the headers from the TSV



The user wants to change the output format from JSON to JSONL (JSON Lines format), where each record is on a separate line without overall JSON array structure.



Looking at the user's request, they want to add tqdm progress bars to see the progress of the operations happening in the cell. I need to install tqdm first and then add progress bars to the apply operations on the DataFrame columns.



Looking at your request, you want to:
1. Read a JSONL file 
2. Apply `detokenize_text` to `input.text` and all `contexts[*].text` fields
3. Save the processed data as a JSON file



Looking at the user's request, they want to change the output format from JSON to JSONL (JSON Lines format), where each record is written as a separate JSON object on its own line.



Looking at the user request, they want to modify the cell to process all JSONL files in a directory instead of just a single file.



In [32]:
import json
import os
from tqdm import tqdm

# Directory containing JSONL files
directory_path = "../benchmark/nq_new/answerable_final/"

# Get all JSONL files in the directory
jsonl_files = [f for f in os.listdir(directory_path) if f.endswith('.jsonl') and not f.endswith('_processed.jsonl')]

print(f"Found {len(jsonl_files)} JSONL files to process: {jsonl_files}")

# Process each JSONL file
for jsonl_file in jsonl_files:
    jsonl_input_path = os.path.join(directory_path, jsonl_file)
    print(f"\nProcessing {jsonl_file}...")

    # Read JSONL file
    data = []
    with open(jsonl_input_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc=f"Reading {jsonl_file}"):
            if line.strip():
                data.append(json.loads(line.strip()))

    print(f"Loaded {len(data)} records from {jsonl_file}")

    # Process each record
    for record in tqdm(data, desc=f"Processing {jsonl_file}"):
        # Process input.text if it exists
        if 'input' in record and 'text' in record['input']:
            record['input']['text'] = detokenize_text(str(record['input']['text']))

        # Process contexts[*].text if contexts exist
        if 'contexts' in record and isinstance(record['contexts'], list):
            for context in record['contexts']:
                if isinstance(context, dict) and 'text' in context:
                    context['text'] = detokenize_text(str(context['text']))

    # Save to JSONL file
    jsonl_output_path = jsonl_input_path.replace('.jsonl', '_processed.jsonl')
    with open(jsonl_output_path, 'w', encoding='utf-8') as f:
        for record in tqdm(data, desc=f"Writing {jsonl_file}"):
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

    print(f"Successfully processed and saved to {os.path.basename(jsonl_output_path)}")
    print(f"Processed {len(data)} records")

print(f"\nCompleted processing all {len(jsonl_files)} JSONL files!")


Found 52 JSONL files to process: ['nq-train-37.jsonl', 'nq-train-15.jsonl', 'nq-train-07.jsonl', 'nq-dev-01.jsonl', 'nq-train-35.jsonl', 'nq-train-08.jsonl', 'nq-train-47.jsonl', 'nq-train-34.jsonl', 'nq-train-02.jsonl', 'nq-train-06.jsonl', 'nq-train-03.jsonl', 'nq-train-44.jsonl', 'nq-train-33.jsonl', 'nq-train-09.jsonl', 'nq-train-48.jsonl', 'nq-train-42.jsonl', 'nq-train-12.jsonl', 'nq-train-19.jsonl', 'nq-train-32.jsonl', 'nq-train-27.jsonl', 'nq-train-31.jsonl', 'nq-train-00.jsonl', 'nq-train-14.jsonl', 'nq-dev-00.jsonl', 'nq-train-01.jsonl', 'nq-train-17.jsonl', 'nq-train-30.jsonl', 'nq-train-04.jsonl', 'nq-train-40.jsonl', 'nq-train-26.jsonl', 'nq-train-28.jsonl', 'nq-train-11.jsonl', 'nq-train-39.jsonl', 'nq-train-25.jsonl', 'nq-train-41.jsonl', 'nq-train-49.jsonl', 'nq-train-16.jsonl', 'nq-train-29.jsonl', 'nq-train-45.jsonl', 'nq-train-23.jsonl', 'nq-train-18.jsonl', 'nq-train-10.jsonl', 'nq-train-24.jsonl', 'nq-train-38.jsonl', 'nq-train-22.jsonl', 'nq-train-46.jsonl', 'nq-

Reading nq-train-37.jsonl: 2150it [00:00, 126766.01it/s]


Loaded 2150 records from nq-train-37.jsonl


Processing nq-train-37.jsonl: 100%|██████████| 2150/2150 [00:00<00:00, 8571.87it/s]
Writing nq-train-37.jsonl: 100%|██████████| 2150/2150 [00:00<00:00, 70995.87it/s]


Successfully processed and saved to nq-train-37_processed.jsonl
Processed 2150 records

Processing nq-train-15.jsonl...


Reading nq-train-15.jsonl: 2097it [00:00, 88922.03it/s]


Loaded 2097 records from nq-train-15.jsonl


Processing nq-train-15.jsonl: 100%|██████████| 2097/2097 [00:00<00:00, 7303.93it/s]
Writing nq-train-15.jsonl: 100%|██████████| 2097/2097 [00:00<00:00, 76664.88it/s]


Successfully processed and saved to nq-train-15_processed.jsonl
Processed 2097 records

Processing nq-train-07.jsonl...


Reading nq-train-07.jsonl: 2100it [00:00, 124572.72it/s]


Loaded 2100 records from nq-train-07.jsonl


Processing nq-train-07.jsonl: 100%|██████████| 2100/2100 [00:00<00:00, 9081.23it/s]
Writing nq-train-07.jsonl: 100%|██████████| 2100/2100 [00:00<00:00, 64900.99it/s]


Successfully processed and saved to nq-train-07_processed.jsonl
Processed 2100 records

Processing nq-dev-01.jsonl...


Reading nq-dev-01.jsonl: 1279it [00:00, 88960.81it/s]


Loaded 1279 records from nq-dev-01.jsonl


Processing nq-dev-01.jsonl: 100%|██████████| 1279/1279 [00:00<00:00, 5669.64it/s]
Writing nq-dev-01.jsonl: 100%|██████████| 1279/1279 [00:00<00:00, 52898.75it/s]


Successfully processed and saved to nq-dev-01_processed.jsonl
Processed 1279 records

Processing nq-train-35.jsonl...


Reading nq-train-35.jsonl: 2065it [00:00, 120128.12it/s]


Loaded 2065 records from nq-train-35.jsonl


Processing nq-train-35.jsonl: 100%|██████████| 2065/2065 [00:00<00:00, 9674.85it/s]
Writing nq-train-35.jsonl: 100%|██████████| 2065/2065 [00:00<00:00, 65491.40it/s]


Successfully processed and saved to nq-train-35_processed.jsonl
Processed 2065 records

Processing nq-train-08.jsonl...


Reading nq-train-08.jsonl: 2127it [00:00, 103287.89it/s]


Loaded 2127 records from nq-train-08.jsonl


Processing nq-train-08.jsonl: 100%|██████████| 2127/2127 [00:00<00:00, 8483.98it/s]
Writing nq-train-08.jsonl: 100%|██████████| 2127/2127 [00:00<00:00, 75712.54it/s]


Successfully processed and saved to nq-train-08_processed.jsonl
Processed 2127 records

Processing nq-train-47.jsonl...


Reading nq-train-47.jsonl: 2155it [00:00, 114690.08it/s]


Loaded 2155 records from nq-train-47.jsonl


Processing nq-train-47.jsonl: 100%|██████████| 2155/2155 [00:00<00:00, 8079.60it/s]
Writing nq-train-47.jsonl: 100%|██████████| 2155/2155 [00:00<00:00, 62608.06it/s]


Successfully processed and saved to nq-train-47_processed.jsonl
Processed 2155 records

Processing nq-train-34.jsonl...


Reading nq-train-34.jsonl: 2121it [00:00, 102082.93it/s]


Loaded 2121 records from nq-train-34.jsonl


Processing nq-train-34.jsonl: 100%|██████████| 2121/2121 [00:00<00:00, 8405.96it/s]
Writing nq-train-34.jsonl: 100%|██████████| 2121/2121 [00:00<00:00, 53585.59it/s]


Successfully processed and saved to nq-train-34_processed.jsonl
Processed 2121 records

Processing nq-train-02.jsonl...


Reading nq-train-02.jsonl: 2136it [00:00, 14819.44it/s]


Loaded 2136 records from nq-train-02.jsonl


Processing nq-train-02.jsonl: 100%|██████████| 2136/2136 [00:00<00:00, 8717.52it/s]
Writing nq-train-02.jsonl: 100%|██████████| 2136/2136 [00:00<00:00, 74452.63it/s]


Successfully processed and saved to nq-train-02_processed.jsonl
Processed 2136 records

Processing nq-train-06.jsonl...


Reading nq-train-06.jsonl: 2110it [00:00, 114275.88it/s]


Loaded 2110 records from nq-train-06.jsonl


Processing nq-train-06.jsonl: 100%|██████████| 2110/2110 [00:00<00:00, 8771.98it/s]
Writing nq-train-06.jsonl: 100%|██████████| 2110/2110 [00:00<00:00, 72628.34it/s]


Successfully processed and saved to nq-train-06_processed.jsonl
Processed 2110 records

Processing nq-train-03.jsonl...


Reading nq-train-03.jsonl: 2083it [00:00, 135804.88it/s]


Loaded 2083 records from nq-train-03.jsonl


Processing nq-train-03.jsonl: 100%|██████████| 2083/2083 [00:00<00:00, 9416.65it/s] 
Writing nq-train-03.jsonl: 100%|██████████| 2083/2083 [00:00<00:00, 66720.65it/s]


Successfully processed and saved to nq-train-03_processed.jsonl
Processed 2083 records

Processing nq-train-44.jsonl...


Reading nq-train-44.jsonl: 2155it [00:00, 115915.27it/s]


Loaded 2155 records from nq-train-44.jsonl


Processing nq-train-44.jsonl: 100%|██████████| 2155/2155 [00:00<00:00, 10146.26it/s]
Writing nq-train-44.jsonl: 100%|██████████| 2155/2155 [00:00<00:00, 72656.23it/s]


Successfully processed and saved to nq-train-44_processed.jsonl
Processed 2155 records

Processing nq-train-33.jsonl...


Reading nq-train-33.jsonl: 2116it [00:00, 117543.84it/s]


Loaded 2116 records from nq-train-33.jsonl


Processing nq-train-33.jsonl: 100%|██████████| 2116/2116 [00:00<00:00, 9204.45it/s]
Writing nq-train-33.jsonl: 100%|██████████| 2116/2116 [00:00<00:00, 70351.13it/s]


Successfully processed and saved to nq-train-33_processed.jsonl
Processed 2116 records

Processing nq-train-09.jsonl...


Reading nq-train-09.jsonl: 2141it [00:00, 126441.54it/s]


Loaded 2141 records from nq-train-09.jsonl


Processing nq-train-09.jsonl: 100%|██████████| 2141/2141 [00:00<00:00, 7979.24it/s]
Writing nq-train-09.jsonl: 100%|██████████| 2141/2141 [00:00<00:00, 61714.85it/s]


Successfully processed and saved to nq-train-09_processed.jsonl
Processed 2141 records

Processing nq-train-48.jsonl...


Reading nq-train-48.jsonl: 2143it [00:00, 119747.85it/s]


Loaded 2143 records from nq-train-48.jsonl


Processing nq-train-48.jsonl: 100%|██████████| 2143/2143 [00:00<00:00, 7318.40it/s]
Writing nq-train-48.jsonl: 100%|██████████| 2143/2143 [00:00<00:00, 74291.00it/s]


Successfully processed and saved to nq-train-48_processed.jsonl
Processed 2143 records

Processing nq-train-42.jsonl...


Reading nq-train-42.jsonl: 2099it [00:00, 110336.30it/s]


Loaded 2099 records from nq-train-42.jsonl


Processing nq-train-42.jsonl: 100%|██████████| 2099/2099 [00:00<00:00, 9571.35it/s]
Writing nq-train-42.jsonl: 100%|██████████| 2099/2099 [00:00<00:00, 77104.96it/s]


Successfully processed and saved to nq-train-42_processed.jsonl
Processed 2099 records

Processing nq-train-12.jsonl...


Reading nq-train-12.jsonl: 2103it [00:00, 131033.06it/s]


Loaded 2103 records from nq-train-12.jsonl


Processing nq-train-12.jsonl: 100%|██████████| 2103/2103 [00:00<00:00, 9377.94it/s]
Writing nq-train-12.jsonl: 100%|██████████| 2103/2103 [00:00<00:00, 71123.72it/s]


Successfully processed and saved to nq-train-12_processed.jsonl
Processed 2103 records

Processing nq-train-19.jsonl...


Reading nq-train-19.jsonl: 2180it [00:00, 98438.76it/s]


Loaded 2180 records from nq-train-19.jsonl


Processing nq-train-19.jsonl: 100%|██████████| 2180/2180 [00:00<00:00, 8488.81it/s]
Writing nq-train-19.jsonl: 100%|██████████| 2180/2180 [00:00<00:00, 64692.11it/s]


Successfully processed and saved to nq-train-19_processed.jsonl
Processed 2180 records

Processing nq-train-32.jsonl...


Reading nq-train-32.jsonl: 2140it [00:00, 100898.29it/s]


Loaded 2140 records from nq-train-32.jsonl


Processing nq-train-32.jsonl: 100%|██████████| 2140/2140 [00:00<00:00, 8771.32it/s]
Writing nq-train-32.jsonl: 100%|██████████| 2140/2140 [00:00<00:00, 59066.15it/s]


Successfully processed and saved to nq-train-32_processed.jsonl
Processed 2140 records

Processing nq-train-27.jsonl...


Reading nq-train-27.jsonl: 2101it [00:00, 110496.83it/s]


Loaded 2101 records from nq-train-27.jsonl


Processing nq-train-27.jsonl: 100%|██████████| 2101/2101 [00:00<00:00, 9560.68it/s] 
Writing nq-train-27.jsonl: 100%|██████████| 2101/2101 [00:00<00:00, 72374.98it/s]


Successfully processed and saved to nq-train-27_processed.jsonl
Processed 2101 records

Processing nq-train-31.jsonl...


Reading nq-train-31.jsonl: 2089it [00:00, 110306.94it/s]


Loaded 2089 records from nq-train-31.jsonl


Processing nq-train-31.jsonl: 100%|██████████| 2089/2089 [00:00<00:00, 7482.60it/s]
Writing nq-train-31.jsonl: 100%|██████████| 2089/2089 [00:00<00:00, 54684.98it/s]


Successfully processed and saved to nq-train-31_processed.jsonl
Processed 2089 records

Processing nq-train-00.jsonl...


Reading nq-train-00.jsonl: 2137it [00:00, 85198.54it/s]


Loaded 2137 records from nq-train-00.jsonl


Processing nq-train-00.jsonl: 100%|██████████| 2137/2137 [00:00<00:00, 8888.75it/s]
Writing nq-train-00.jsonl: 100%|██████████| 2137/2137 [00:00<00:00, 60250.92it/s]


Successfully processed and saved to nq-train-00_processed.jsonl
Processed 2137 records

Processing nq-train-14.jsonl...


Reading nq-train-14.jsonl: 2116it [00:00, 16003.63it/s]


Loaded 2116 records from nq-train-14.jsonl


Processing nq-train-14.jsonl: 100%|██████████| 2116/2116 [00:00<00:00, 9588.92it/s]
Writing nq-train-14.jsonl: 100%|██████████| 2116/2116 [00:00<00:00, 78632.98it/s]


Successfully processed and saved to nq-train-14_processed.jsonl
Processed 2116 records

Processing nq-dev-00.jsonl...


Reading nq-dev-00.jsonl: 1299it [00:00, 113890.36it/s]


Loaded 1299 records from nq-dev-00.jsonl


Processing nq-dev-00.jsonl: 100%|██████████| 1299/1299 [00:00<00:00, 4818.35it/s]
Writing nq-dev-00.jsonl: 100%|██████████| 1299/1299 [00:00<00:00, 44528.72it/s]


Successfully processed and saved to nq-dev-00_processed.jsonl
Processed 1299 records

Processing nq-train-01.jsonl...


Reading nq-train-01.jsonl: 2087it [00:00, 93390.72it/s]


Loaded 2087 records from nq-train-01.jsonl


Processing nq-train-01.jsonl: 100%|██████████| 2087/2087 [00:00<00:00, 9196.36it/s]
Writing nq-train-01.jsonl: 100%|██████████| 2087/2087 [00:00<00:00, 76826.98it/s]


Successfully processed and saved to nq-train-01_processed.jsonl
Processed 2087 records

Processing nq-train-17.jsonl...


Reading nq-train-17.jsonl: 2134it [00:00, 125357.41it/s]


Loaded 2134 records from nq-train-17.jsonl


Processing nq-train-17.jsonl: 100%|██████████| 2134/2134 [00:00<00:00, 8476.63it/s]
Writing nq-train-17.jsonl: 100%|██████████| 2134/2134 [00:00<00:00, 74572.55it/s]


Successfully processed and saved to nq-train-17_processed.jsonl
Processed 2134 records

Processing nq-train-30.jsonl...


Reading nq-train-30.jsonl: 2108it [00:00, 111254.19it/s]


Loaded 2108 records from nq-train-30.jsonl


Processing nq-train-30.jsonl: 100%|██████████| 2108/2108 [00:00<00:00, 7987.33it/s]
Writing nq-train-30.jsonl: 100%|██████████| 2108/2108 [00:00<00:00, 54361.63it/s]


Successfully processed and saved to nq-train-30_processed.jsonl
Processed 2108 records

Processing nq-train-04.jsonl...


Reading nq-train-04.jsonl: 2092it [00:00, 84751.42it/s]


Loaded 2092 records from nq-train-04.jsonl


Processing nq-train-04.jsonl: 100%|██████████| 2092/2092 [00:00<00:00, 8665.75it/s]
Writing nq-train-04.jsonl: 100%|██████████| 2092/2092 [00:00<00:00, 74963.55it/s]


Successfully processed and saved to nq-train-04_processed.jsonl
Processed 2092 records

Processing nq-train-40.jsonl...


Reading nq-train-40.jsonl: 2093it [00:00, 110322.32it/s]


Loaded 2093 records from nq-train-40.jsonl


Processing nq-train-40.jsonl: 100%|██████████| 2093/2093 [00:00<00:00, 10060.65it/s]
Writing nq-train-40.jsonl: 100%|██████████| 2093/2093 [00:00<00:00, 73625.63it/s]


Successfully processed and saved to nq-train-40_processed.jsonl
Processed 2093 records

Processing nq-train-26.jsonl...


Reading nq-train-26.jsonl: 2073it [00:00, 127293.64it/s]


Loaded 2073 records from nq-train-26.jsonl


Processing nq-train-26.jsonl: 100%|██████████| 2073/2073 [00:00<00:00, 9891.17it/s] 
Writing nq-train-26.jsonl: 100%|██████████| 2073/2073 [00:00<00:00, 73960.46it/s]


Successfully processed and saved to nq-train-26_processed.jsonl
Processed 2073 records

Processing nq-train-28.jsonl...


Reading nq-train-28.jsonl: 2063it [00:00, 121067.97it/s]


Loaded 2063 records from nq-train-28.jsonl


Processing nq-train-28.jsonl: 100%|██████████| 2063/2063 [00:00<00:00, 8587.44it/s]
Writing nq-train-28.jsonl: 100%|██████████| 2063/2063 [00:00<00:00, 64026.88it/s]


Successfully processed and saved to nq-train-28_processed.jsonl
Processed 2063 records

Processing nq-train-11.jsonl...


Reading nq-train-11.jsonl: 2086it [00:00, 129416.30it/s]


Loaded 2086 records from nq-train-11.jsonl


Processing nq-train-11.jsonl: 100%|██████████| 2086/2086 [00:00<00:00, 9176.07it/s]
Writing nq-train-11.jsonl: 100%|██████████| 2086/2086 [00:00<00:00, 67018.91it/s]


Successfully processed and saved to nq-train-11_processed.jsonl
Processed 2086 records

Processing nq-train-39.jsonl...


Reading nq-train-39.jsonl: 2161it [00:00, 106691.75it/s]


Loaded 2161 records from nq-train-39.jsonl


Processing nq-train-39.jsonl: 100%|██████████| 2161/2161 [00:00<00:00, 8828.32it/s]
Writing nq-train-39.jsonl: 100%|██████████| 2161/2161 [00:00<00:00, 75029.73it/s]


Successfully processed and saved to nq-train-39_processed.jsonl
Processed 2161 records

Processing nq-train-25.jsonl...


Reading nq-train-25.jsonl: 2252it [00:00, 129625.80it/s]


Loaded 2252 records from nq-train-25.jsonl


Processing nq-train-25.jsonl: 100%|██████████| 2252/2252 [00:00<00:00, 7614.79it/s]
Writing nq-train-25.jsonl: 100%|██████████| 2252/2252 [00:00<00:00, 50202.89it/s]


Successfully processed and saved to nq-train-25_processed.jsonl
Processed 2252 records

Processing nq-train-41.jsonl...


Reading nq-train-41.jsonl: 2130it [00:00, 97412.20it/s]


Loaded 2130 records from nq-train-41.jsonl


Processing nq-train-41.jsonl: 100%|██████████| 2130/2130 [00:00<00:00, 8148.14it/s]
Writing nq-train-41.jsonl: 100%|██████████| 2130/2130 [00:00<00:00, 68448.26it/s]


Successfully processed and saved to nq-train-41_processed.jsonl
Processed 2130 records

Processing nq-train-49.jsonl...


Reading nq-train-49.jsonl: 2147it [00:00, 16645.02it/s]


Loaded 2147 records from nq-train-49.jsonl


Processing nq-train-49.jsonl: 100%|██████████| 2147/2147 [00:00<00:00, 8879.80it/s]
Writing nq-train-49.jsonl: 100%|██████████| 2147/2147 [00:00<00:00, 67586.60it/s]


Successfully processed and saved to nq-train-49_processed.jsonl
Processed 2147 records

Processing nq-train-16.jsonl...


Reading nq-train-16.jsonl: 2103it [00:00, 108943.63it/s]


Loaded 2103 records from nq-train-16.jsonl


Processing nq-train-16.jsonl: 100%|██████████| 2103/2103 [00:00<00:00, 8808.52it/s]
Writing nq-train-16.jsonl: 100%|██████████| 2103/2103 [00:00<00:00, 78589.25it/s]


Successfully processed and saved to nq-train-16_processed.jsonl
Processed 2103 records

Processing nq-train-29.jsonl...


Reading nq-train-29.jsonl: 2136it [00:00, 126240.47it/s]


Loaded 2136 records from nq-train-29.jsonl


Processing nq-train-29.jsonl: 100%|██████████| 2136/2136 [00:00<00:00, 8211.57it/s] 
Writing nq-train-29.jsonl: 100%|██████████| 2136/2136 [00:00<00:00, 75071.50it/s]


Successfully processed and saved to nq-train-29_processed.jsonl
Processed 2136 records

Processing nq-train-45.jsonl...


Reading nq-train-45.jsonl: 2109it [00:00, 140552.10it/s]


Loaded 2109 records from nq-train-45.jsonl


Processing nq-train-45.jsonl: 100%|██████████| 2109/2109 [00:00<00:00, 8593.42it/s]
Writing nq-train-45.jsonl: 100%|██████████| 2109/2109 [00:00<00:00, 77312.50it/s]


Successfully processed and saved to nq-train-45_processed.jsonl
Processed 2109 records

Processing nq-train-23.jsonl...


Reading nq-train-23.jsonl: 2176it [00:00, 113576.81it/s]


Loaded 2176 records from nq-train-23.jsonl


Processing nq-train-23.jsonl: 100%|██████████| 2176/2176 [00:00<00:00, 6242.33it/s]
Writing nq-train-23.jsonl: 100%|██████████| 2176/2176 [00:00<00:00, 68991.94it/s]


Successfully processed and saved to nq-train-23_processed.jsonl
Processed 2176 records

Processing nq-train-18.jsonl...


Reading nq-train-18.jsonl: 2095it [00:00, 114577.55it/s]


Loaded 2095 records from nq-train-18.jsonl


Processing nq-train-18.jsonl: 100%|██████████| 2095/2095 [00:00<00:00, 9587.07it/s]
Writing nq-train-18.jsonl: 100%|██████████| 2095/2095 [00:00<00:00, 69565.26it/s]


Successfully processed and saved to nq-train-18_processed.jsonl
Processed 2095 records

Processing nq-train-10.jsonl...


Reading nq-train-10.jsonl: 2131it [00:00, 124544.52it/s]


Loaded 2131 records from nq-train-10.jsonl


Processing nq-train-10.jsonl: 100%|██████████| 2131/2131 [00:00<00:00, 7584.41it/s]
Writing nq-train-10.jsonl: 100%|██████████| 2131/2131 [00:00<00:00, 64103.84it/s]


Successfully processed and saved to nq-train-10_processed.jsonl
Processed 2131 records

Processing nq-train-24.jsonl...


Reading nq-train-24.jsonl: 2082it [00:00, 111586.56it/s]


Loaded 2082 records from nq-train-24.jsonl


Processing nq-train-24.jsonl: 100%|██████████| 2082/2082 [00:00<00:00, 6870.77it/s]
Writing nq-train-24.jsonl: 100%|██████████| 2082/2082 [00:00<00:00, 58208.79it/s]


Successfully processed and saved to nq-train-24_processed.jsonl
Processed 2082 records

Processing nq-train-38.jsonl...


Reading nq-train-38.jsonl: 2138it [00:00, 103287.51it/s]


Loaded 2138 records from nq-train-38.jsonl


Processing nq-train-38.jsonl: 100%|██████████| 2138/2138 [00:00<00:00, 8550.56it/s]
Writing nq-train-38.jsonl: 100%|██████████| 2138/2138 [00:00<00:00, 64492.48it/s]


Successfully processed and saved to nq-train-38_processed.jsonl
Processed 2138 records

Processing nq-train-22.jsonl...


Reading nq-train-22.jsonl: 2113it [00:00, 113695.50it/s]


Loaded 2113 records from nq-train-22.jsonl


Processing nq-train-22.jsonl: 100%|██████████| 2113/2113 [00:00<00:00, 8835.79it/s]
Writing nq-train-22.jsonl: 100%|██████████| 2113/2113 [00:00<00:00, 77628.75it/s]


Successfully processed and saved to nq-train-22_processed.jsonl
Processed 2113 records

Processing nq-train-46.jsonl...


Reading nq-train-46.jsonl: 2086it [00:00, 127828.04it/s]


Loaded 2086 records from nq-train-46.jsonl


Processing nq-train-46.jsonl: 100%|██████████| 2086/2086 [00:00<00:00, 9549.58it/s]
Writing nq-train-46.jsonl: 100%|██████████| 2086/2086 [00:00<00:00, 65115.57it/s]


Successfully processed and saved to nq-train-46_processed.jsonl
Processed 2086 records

Processing nq-train-36.jsonl...


Reading nq-train-36.jsonl: 2149it [00:00, 14449.37it/s]


Loaded 2149 records from nq-train-36.jsonl


Processing nq-train-36.jsonl: 100%|██████████| 2149/2149 [00:00<00:00, 9433.11it/s]
Writing nq-train-36.jsonl: 100%|██████████| 2149/2149 [00:00<00:00, 77186.08it/s]


Successfully processed and saved to nq-train-36_processed.jsonl
Processed 2149 records

Processing nq-train-13.jsonl...


Reading nq-train-13.jsonl: 2142it [00:00, 133888.69it/s]


Loaded 2142 records from nq-train-13.jsonl


Processing nq-train-13.jsonl: 100%|██████████| 2142/2142 [00:00<00:00, 9284.35it/s]
Writing nq-train-13.jsonl: 100%|██████████| 2142/2142 [00:00<00:00, 76601.43it/s]


Successfully processed and saved to nq-train-13_processed.jsonl
Processed 2142 records

Processing nq-train-43.jsonl...


Reading nq-train-43.jsonl: 2098it [00:00, 127903.75it/s]


Loaded 2098 records from nq-train-43.jsonl


Processing nq-train-43.jsonl: 100%|██████████| 2098/2098 [00:00<00:00, 7775.88it/s]
Writing nq-train-43.jsonl: 100%|██████████| 2098/2098 [00:00<00:00, 70742.99it/s]


Successfully processed and saved to nq-train-43_processed.jsonl
Processed 2098 records

Processing nq-train-05.jsonl...


Reading nq-train-05.jsonl: 2087it [00:00, 127516.72it/s]


Loaded 2087 records from nq-train-05.jsonl


Processing nq-train-05.jsonl: 100%|██████████| 2087/2087 [00:00<00:00, 10111.81it/s]
Writing nq-train-05.jsonl: 100%|██████████| 2087/2087 [00:00<00:00, 79741.22it/s]


Successfully processed and saved to nq-train-05_processed.jsonl
Processed 2087 records

Processing nq-train-21.jsonl...


Reading nq-train-21.jsonl: 2129it [00:00, 125513.71it/s]


Loaded 2129 records from nq-train-21.jsonl


Processing nq-train-21.jsonl: 100%|██████████| 2129/2129 [00:00<00:00, 8933.13it/s]
Writing nq-train-21.jsonl: 100%|██████████| 2129/2129 [00:00<00:00, 72701.22it/s]


Successfully processed and saved to nq-train-21_processed.jsonl
Processed 2129 records

Processing nq-train-20.jsonl...


Reading nq-train-20.jsonl: 2051it [00:00, 109232.77it/s]


Loaded 2051 records from nq-train-20.jsonl


Processing nq-train-20.jsonl: 100%|██████████| 2051/2051 [00:00<00:00, 8729.84it/s]
Writing nq-train-20.jsonl: 100%|██████████| 2051/2051 [00:00<00:00, 72194.21it/s]

Successfully processed and saved to nq-train-20_processed.jsonl
Processed 2051 records

Completed processing all 52 JSONL files!





Looking at the user request, they want to:
1. Read a TSV file
2. Apply the `detokenize_text` function to columns 2 and 3 (indices 1 and 2)
3. Save as JSONL format using headers
4. Show progress with tqdm
5. Don't force ASCII in output



In [33]:
import pandas as pd
from tqdm import tqdm
import json

# Read TSV file
tsv_file_path = "../benchmark/clapnq/fixed_passages.tsv"
df = pd.read_csv(tsv_file_path, sep='\t')

print(f"Loaded {len(df)} records from TSV file")
print(f"Columns: {list(df.columns)}")

# Apply detokenize_text to second and third columns (indices 1 and 2)
tqdm.pandas(desc="Processing column 2")
df.iloc[:, 1] = df.iloc[:, 1].astype(str).progress_apply(detokenize_text)

tqdm.pandas(desc="Processing column 3")
df.iloc[:, 2] = df.iloc[:, 2].astype(str).progress_apply(detokenize_text)

# Convert to JSONL format
jsonl_output_path = tsv_file_path.replace('.tsv', '.jsonl')
with open(jsonl_output_path, 'w', encoding='utf-8') as f:
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Writing JSONL"):
        record = row.to_dict()
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"Successfully saved {len(df)} records to {jsonl_output_path}")


Loaded 178890 records from TSV file
Columns: ['id', 'text', 'title', 'docid']


Processing column 2: 100%|██████████| 178890/178890 [00:09<00:00, 18075.31it/s]
Processing column 3: 100%|██████████| 178890/178890 [00:00<00:00, 233339.14it/s]
Writing JSONL: 100%|██████████| 178890/178890 [00:07<00:00, 25265.18it/s]

Successfully saved 178890 records to ../benchmark/clapnq/fixed_passages.jsonl



