In [None]:
# Install the third-party packages imported further down (bs4 and ftfy); -q keeps pip's output quiet.
!pip install beautifulsoup4 ftfy -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive at /content/drive; the input and output paths used
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import random
import re
import warnings

import ftfy
from bs4 import BeautifulSoup
from bs4 import MarkupResemblesLocatorWarning

# Source dataset: read line by line, one JSON document per line.
input_file_path = '/content/drive/MyDrive/trn.json'

# Destination for the sanitized sample, written as JSON Lines.
output_file_path = '/content/drive/MyDrive/formatted_amazon_products_50k_sanitized.jsonl'

# Upper bound on how many sanitized records end up in the output file.
num_records_to_save = 50_000

def sanitize_text(text):
    """
    Cleans a string by fixing encoding, removing HTML, and normalizing whitespace.

    Args:
        text: The raw value to clean. Anything that is not a ``str`` is
            treated as missing.

    Returns:
        The cleaned string. Returns "" when ``text`` is not a string, and
        may also be "" when only whitespace is left after cleaning.
    """
    if not isinstance(text, str):
        return ""

    # Repair encoding damage before the markup is parsed.
    text = ftfy.fix_text(text)

    # Short plain-text values that look like a URL or a file name make
    # BeautifulSoup emit MarkupResemblesLocatorWarning. The input here is
    # record text, never a locator, so silence only that warning category
    # and leave every other warning visible.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning)
        text = BeautifulSoup(text, "html.parser").get_text()

    # split() without arguments discards leading/trailing whitespace and
    # collapses internal runs, so joining on a single space is enough.
    return ' '.join(text.split())

# Three-phase pipeline:
#   1. read the input line by line and keep every record whose sanitized
#      title and content are both non-empty,
#   2. pick at most `num_records_to_save` of them (random sample if there
#      are more),
#   3. write the selection as JSON Lines.
valid_records = []
skipped_lines = 0  # lines that were malformed JSON or not a JSON object

print(f"--- Phase 1: Reading and sanitizing '{input_file_path}' ---")
try:
    with open(input_file_path, 'r', encoding='utf-8') as f_in:
        for line_number, line in enumerate(f_in, start=1):
            # Only the parse itself is guarded, so unrelated errors are
            # not silently swallowed.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                skipped_lines += 1
            else:
                if not isinstance(data, dict):
                    # Valid JSON, but a list/string/number/null has no
                    # .get(); skip it instead of crashing the whole run.
                    skipped_lines += 1
                elif data.get('title') and data.get('content'):
                    clean_title = sanitize_text(data['title'])
                    clean_content = sanitize_text(data['content'])

                    # Sanitizing can reduce a field to "", so re-check.
                    if clean_title and clean_content:
                        valid_records.append({
                            'title': clean_title,
                            'content': clean_content,
                        })

            # Report progress for every line, including skipped ones.
            if line_number % 200000 == 0:
                print(f"Scanned {line_number} lines...")

except FileNotFoundError:
    print(f"❌ Error: The file '{input_file_path}' was not found.")
    # exit() is an interactive helper injected by `site` and reports
    # success; raise SystemExit with a non-zero status instead.
    raise SystemExit(1)

total_valid = len(valid_records)
print(f"✅ Phase 1 complete. Found {total_valid} valid and sanitized records.\n")
if skipped_lines:
    print(f"Skipped {skipped_lines} lines that were not valid JSON objects.\n")

print("--- Phase 2: Selecting records to save ---")
final_records = []

if total_valid == 0:
    print("No valid records were found. The output file will be empty.")
elif total_valid > num_records_to_save:
    print(f"Found more than {num_records_to_save} records. Taking a random sample.")
    final_records = random.sample(valid_records, num_records_to_save)
else:
    print(f"Found {total_valid} records, which is less than or equal to {num_records_to_save}. Saving all of them.")
    final_records = valid_records

print(f"✅ Phase 2 complete. Selected {len(final_records)} records.\n")

print(f"--- Phase 3: Writing {len(final_records)} records to '{output_file_path}' ---")
with open(output_file_path, 'w', encoding='utf-8') as f_out:
    for record in final_records:
        f_out.write(json.dumps(record) + '\n')

print("\n----------------------------------------------------")
print(f"🎉 Success! A sanitized sample of {len(final_records)} records has been saved.")


--- Phase 1: Reading and sanitizing '/content/drive/MyDrive/trn.json' ---



If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  text = BeautifulSoup(text, "html.parser").get_text()

If you meant to use Beautiful Soup to parse the contents of a file on disk, then something has gone wrong. You should open the file first, using code like this:

    filehandle = open(your filename)

You can then feed the open filehandle into Beautiful Soup instead of using the filename.



    
  text = BeautifulSoup(text, "html.parser").get_text()


Scanned 200000 lines...
Scanned 400000 lines...
Scanned 600000 lines...
Scanned 800000 lines...
Scanned 1000000 lines...
Scanned 1200000 lines...
Scanned 1400000 lines...
Scanned 1600000 lines...
Scanned 1800000 lines...
Scanned 2000000 lines...
Scanned 2200000 lines...
✅ Phase 1 complete. Found 1390076 valid and sanitized records.

--- Phase 2: Selecting records to save ---
Found more than 50000 records. Taking a random sample.
✅ Phase 2 complete. Selected 50000 records.

--- Phase 3: Writing 50000 records to '/content/drive/MyDrive/formatted_amazon_products_50k_sanitized.jsonl' ---

----------------------------------------------------
🎉 Success! A sanitized sample of 50000 records has been saved.
