In [1]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this runtime.
# NOTE(review): RAW_DIR / CLEAN_DIR further down are relative paths, not paths
# under /content/drive — confirm the working directory points inside the mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Step 2: Preprocessing Transportation Documents
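This notebook loads raw `.txt` files from `../data/raw/`, normalizes their whitespace, and writes the cleaned copies to `../data/cleaned/`. It can then optionally split each cleaned file into overlapping character chunks for embedding or fine-tuning.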

In [None]:
import os
import re
import glob
from pathlib import Path

# Input directory: raw .txt documents are read from here.
RAW_DIR = '../data/raw'
# Output directory: cleaned files (and, later, chunk files) are written here.
CLEAN_DIR = '../data/cleaned'
# Create the output directory up front; exist_ok makes re-running this cell safe.
os.makedirs(CLEAN_DIR, exist_ok=True)


## Load and Clean Text Files

In [None]:
def clean_text(text):
    """Normalize whitespace in ``text`` while keeping its line structure.

    Runs of horizontal whitespace (spaces, tabs, carriage returns, ...) are
    collapsed to a single space, and runs of newlines (together with any
    spaces around them) are collapsed to a single newline. Leading and
    trailing whitespace is removed.

    Args:
        text: The raw document text.

    Returns:
        The cleaned text.
    """
    # Collapse horizontal whitespace only. Matching plain ``\s+`` here would
    # also swallow every newline, leaving the newline pass below with nothing
    # to do and flattening the document onto a single line.
    text = re.sub(r'[^\S\n]+', ' ', text)
    # Collapse blank lines, dropping the spaces that hug each line break.
    text = re.sub(r' ?\n[ \n]*', '\n', text)
    return text.strip()

# Clean every raw .txt document and save it to CLEAN_DIR as <stem>_cleaned.txt.
for file_path in glob.glob(f'{RAW_DIR}/*.txt'):
    # Undecodable bytes are dropped (errors='ignore') rather than raising.
    raw_text = Path(file_path).read_text(encoding='utf-8', errors='ignore')
    cleaned = clean_text(raw_text)
    fname = Path(file_path).stem + '_cleaned.txt'
    Path(CLEAN_DIR, fname).write_text(cleaned, encoding='utf-8')
    print(f'Cleaned: {fname}')

## Optional: Split Text into Chunks

In [None]:
CHUNK_SIZE = 1000  # maximum number of characters per chunk
OVERLAP = 200      # characters shared between consecutive chunks


def split_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split ``text`` into overlapping, fixed-size character chunks.

    Consecutive chunks start ``size - overlap`` characters apart. Chunking
    stops as soon as a chunk reaches the end of ``text``, so no trailing
    chunk is emitted that only repeats the tail of the previous one.

    Args:
        text: The string to split.
        size: Maximum chunk length in characters; must be positive.
        overlap: Characters shared by neighbouring chunks; must satisfy
            ``0 <= overlap < size``.

    Returns:
        A list of chunk strings (empty when ``text`` is empty).

    Raises:
        ValueError: If ``size`` or ``overlap`` is out of range.
    """
    if size <= 0:
        raise ValueError(f'size must be positive, got {size}')
    if not 0 <= overlap < size:
        # overlap == size would make the step 0; overlap > size would make it
        # negative and silently yield no chunks at all.
        raise ValueError(
            f'overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}'
        )

    step = size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + size])
        # This chunk already covers the end of the text; any further chunk
        # would be fully contained in it.
        if start + size >= len(text):
            break
    return chunks

# Split every cleaned document into chunk files written next to it:
# <name>_cleaned.txt -> <name>_chunk_0.txt, <name>_chunk_1.txt, ...
CLEANED_SUFFIX = '_cleaned.txt'

# sorted() gives a deterministic processing order across runs.
for file_path in sorted(glob.glob(f'{CLEAN_DIR}/*_cleaned.txt')):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    chunks = split_text(text)

    # Strip only the trailing suffix. str.replace would rewrite every
    # occurrence of '_cleaned.txt' anywhere in the path.
    base_path = file_path[:-len(CLEANED_SUFFIX)]

    # Remove chunk files from an earlier run whose index is beyond the new
    # chunk count (e.g. after a larger chunk size), so stale chunks are not
    # picked up downstream. The regex restricts deletion to exactly
    # '<name>_chunk_<digits>.txt' for this document.
    chunk_name = re.compile(re.escape(os.path.basename(base_path)) + r'_chunk_(\d+)\.txt')
    for old_path in glob.glob(glob.escape(base_path) + '_chunk_*.txt'):
        match = chunk_name.fullmatch(os.path.basename(old_path))
        if match and int(match.group(1)) >= len(chunks):
            os.remove(old_path)

    for i, chunk in enumerate(chunks):
        out_path = f'{base_path}_chunk_{i}.txt'
        with open(out_path, 'w', encoding='utf-8') as out:
            out.write(chunk)
    print(f'Chunked: {file_path}')