In [0]:
import re
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from html import escape as html_escape
from urllib.parse import quote

In [0]:
def _last_modified_ms(last_modified):
    """Convert an HTTP ``Last-Modified`` header value to epoch milliseconds.

    Args:
        last_modified: Raw header value, or None when the header is absent.

    Returns:
        The timestamp as an int number of milliseconds since the Unix epoch,
        or None when the header is missing or cannot be parsed.
    """
    if not last_modified:
        return None
    try:
        source_dt = parsedate_to_datetime(last_modified)
    except (TypeError, ValueError):
        # Older Python versions raise TypeError, newer ones ValueError.
        return None
    if source_dt.tzinfo is None:
        # A "-0000" offset parses as a naive datetime. Pin it to UTC so that
        # .timestamp() does not silently interpret it in the local timezone.
        source_dt = source_dt.replace(tzinfo=timezone.utc)
    return int(source_dt.timestamp() * 1000)


def _is_up_to_date(target_folder, filename, source_timestamp_ms):
    """Return True if the Volume already holds ``filename`` at least as new as the source.

    ``modificationTime`` of each listed entry is compared directly against
    ``source_timestamp_ms`` (epoch milliseconds).
    """
    try:
        return any(
            f.name == filename and f.modificationTime >= source_timestamp_ms
            for f in dbutils.fs.ls(target_folder)
        )
    except Exception:
        # Best effort: if the folder cannot be listed (e.g. it does not exist
        # yet) treat the file as stale so that it gets downloaded.
        return False


def import_file(base_url, target_folder, filename):
    """Download a single file and save to Databricks Volume

    The source's ``Last-Modified`` header is compared with the modification
    time of the copy already in the Volume; the body is only downloaded and
    written when the Volume copy is missing or older. When the header is
    missing or unparseable the file is always downloaded.

    Args:
        base_url: URL prefix; ``filename`` is appended to it verbatim.
        target_folder: Volume folder path; ``filename`` is appended verbatim,
            so it is expected to end with a path separator.
        filename: Name of the file to fetch.

    Returns:
        ``filename``, both when the file was written and when it was skipped.

    Raises:
        Exception: Wraps any failure, with the file name in the message and
            the original exception attached as ``__cause__``.
    """
    try:
        file_url = base_url + filename
        volume_path = target_folder + filename

        # Fetch file from source with User-Agent header to comply with BLS policies
        req = urllib.request.Request(file_url, headers={'User-Agent': 'your-email@example.com'})
        with urllib.request.urlopen(req) as response:
            # Headers are available before the body is read, so the freshness
            # check can skip the download entirely for unchanged files.
            source_timestamp_ms = _last_modified_ms(response.headers.get("Last-Modified"))
            if source_timestamp_ms is not None and _is_up_to_date(
                target_folder, filename, source_timestamp_ms
            ):
                print(f"✓ File already up-to-date: {volume_path}")
                return filename
            file_content = response.read()

        # Write file to Volume using dbutils. latin-1 maps every byte to a code
        # point, so decoding cannot fail.
        # NOTE(review): if dbutils.fs.put re-encodes the string as UTF-8,
        # non-ASCII bytes will not round-trip byte-for-byte — confirm.
        dbutils.fs.put(volume_path, file_content.decode('latin-1'), overwrite=True)
        print(f"✓ Imported file: {volume_path}")

        return filename
    except Exception as e:
        raise Exception(f"✗ Failed to import file: {filename}; {str(e)}") from e





In [0]:
def create_index_html_file(target_folder):
    """Create an index.html file in the Databricks Volume

    Lists ``target_folder`` and writes ``index.html`` into it, containing one
    link per listed entry. Entries are emitted in sorted order so the page is
    identical from run to run. Names are HTML-escaped in the link text and
    URL-quoted in the ``href``. The listing is not filtered, so an
    ``index.html`` left by a previous run is itself listed.

    Args:
        target_folder: Volume folder path; ``'index.html'`` is appended to it
            verbatim, so it is expected to end with a path separator.

    Raises:
        Exception: Wraps any failure, with the original exception attached
            as ``__cause__``.
    """
    try:
        # List files in the Volume folder; a set removes duplicate names and
        # sorting gives a deterministic link order.
        existing_files = dbutils.fs.ls(target_folder)
        target_files = sorted({f.name for f in existing_files if f.name})

        # Generate HTML content
        items = "".join(
            f"<li><a href='{html_escape(quote(name))}'>{html_escape(name)}</a></li>"
            for name in target_files
        )
        html_content = f"<html><body><ul>{items}</ul></body></html>"

        # Write index.html to Volume
        index_path = target_folder + 'index.html'
        dbutils.fs.put(index_path, html_content, overwrite=True)
        print("✓ Created index.html file")

    except Exception as e:
        raise Exception(f"✗ Failed to create index.html file; {str(e)}") from e

In [0]:
def _collect_errors(futures):
    """Wait for every future and return the messages of those that raised.

    Each failure message is printed as soon as its future completes, then
    collected so the caller can report all failures together.
    """
    errors = []
    for future in as_completed(futures):
        try:
            future.result()
        except Exception as e:
            print(str(e))
            errors.append(str(e))
    return errors


# Mirror the BLS "pr" time-series directory into the Volume:
# scrape the source listing, import every listed file, delete Volume files
# that are no longer listed, then regenerate index.html.
try:
    # Get configurations
    base_url = 'https://download.bls.gov/pub/time.series/pr/'
    target_folder = '/Volumes/rearcquest/default/bls_data_raw/'

    # Open link and scrape source filelist; the context manager closes the
    # connection once the listing page has been read.
    req = urllib.request.Request(base_url, headers={'User-Agent': 'your-email@example.com'})
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')

    # Get source filelist: link texts starting with "pr", excluding absolute
    # URLs and directory entries.
    pattern = r'<a[^>]*href="[^"]*">(pr[^<]+)</a>'
    source_files = re.findall(pattern, html, flags=re.IGNORECASE)
    source_files = {f for f in source_files if '://' not in f and not f.endswith('/')}

    # An empty listing would mark every mirrored file for deletion below, so
    # treat it as a scrape failure instead of a legitimate empty source.
    if not source_files:
        raise Exception(f"✗ No source files found at {base_url}")

    # Get target filelist from Volume
    try:
        existing_files = dbutils.fs.ls(target_folder)
        target_files = {f.name for f in existing_files if f.name}
    except Exception:
        target_files = set()  # Target folder doesn't exist yet

    # index.html is generated locally and never present at the source.
    files_to_delete = {f for f in target_files - source_files if f != 'index.html'}

    # Import files
    with ThreadPoolExecutor(max_workers=5) as executor:
        import_errors = _collect_errors(
            [executor.submit(import_file, base_url, target_folder, f)
             for f in source_files]
        )
    if import_errors:
        raise Exception(f"✗ Failed to import files: {', '.join(import_errors)}")

    # Delete files
    # NOTE(review): delete_file is not defined in the visible cells — confirm
    # it exists, otherwise this raises NameError as soon as a file is stale.
    with ThreadPoolExecutor(max_workers=5) as executor:
        delete_errors = _collect_errors(
            [executor.submit(delete_file, target_folder, f)
             for f in files_to_delete]
        )
    if delete_errors:
        raise Exception(f"✗ Failed to delete files: {', '.join(delete_errors)}")

    # Create index file
    create_index_html_file(target_folder)

    print(f"✓ Sync complete! Files are in {target_folder}")

except Exception as e:
    # Chain the original exception so its traceback is preserved.
    raise Exception(f"Error: {str(e)}") from e
    # Todo - better error handling