# In [1]:  (Jupyter cell marker left over from the notebook export)
import json
import os
import boto3
import requests
from botocore.exceptions import ClientError
from bs4 import BeautifulSoup
import urllib.parse
import logging

# Configure logging.
# getLogger() with no name returns the root logger; its level is lowered to
# INFO so the info-level messages emitted by the code below are not filtered
# out by level (the root logger's default level is WARNING).
logger = logging.getLogger()
logger.setLevel(logging.INFO)

class S3FileSynchronizer:
    """
    Mirror the files linked from an HTTP directory listing into an S3 bucket.

    Files that appear in the listing but not in the bucket are downloaded and
    uploaded; objects in the bucket that are no longer linked from the listing
    are deleted. Objects that already exist in the bucket are left untouched
    (their content is not compared or refreshed).

    All network/S3 errors are logged and swallowed by the individual methods,
    so a failure on one file does not abort the rest of the run.
    """

    # Seconds to wait on the source server per request. Without a timeout,
    # requests can block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, s3_bucket, source_url):
        """
        Initialize the synchronizer with S3 and source URL details.

        Args:
            s3_bucket: Name of the destination bucket.
            source_url: URL of the directory listing page; a trailing slash
                is stripped and re-added where needed.
        """
        self.s3_client = boto3.client('s3')
        self.s3_bucket = s3_bucket
        self.source_url = source_url.rstrip('/')
        # Sent with every HTTP request made to the source.
        self.headers = {
            'User-Agent': 'MyCustomScript/1.0 email me at pandeyrajnish86@yahoo.com',
            'Referer': 'https://download.bls.gov/'
        }

    def get_remote_file_list(self):
        """
        Fetch the list of file links from the source URL.

        Returns:
            The ``href`` value of every ``<a>`` tag on the page whose href is
            non-empty and does not end with ``/`` (directory links, including
            ``../``, are skipped). Returns an empty list if the request fails.
        """
        try:
            response = requests.get(
                self.source_url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"Error fetching file list: {e}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        hrefs = (link.get('href') for link in soup.find_all('a'))
        return [href for href in hrefs if href and not href.endswith('/')]

    def get_s3_file_list(self):
        """
        List the keys of all objects in the S3 bucket.

        A paginator is used because a single ``list_objects_v2`` call returns
        at most one page of keys; larger buckets would otherwise be truncated.

        Returns:
            A list of object keys, or an empty list if listing fails.
        """
        try:
            paginator = self.s3_client.get_paginator('list_objects_v2')
            keys = []
            for page in paginator.paginate(Bucket=self.s3_bucket):
                # Pages of an empty bucket carry no 'Contents' entry.
                keys.extend(obj['Key'] for obj in page.get('Contents', []))
            return keys
        except ClientError as e:
            logger.error(f"Error listing S3 objects: {e}")
            return []

    def download_file(self, filename):
        """
        Download one file from the source.

        Args:
            filename: An href as returned by ``get_remote_file_list``; it is
                resolved against the source URL with ``urljoin``.

        Returns:
            The response body as bytes, or ``None`` if the request fails.
        """
        try:
            full_url = urllib.parse.urljoin(self.source_url + '/', filename)
            response = requests.get(
                full_url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response.content
        except requests.RequestException as e:
            logger.error(f"Error downloading {filename}: {e}")
            return None

    def upload_to_s3(self, filename, content):
        """
        Upload ``content`` to the bucket under the key ``filename``.

        Upload errors are logged, not raised.
        """
        try:
            self.s3_client.put_object(
                Bucket=self.s3_bucket,
                Key=filename,
                Body=content
            )
            logger.info(f"Uploaded {filename} to S3")
        except ClientError as e:
            logger.error(f"Error uploading {filename} to S3: {e}")

    def delete_from_s3(self, filename):
        """
        Delete the object with key ``filename`` from the bucket.

        Deletion errors are logged, not raised.
        """
        try:
            self.s3_client.delete_object(
                Bucket=self.s3_bucket,
                Key=filename
            )
            logger.info(f"Deleted {filename} from S3")
        except ClientError as e:
            logger.error(f"Error deleting {filename} from S3: {e}")

    def sync(self):
        """
        Synchronize the S3 bucket with the files listed at the source URL.

        Uploads every remote file whose key is missing from the bucket, then
        deletes every bucket object that is no longer listed remotely.
        """
        # dict.fromkeys drops duplicate hrefs while keeping listing order, so
        # a file linked twice is not downloaded and uploaded twice.
        remote_files = list(dict.fromkeys(self.get_remote_file_list()))
        s3_files = self.get_s3_file_list()

        if not remote_files:
            # get_remote_file_list() returns [] both for an empty listing and
            # for a failed request. Continuing would delete every object in
            # the bucket, so treat an empty listing as "nothing to do".
            logger.warning(
                "Remote file list is empty or could not be fetched; "
                "skipping sync to avoid deleting existing S3 objects"
            )
            return

        remote_keys = set(remote_files)
        existing_keys = set(s3_files)

        # Add files missing from the bucket. Keys already present are skipped
        # before downloading so their content is not fetched just to be
        # thrown away.
        for filename in remote_files:
            if filename in existing_keys:
                continue
            content = self.download_file(filename)
            # None signals a failed download; b'' is a valid (empty) file.
            if content is None:
                continue
            self.upload_to_s3(filename, content)

        # Delete files no longer in source
        for filename in s3_files:
            if filename not in remote_keys:
                self.delete_from_s3(filename)

def lambda_handler(event, context):
    """
    Lambda function handler: run one synchronization pass.

    Reads the bucket name and source URL from the ``S3_BUCKET`` and
    ``SOURCE_URL`` environment variables, builds an ``S3FileSynchronizer``
    and calls its ``sync()``.

    Args:
        event: Invocation event; not used.
        context: Invocation context; not used.

    Returns:
        A dict with ``statusCode`` 200 and a JSON-encoded success message, or
        ``statusCode`` 500 and a JSON-encoded error message if constructing
        the synchronizer or running ``sync()`` raises.

    Raises:
        KeyError: If ``S3_BUCKET`` or ``SOURCE_URL`` is not set.
    """
    # Read outside the try block (as before), so a missing variable surfaces
    # as a KeyError rather than being folded into a 500 response.
    s3_bucket = os.environ['S3_BUCKET']
    source_url = os.environ['SOURCE_URL']

    try:
        synchronizer = S3FileSynchronizer(s3_bucket, source_url)
        synchronizer.sync()
    except Exception as e:
        # logger.exception logs at ERROR level like logger.error did, but
        # also records the traceback, which is otherwise lost at this
        # catch-all boundary.
        logger.exception(f"Synchronization failed: {e}")
        return {
            'statusCode': 500,
            'body': json.dumps(f'Error: {str(e)}')
        }

    return {
        'statusCode': 200,
        'body': json.dumps('Synchronization completed successfully')
    }