# Saving the files to an S3 bucket

### Importing the packages

In [None]:
import re
import boto3
import requests
from bs4 import BeautifulSoup
import functools

#### A decorator function for adding new methods to an existing class

In [None]:
def update_class(
    main_class=None, exclude=("__module__", "__name__","__dict__","__weakref__")
):
    """Return a class decorator that grafts a class body onto an existing class.

    The decorated ("appended") class is used only as a container: each entry
    of its ``__dict__`` that is not listed in ``exclude`` is set on the target
    class, and the *target* class is returned in place of the appended one.

    Args:
        main_class: Class to extend. When ``None``, the target is looked up
            in this module's ``globals()`` under the appended class's
            ``__name__``, so the target must already be defined there.
        exclude: Attribute names that are never copied.

    Returns:
        A one-argument callable suitable for use as ``@update_class(...)``.

    Raises:
        KeyError: If ``main_class`` is ``None`` and no global with the
            appended class's name exists.
    """
    def decorates(main_class, exclude, appended_class):
        if main_class is None:
            main_class = globals()[appended_class.__name__]
        for k, v in appended_class.__dict__.items():
            if k in exclude:
                continue
            if k == "__doc__" and v is None:
                # Every class body carries an implicit ``__doc__`` (None when
                # no docstring is written). Copying that None would erase the
                # target class's own docstring, so only real docstrings win.
                continue
            setattr(main_class, k, v)
        return main_class
    return functools.partial(decorates, main_class, exclude)

### Class constructor

In [None]:
class manage_s3():
    """Holds a boto3 S3 resource together with a bucket name and a source URL."""

    def __init__(self,bucket_name,url,key=None):
        """Store the bucket name and URL and create the boto3 S3 resource.

        Args:
            bucket_name: Name of the S3 bucket this instance works with.
            url: Source URL stored on the instance as ``self.url``.
            key: Optional path to a CSV file whose last non-blank line is
                ``<access key id>,<secret access key>``. When omitted,
                ``boto3.resource('s3')`` is created without explicit
                credentials.

        Raises:
            ValueError: If ``key`` is given but the file has no non-blank
                line, or its last non-blank line does not contain exactly
                two non-empty comma-separated fields.
        """
        self.bucket_name = bucket_name
        self.url = url

        if key:
            with open(key, "r") as f:
                # Ignore blank lines so a trailing empty line in the CSV does
                # not get mistaken for the credentials row.
                rows = [line for line in f if line.strip()]
            if not rows:
                raise ValueError(f"No credentials found in {key!r}")
            # Each field keeps only its first whitespace-delimited token,
            # which strips padding and the line terminator.
            fields = [cell.split() for cell in rows[-1].split(',')]
            if len(fields) != 2 or not all(fields):
                raise ValueError(
                    f"Expected '<access key>,<secret key>' on the last line of {key!r}"
                )
            self.AK, self.SK = fields[0][0], fields[1][0]
            self.s3 = boto3.resource('s3',
                aws_access_key_id=self.AK, aws_secret_access_key=self.SK
            )
        else:
            # No key file given: create the resource without explicit keys.
            self.s3 = boto3.resource('s3')

### Get the file names

In [None]:
@update_class()
class manage_s3():
    def get_name(self, timeout=30):
        """Return the link texts found on the page at ``self.url``.

        Args:
            timeout: Seconds passed to ``requests.get`` as its ``timeout``.

        Returns:
            A list with the ``.string`` of every ``<a>`` tag that has an
            ``href`` attribute, except the first such tag.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(self.url, timeout=timeout)
        # Without this check an error page would be parsed as if it were the
        # listing, yielding a wrong (possibly empty) set of file names.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        # The empty pattern matches any href value, so this selects every
        # anchor that carries an href attribute.
        links = soup.find_all('a', href=re.compile(''))
        print("Reading file names complete.")
        # The first anchor is skipped -- presumably the parent-directory link
        # of the listing; TODO confirm against the served page.
        return [page.string for page in links[1:]]

### Read the files from S3

In [None]:
@update_class()
class manage_s3():
    def read_s3(self):
        """Read every object of the bucket into memory.

        Returns:
            A dict mapping each object key to the result of reading that
            object's ``Body``.
        """
        # create_bucket doubles as "fetch the bucket", creating it first
        # when it does not exist yet.
        target = self.s3.create_bucket(Bucket=self.bucket_name)
        contents = {
            item.key: item.get()['Body'].read()
            for item in target.objects.all()
        }
        print("Reading s3 complete.")
        return contents

### Sync the files

In [None]:
@update_class()
class manage_s3():
    def sync_files(self):
        """Mirror the files listed at ``self.url`` into the bucket under ``dataset/``.

        For every name returned by ``get_name`` the file is downloaded from
        ``self.url + name`` and stored under the key ``dataset/<name>``:
        uploaded when the key is absent, overwritten when the stored bytes
        differ, and skipped otherwise. Afterwards, objects under ``dataset/``
        whose name is no longer listed at the source are deleted. Objects
        outside the ``dataset/`` prefix are left untouched.

        Raises:
            requests.HTTPError: If a file download answers with an error
                status; nothing is uploaded for that file and the sync stops
                before the deletion phase.
        """
        prefix = 'dataset/'
        files = self.get_name()
        s3_files = self.read_s3()

        print("Uploading/Updating files to s3")

        for i, f in enumerate(files):
            # The same prefixed key is used for the existence check, the
            # content comparison and the upload, so the three always agree.
            file = f'{prefix}{f}'
            with requests.get(self.url+f, stream=True, timeout=30) as r:
                # Never store an HTTP error page as if it were the file.
                r.raise_for_status()
                body = r.content
            if file not in s3_files:
                self.s3.Object(self.bucket_name, file).put(Body=body)
                print(f"{i+1}) {file} uploaded")
            elif body != s3_files[file]:
                self.s3.Object(self.bucket_name, file).put(Body=body)
                print(f"{i+1}) {file} updated")
            else:
                print(f"{i+1}) {file} skipped")

        print("Deleting files from s3")

        # Compare like with like: S3 keys carry the prefix, source names do
        # not. Only keys under the managed prefix are candidates for removal.
        wanted = {f'{prefix}{f}' for f in files}
        del_f = [k for k in s3_files if k.startswith(prefix) and k not in wanted]
        for i, f in enumerate(del_f):
            self.s3.Object(self.bucket_name, f).delete()
            print(f"{i+1}) {f} deleted")


### Add files to any S3 bucket

In [None]:
@update_class()
class manage_s3():
    def new_s3_add_files(self, bucket_name, api, key, timeout=30):
        """Fetch ``api`` and store the response text as one object in a bucket.

        Args:
            bucket_name: Bucket to write to; ``create_bucket`` is called on
                it first so that it exists.
            api: URL to request with HTTP GET.
            key: Object key under which the response text is stored.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``.

        Raises:
            requests.HTTPError: If the API answers with an error status, so
                an error body is never written to the bucket.
        """
        response = requests.get(api, timeout=timeout)
        response.raise_for_status()
        r = response.text
        # Create the bucket if it does not exist.
        _ = self.s3.create_bucket(Bucket=bucket_name)
        self.s3.Object(bucket_name, key).put(Body=r)
        print(f"Data from given API is written to {bucket_name} bucket.")

### Execution

In [None]:
# Source listing to mirror, destination bucket, and the credentials CSV.
res_url = "https://download.bls.gov/pub/time.series/pr/"
bucket_name = "s1quest"
key = "srd22_accessKeys.csv"

# Build the manager and run one synchronisation pass.
s = manage_s3(bucket_name=bucket_name, url=res_url, key=key)
s.sync_files()

In [None]:
# Second bucket, the API to fetch, and the object key to store it under.
api = "https://datausa.io/api/data?drilldowns=Nation&measures=Population"
new_bucket = "s2quest"
file_key = "data.json"

# Write the API response into the second bucket.
s.new_s3_add_files(bucket_name=new_bucket, api=api, key=file_key)