# Download Low Rate Data Logger Files From AWS S3 - All Data for a Date Range

Download files from a logger for a specific date range, and concatenate to a single file.

- For a specific logger:
    - Construct a list of filenames
    - Download each day file
    - Concatenate all of these
    - Remove the header for each day
- The output file for each logger is named 'loggername_Table1.csv'

**This version generates the exact filenames to download, so it needs a date range.**

In [1]:
import boto3
import os
import glob
import shutil
import pandas as pd
from datetime import datetime

In [2]:
def remove_duplines(tmpfile, completefile):
    """Copy tmpfile to completefile, keeping only the first copy of each line.

    This is used to drop the header lines that are repeated once per day
    file after concatenation. Note that ANY line that is identical to an
    earlier line is dropped, not just headers.

    Parameters:
        tmpfile: path of the text file to read.
        completefile: path of the text file to write (overwritten if it exists).
    """
    lines_seen = set() # holds lines already seen; a set keeps the lookup fast
    # 'with' closes both files even if reading or writing fails part way
    with open(tmpfile, 'r') as infile, open(completefile, 'w') as outfile:
        for line in infile:
            if line not in lines_seen: # not a duplicate
                outfile.write(line)
                lines_seen.add(line)

In [None]:
#authentication for S3: assume a role through STS using an MFA code,
#then build the S3 resource from the temporary credentials
sts_client = boto3.client('sts')

#the one-time MFA code is typed in when the cell runs
mfa_TOTP = input("Enter the MFA code: ")

#arguments for assume_role: role ARN, a session name, the MFA device
#serial number and the code just entered
#(the "XXXXX" values are presumably redacted placeholders - fill in before use)
role_request = {
    'RoleArn': "XXXXX",
    'RoleSessionName': "DataLoggerRole",
    'SerialNumber': "XXXXX",
    'TokenCode': mfa_TOTP,
}
assumed_role_object = sts_client.assume_role(**role_request)

#the response carries the temporary credentials used for later API calls
credentials = assumed_role_object['Credentials']

s3 = boto3.resource(
    's3',
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken'],
)

In [None]:
#S3 bucket name (the day files are downloaded from this bucket)
bucket = 'dev-data-logger-lake.geonet.org.nz'

#folder for downloaded daily CSV files
#NOTE: this folder is created and then deleted again for every logger,
#so it must not be used to keep anything else
dltmp = '/home/sherburn/GeoNet/datalogger/auto_download/tmp'
#top folder to save final CSV files
#each logger's file is written to <dlsav>/<logger>/<logger>_Table1.csv
dlsav = '/home/sherburn/GeoNet/datalogger/auto_download'

#temporary file, concatenated but with daily headers
#it is deleted again once the final file for a logger has been written
tmpfile = os.path.join(dlsav, 'tmpfile.csv')

#list of loggers to download data from
#each name is used to build both the S3 file names and the output path
loggers = ['infernocratertest', 'lowertemaari', 'foxglacierlandslide']

In [None]:
#date range for data
#first and last day to download, written as YYYYMMDD
#both values are handed to pd.date_range as the start and end of the range
date1 = '20190701'
date2 = '20190801'

In [None]:
#build the daily range, then format every day as a 'YYYY/MM/DD' string
dr = pd.date_range(start=date1, end=date2, freq='D')
dates = dr.map(lambda day: day.strftime('%Y/%m/%d'))

In [None]:
#do the work
#for every logger: download one file per day, join the day files in date
#order, strip the repeated header lines and save <dlsav>/<logger>/<logger>_Table1.csv

#loop for each logger
for logger in loggers:
    print ('downloading from S3:', logger)

    os.makedirs(dltmp, exist_ok=True) #make tmp directory for downloaded files
    #local paths of the day files actually fetched; dates are processed in
    #order, so this list is already in time order
    downloaded = []
    try:
        #loop for each date
        for date in dates:
            #dates are 'YYYY/MM/DD' strings; file names carry the same date without slashes
            #(a separate name is used so the date2 setting is not overwritten)
            datestamp = date.replace('/', '')
            savefile = 'logger-'+logger+'_Table1'+'_'+datestamp+'.csv'
            s3file = date+'/'+savefile
            localfile = os.path.join(dltmp, savefile)
            try:
                s3.Bucket(bucket).download_file(s3file, localfile)
            except Exception as err:
                #best effort: report the failed day and carry on with the next one
                #(Exception, not a bare except, so Ctrl-C still stops the run)
                print ('fail to download '+s3file, err)
            else:
                downloaded.append(localfile)

        #nothing fetched: do not overwrite an existing result with an empty file
        if not downloaded:
            print ('no files downloaded for '+logger+', nothing written')
            continue

        #concat all files for the logger
        #only files fetched in this pass are used, so leftovers from an
        #earlier, interrupted run cannot end up in the output
        with open(tmpfile, 'w') as outfile:
            for dayfile in downloaded:
                with open(dayfile, 'r') as readfile:
                    shutil.copyfileobj(readfile, outfile)

        #remove unwanted header lines from temporary file
        outdir = os.path.join(dlsav, logger)
        os.makedirs(outdir, exist_ok=True) #the logger folder may not exist yet
        completefile = os.path.join(outdir, logger+'_Table1.csv')
        remove_duplines(tmpfile, completefile)
    finally:
        #always clean up, also when something above failed
        shutil.rmtree(dltmp, ignore_errors=True) #remove tmp directory for downloaded files
        #remove temporary file
        if os.path.exists(tmpfile):
            os.remove(tmpfile)