## Interacting with Common AWS services using Python 3 ##

**This notebook will capture how to :**
1. Connect to S3 and create buckets.
2. List the buckets in S3.
3. Connect to existing buckets in S3 and read the files in them.
4. Download files from S3 onto a local computer.
5. Copy files from one bucket to another.
6. Delete S3 buckets.

In [2]:
%%writefile s3_xlsx_csv.py
import boto3
import pandas as pd
from io import StringIO
import csv

Writing s3_xlsx_csv.py


### Connecting to s3 ###

In [2]:
%%writefile -a s3_xlsx_csv.py
# Create a boto3 resource object for the S3 service.
s3_resource = boto3.resource('s3')

In [3]:
# Walk every bucket returned by the S3 resource and print its name.
for s3_bucket in s3_resource.buckets.all():
    print(s3_bucket.name)

aws-emr-resources-910991713532-us-west-1
aws-logs-910991713532-us-west-1
dataeng-capstone-1
faraz-bucket-a-20200712
faraz-test-bucket-20200712
fk-new-bucket-20200711
sparkify-fk
sparkify-fk3
sparkify-fk4


In [4]:
##%%writefile -a s3_xlsx_csv.py

#%%time
#df = pd.read_csv('s3://dataeng-capstone-1/h1b_disclosure_data_2017_2018.dat',sep="|")
from io import BytesIO  # cell-local: the notebook's import cell only brings in StringIO

# Download the all_data_M_2017.xlsx object from S3.
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='dataeng-capstone-1', Key='all_data_M_2017.xlsx')
# read_excel expects a path or a file-like object; handing it raw bytes is
# deprecated in recent pandas releases, so wrap the payload in BytesIO.
df = pd.read_excel(BytesIO(obj['Body'].read()))
#df.head()

CPU times: user 1min 46s, sys: 1.37 s, total: 1min 47s
Wall time: 1min 59s


Unnamed: 0,area,area_title,area_type,naics,naics_title,i_group,own_code,occ_code,occ_title,o_group,...,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,annual,hourly
0,99,U.S.,1,0,Cross-industry,cross-industry,1235,00-0000,All Occupations,total,...,18.12,29.38,46.23,19970,24770,37690,61110,96150,,
1,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,major,...,49.32,71.83,#,48220,69880,102590,149410,#,,
2,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1000,Top Executives,minor,...,49.58,78.72,#,43140,66030,103120,163740,#,,
3,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1010,Chief Executives,broad,...,88.11,#,#,68110,113470,183270,#,#,,
4,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,detailed,...,88.11,#,#,68110,113470,183270,#,#,,


In [5]:
# Raise pandas' display limits (rows, columns, total width) for notebook output.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [6]:
# Prepend an FY_YEAR column holding 2017 on every row. DataFrame.insert raises
# ValueError when the column already exists, so guard the call to keep this
# cell safe to re-run.
if 'FY_YEAR' not in df.columns:
    df.insert(0, 'FY_YEAR', 2017)

In [7]:
# Preview the first rows of the frame (head() defaults to five rows).
df.head()

Unnamed: 0,FY_YEAR,area,area_title,area_type,naics,naics_title,i_group,own_code,occ_code,occ_title,o_group,tot_emp,emp_prse,jobs_1000,loc_quotient,pct_total,h_mean,a_mean,mean_prse,h_pct10,h_pct25,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,annual,hourly
0,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,00-0000,All Occupations,total,142549250,0.1,,,,24.34,50620,0.1,9.6,11.91,18.12,29.38,46.23,19970,24770,37690,61110,96150,,
1,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,major,7280330,0.2,,,,57.65,119910,0.1,23.19,33.6,49.32,71.83,#,48220,69880,102590,149410,#,,
2,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1000,Top Executives,minor,2473740,0.3,,,,61.55,128020,0.2,20.74,31.74,49.58,78.72,#,43140,66030,103120,163740,#,,
3,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1010,Chief Executives,broad,210160,0.7,,,,94.25,196050,0.4,32.74,54.55,88.11,#,#,68110,113470,183270,#,#,,
4,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,detailed,210160,0.7,,,,94.25,196050,0.4,32.74,54.55,88.11,#,#,68110,113470,183270,#,#,,


In [8]:
%%time
# Serialise the frame as pipe-delimited text in memory, without the index and
# with every non-numeric field quoted.
csv_buffer = StringIO()
df.to_csv(csv_buffer, sep="|", index=False, quoting=csv.QUOTE_NONNUMERIC)

# Upload the text under the clean/ prefix; the put() response is the value the
# cell displays.
clean_object = s3_resource.Object('dataeng-capstone-1', 'clean/all_data_M_2017.dat')
clean_object.put(Body=csv_buffer.getvalue())

CPU times: user 4.87 s, sys: 465 ms, total: 5.34 s
Wall time: 22.1 s


{'ResponseMetadata': {'RequestId': '750C33D060308C8B',
  'HostId': 'mEnG+y2IVSQRYc7QR48JoV1TEwoMR8fzg+hfANA5780KF29yR5VGDnCQX3rF/mRMPaFlW8etpqw=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'mEnG+y2IVSQRYc7QR48JoV1TEwoMR8fzg+hfANA5780KF29yR5VGDnCQX3rF/mRMPaFlW8etpqw=',
   'x-amz-request-id': '750C33D060308C8B',
   'date': 'Wed, 14 Oct 2020 05:25:56 GMT',
   'etag': '"0baa85ba1f375ea4aa141f4398c21eba"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 1},
 'ETag': '"0baa85ba1f375ea4aa141f4398c21eba"'}

In [6]:
%%time
#df = pd.read_csv('s3://dataeng-capstone-1/h1b_disclosure_data_2017_2018.dat',sep="|")
# Create a low-level S3 client and fetch the test/all_data_M_2018.dat object.
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='dataeng-capstone-1', Key='test/all_data_M_2018.dat')
# Parse the pipe-delimited text straight from the response body.
# NOTE(review): this rebinds `df`, replacing whatever frame earlier cells loaded.
df = pd.read_csv(obj['Body'],sep="|")
df.head()



CPU times: user 2.23 s, sys: 401 ms, total: 2.63 s
Wall time: 1min 22s


Unnamed: 0,FY_YEAR,area,area_title,area_type,naics,naics_title,i_group,own_code,occ_code,occ_title,...,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,annual,hourly
0,2018,99,U.S.,1,0,Cross-industry,cross-industry,1235,00-0000,All Occupations,...,18.58,30.06,47.31,20690,25740,38640,62510,98410,,
1,2018,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,...,50.11,72.93,#,49260,70880,104240,151700,#,,
2,2018,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1000,Top Executives,...,49.73,78.81,#,43400,66000,103450,163930,#,,
3,2018,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1010,Chief Executives,...,91.15,#,#,68360,115960,189600,#,#,,
4,2018,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,...,91.15,#,#,68360,115960,189600,#,#,,


In [12]:
# Show up to the first 20 rows of the current frame.
df.head(20)

Unnamed: 0,FY_YEAR,area,area_title,area_type,naics,naics_title,i_group,own_code,occ_code,occ_title,o_group,tot_emp,emp_prse,jobs_1000,loc_quotient,pct_total,h_mean,a_mean,mean_prse,h_pct10,h_pct25,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,annual,hourly
0,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,00-0000,All Occupations,total,142549250,0.1,,,,24.34,50620,0.1,9.6,11.91,18.12,29.38,46.23,19970,24770,37690,61110,96150,,
1,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,major,7280330,0.2,,,,57.65,119910,0.1,23.19,33.6,49.32,71.83,#,48220,69880,102590,149410,#,,
2,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1000,Top Executives,minor,2473740,0.3,,,,61.55,128020,0.2,20.74,31.74,49.58,78.72,#,43140,66030,103120,163740,#,,
3,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1010,Chief Executives,broad,210160,0.7,,,,94.25,196050,0.4,32.74,54.55,88.11,#,#,68110,113470,183270,#,#,,
4,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,detailed,210160,0.7,,,,94.25,196050,0.4,32.74,54.55,88.11,#,#,68110,113470,183270,#,#,,
5,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1020,General and Operations Managers,broad,2212200,0.3,,,,59.35,123460,0.2,21.4,31.53,48.27,75.28,#,44510,65590,100410,156580,#,,
6,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1021,General and Operations Managers,detailed,2212200,0.3,,,,59.35,123460,0.2,21.4,31.53,48.27,75.28,#,44510,65590,100410,156580,#,,
7,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1030,Legislators,broad,51380,1.3,,,,*,46350,1.0,*,*,*,*,*,17480,18860,25630,70130,97510,1.0,
8,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-1031,Legislators,detailed,51380,1.3,,,,*,46350,1.0,*,*,*,*,*,17480,18860,25630,70130,97510,1.0,
9,2017,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-2000,"Advertising, Marketing, Promotions, Public Rel...",minor,685780,0.5,,,,66.66,138650,0.3,29.1,41.13,59.19,82.51,#,60530,85560,123100,171630,#,,


In [8]:
%%writefile s3_xlsx_to_csv.py
import boto3
import pandas as pd
from io import StringIO
import csv
import time

def _read_excel_from_s3(s3_client, bucket, key):
    """Download one Excel object from S3 and parse it into a DataFrame."""
    from io import BytesIO  # local import: the file header only imports StringIO

    response = s3_client.get_object(Bucket=bucket, Key=key)
    # read_excel wants a path or file-like object; raw bytes input is
    # deprecated in recent pandas releases, so wrap the payload in BytesIO.
    return pd.read_excel(BytesIO(response['Body'].read()))


def _put_pipe_delimited(s3_resource, bucket, key, df, **to_csv_kwargs):
    """Serialise df as pipe-delimited text (no index) and upload it to S3."""
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, sep="|", index=False, **to_csv_kwargs)
    # put() documents Body as bytes or a file-like object, so encode explicitly.
    s3_resource.Object(bucket, key).put(Body=csv_buffer.getvalue().encode('utf-8'))


def xlsx_to_csv(years, bucket='dataeng-capstone-1'):
    """Convert xlsx files in an S3 bucket to pipe-delimited .dat files.

    For every year, ``all_data_M_<year>.xlsx`` is downloaded, an ``FY_YEAR``
    column holding the year is inserted as the first column, and the result is
    uploaded to ``clean/all_data_M_<year>.dat`` with non-numeric fields quoted.
    ``2017_NAICS_Descriptions.xlsx`` is then reduced to its ``Code`` and
    ``Title`` columns and uploaded to ``clean/all_naics_codes.dat``. Finally the
    elapsed time and the keys found under ``clean/all`` are printed.

    Args:
        years: Iterable of years (strings or ints). A single year may also be
            passed on its own.
        bucket: Name of the S3 bucket that holds the source workbooks and
            receives the converted files.

    Returns:
        None. The function works through S3 uploads and printed progress.
    """
    # A lone '2017' would otherwise be iterated character by character, and a
    # lone int would not be iterable at all.
    if isinstance(years, (str, int)):
        years = [years]

    start_time = time.time()

    s3_resource = boto3.resource('s3')
    s3 = boto3.client('s3')

    for year in years:
        file = 'all_data_M_{}'.format(year)
        df = _read_excel_from_s3(s3, bucket, '{}.xlsx'.format(file))
        df.insert(0, 'FY_YEAR', int(year))
        _put_pipe_delimited(
            s3_resource,
            bucket,
            'clean/{}.dat'.format(file),
            df,
            quoting=csv.QUOTE_NONNUMERIC,
        )
        print('converted ',file,' from .xlsx to .dat')

    # The NAICS lookup keeps only its two columns and uses default quoting.
    file = '2017_NAICS_Descriptions'
    df = _read_excel_from_s3(s3, bucket, '{}.xlsx'.format(file))
    df = df[['Code','Title']]
    _put_pipe_delimited(s3_resource, bucket, 'clean/all_naics_codes.dat', df)
    print('converted ',file,' from .xlsx to .dat')

    runtime = time.time() - start_time

    print('\n')
    print('runtime: ',runtime)
    print('\n')
    dataend_bucket = s3_resource.Bucket(bucket)

    print('List files in clean bucket: ')
    for objct in dataend_bucket.objects.filter(Delimiter='/',Prefix='clean/all'):
        print(objct.key)

    print('\n')

def main():
    """Script entry point: run the conversion for the 2017 and 2018 files."""
    xlsx_to_csv(['2017', '2018'])


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


Writing s3_xlsx_to_csv.py


In [None]:
years = ['2017','2018']