In [None]:
!pip install pyarrow

In [159]:
import boto3
import pandas as pd
import pyarrow as pa
from s3fs import S3FileSystem
import pyarrow.parquet as pq
import os
import time
from datetime import datetime as dt

In [5]:
client_id = 'BCMA'
received_date = '2019-01-01'

# declaring variables
client_id = client_id.lower()
received_date = 'RD-'+pd.to_datetime(received_date).strftime("%Y-%m-%d")
database = client_id+'_'+received_date
s3_path = os.path.join('s3://prospect-raw-files/', client_id, received_date)

# Read and Write Functions from CSV to Parquet    

In [181]:
def read_objects():
    """
    
    """
    all_objects = s3.list_objects(Bucket='prospect-raw-files')
    
    target_files = lambda key: '.csv' in key and received_date in key
    target_keys = list(filter(target_files, [cont.get('Key') for cont in all_objects['Contents']]))
    
    obj_dict = {'Body': [], 'Key': [], 'DelKey': []}
    for key in target_keys:
        obj = s3.get_object(Bucket='prospect-raw-files', Key=key)
        obj_dict['Body'].append(obj['Body'])
        obj_dict['Key'].append(os.path.dirname(key))
        obj_dict['DelKey'].append(key)
    return obj_dict

def write_parquet(obj_dict, delete_source=False):
    """
    
    """
    objs_keys = list(zip(target_obj.get('Body'), target_obj.get('Key'), target_obj.get('DelKey')))
    for obj, key, delkey in objs_keys:
        print(obj)
        df = pd.read_csv(obj)
        table = pa.Table.from_pandas(df)
        print(os.path.join('s3://', key))
        pq.write_to_dataset(
            table=table, 
            root_path=os.path.join('s3://prospect-raw-files', key), 
            filesystem=S3FileSystem()
        ) 
        print(delkey)

        if delete_source:
            s3.delete_object(Bucket='prospect-raw-files', Key=delkey)

# s3: Start with File Convertion to Parquet

In [182]:
s3 = boto3.client('s3',region_name='us-east-1')

### 1. Convert all csv files to parquet files  (delete or move raw files to cold storage)

In [183]:
target_obj = read_objects()
write_parquet(target_obj, delete_source=True)

<botocore.response.StreamingBody object at 0x7f24281b1f28>
s3://bcma/RD-2019-01-01/ClientData
bcma/RD-2019-01-01/ClientData/ClaimDataExample1.csv
<botocore.response.StreamingBody object at 0x7f2428107550>
s3://bcma/RD-2019-01-01/ClientData
bcma/RD-2019-01-01/ClientData/ClaimDataExample2.csv
<botocore.response.StreamingBody object at 0x7f2428107940>
s3://bcma/RD-2019-01-01/ClientData
bcma/RD-2019-01-01/ClientData/ClaimDataExample3.csv
<botocore.response.StreamingBody object at 0x7f2428107e80>
s3://bcma/RD-2019-01-01/ControlFile
bcma/RD-2019-01-01/ControlFile/ControlDataExample.csv
<botocore.response.StreamingBody object at 0x7f2428104400>
s3://bcma/RD-2019-01-01/MembershipData
bcma/RD-2019-01-01/MembershipData/MemberDataExample.csv


# glue: Continue with Ad-Hoc Crawler

In [5]:
glue_client = boto3.client('glue', region_name='us-east-1')

### 1. Creating Ad-Hoc Database
This database is designed to catalogue raw tables related to this client. For this process, if only one table is created, then the data is ready to be transformed to parquet.

In [6]:
glue_client.create_database(
    DatabaseInput={
        'Name': database
    }
)

{'ResponseMetadata': {'RequestId': '55abe334-fcbc-11e9-98e3-ed5f070bbc84',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Fri, 01 Nov 2019 15:29:03 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': '55abe334-fcbc-11e9-98e3-ed5f070bbc84'},
  'RetryAttempts': 0}}

### 2. Creating the Ad-Hoc Crawler
This crawler is designed to go over this particular prospect dataset only. Thus, it will catalogue tables only for this dataset.

In [9]:
crawler_name = client_id.lower()+'-raw-crawler_'+received_date
glue_client.create_crawler(
    Name=crawler_name,
    Role='UnderwritingServiceRole',
    DatabaseName=database.lower(),
    Description='Automated Single Usage - Delete after use',
    Targets={
        'S3Targets': [
            {'Path': s3_path}
        ]
    }
)

{'ResponseMetadata': {'RequestId': 'd7728af6-fcbc-11e9-9632-3df273b52ac1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Fri, 01 Nov 2019 15:32:41 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'd7728af6-fcbc-11e9-9632-3df273b52ac1'},
  'RetryAttempts': 0}}

### 3. Running Crawler
Running the crawler will map the data in the selected s3 bucket into catalogue tables.

In [10]:
glue_client.start_crawler(Name=crawler_name)
time.sleep(60)
print("It should be stopping now!")

It should be stopping now!


### 4. Delete Ad-Hoc Crawler

In [11]:
glue_client.delete_crawler(Name=crawler_name)

{'ResponseMetadata': {'RequestId': 'f3effafa-fcbd-11e9-90fb-3500b9f55e40',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Fri, 01 Nov 2019 15:40:38 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'f3effafa-fcbd-11e9-90fb-3500b9f55e40'},
  'RetryAttempts': 0}}

# Continue with Athena Queries

In [3]:
athena_client = boto3.client('athena', region_name='us-east-1')

### 1. Creating claim data query

In [8]:
athena_client.create_named_query(
    Name='ClaimData',
    Database=database,
    QueryString="SELECT * FROM clientdata"
)

ClientError: An error occurred (AccessDeniedException) when calling the CreateNamedQuery operation: User: arn:aws:sts::133469299809:assumed-role/AWSGlueServiceSageMakerNotebookRole-adv_underwriting/SageMaker is not authorized to perform: athena:CreateNamedQuery on resource: arn:aws:athena:us-east-1:133469299809:workgroup/primary