### Imports 

In [None]:
import boto3
from datetime import datetime as dt
import os
import pandas as pd
import time

# Set ClientId and Received Date

In [None]:
# Run parameters: which client the files belong to and the day they arrived.
client_id = 'BCMA'
received_date = '2019-01-01'

# Everything downstream (S3 prefix, crawler name, database name) uses the
# lower-cased client id.
client_id = client_id.lower()
# Normalise whatever date format was typed above to 'RD-YYYY-MM-DD'.
received_date = 'RD-'+pd.to_datetime(received_date).strftime("%Y-%m-%d")
# Glue folds database names to lowercase when it stores them, and the crawler
# cell already passes database.lower(). Lower-casing once here keeps
# create_database, the crawler and the catalog reads on the same name.
database = (client_id+'_'+received_date).lower()

# Starting with S3

In [None]:
# One boto3 session pinned to us-east-1; the S3 resource is created from it and
# the low-level client is taken from that resource.
session = boto3.Session(region_name="us-east-1")
s3_resource = session.resource(service_name="s3")
s3_client = s3_resource.meta.client

###  1. Creating Raw Folder Structure

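This folder structure is `<client_id>/<received_date>/<folder>/` inside the `prospect-raw-files` bucket, with one folder per entry of the folder list.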

In [None]:
def create_s3_folder_structure(received_date, folder_list):
    """Create one "folder" marker object per entry of ``folder_list``.

    For every folder name an object with no body is put into the
    ``prospect-raw-files`` bucket under the key
    ``<client_id>/<received_date>/<folder>/``.

    Uses the notebook-level ``client_id`` and ``s3_client``.

    Args:
        received_date: Prefix segment placed after the client id
            (the notebook passes the ``RD-YYYY-MM-DD`` string).
        folder_list: Iterable of folder names to create; an empty iterable
            results in no S3 calls.
    """
    for folder in folder_list:
        # S3 keys always use '/' as the delimiter. os.path.join would insert
        # the local OS separator (a backslash on Windows), so spell it out.
        folder_key = f"{client_id}/{received_date}/{folder}/"
        s3_client.put_object(Bucket='prospect-raw-files', Key=folder_key)

In [None]:
# Raw sub-folders created under <client_id>/<received_date>/.
# NOTE(review): 'ClaimtData' looks like a misspelling of 'ClaimData'. It is left
# unchanged here because the folder name is part of the S3 layout - confirm.
folders = ['ClaimtData', 'MembershipData', 'ControlData']
# Root of this delivery, later handed to the Glue crawler as its S3 target.
# Built with explicit '/' because os.path.join uses the local OS separator,
# which is not valid for an s3:// URI on Windows.
s3_path = f's3://prospect-raw-files/{client_id}/{received_date}'
create_s3_folder_structure(received_date, folders)

# UPLOAD THE FILES TO S3

---
# Continue with Glue Crawler

In [None]:
# Low-level Glue client in us-east-1, used by the database and crawler cells.
glue_client = boto3.client(service_name="glue", region_name="us-east-1")

### 1. Creating Database

In [None]:
# Register a Glue database named after this client/delivery; only the name is
# supplied.
glue_client.create_database(DatabaseInput=dict(Name=database))

### 2. Creating Crawler

In [None]:
# Crawler name: '<client>-raw-crawler_<received_date>'.
crawler_name = f"{client_id.lower()}-raw-crawler_{received_date}"
# The crawler scans this delivery's S3 prefix only.
crawler_targets = {'S3Targets': [{'Path': s3_path}]}
glue_client.create_crawler(
    Name=crawler_name,
    Role='UnderwritingServiceRole',
    DatabaseName=database.lower(),
    Description='Automated Single Usage - Delete after use',
    Targets=crawler_targets,
)

### 3. Running Crawler

In [None]:
# Kick off the crawl, then wait until Glue reports the crawler as READY again.
# A fixed sleep is not enough: a crawl can take longer than a minute, and the
# crawler cannot be deleted while it is still running.
glue_client.start_crawler(Name=crawler_name)

crawler_poll_seconds = 15
crawler_timeout_seconds = 60 * 60
crawler_waited_seconds = 0
while True:
    # Sleep before the first check so the state has time to leave READY.
    time.sleep(crawler_poll_seconds)
    crawler_waited_seconds += crawler_poll_seconds
    crawler_info = glue_client.get_crawler(Name=crawler_name)['Crawler']
    if crawler_info['State'] == 'READY':
        break
    if crawler_waited_seconds >= crawler_timeout_seconds:
        raise TimeoutError(
            f"Crawler {crawler_name} still {crawler_info['State']} "
            f"after {crawler_waited_seconds} seconds"
        )

# LastCrawl carries the outcome of the run that just finished.
last_crawl_status = crawler_info.get('LastCrawl', {}).get('Status', 'UNKNOWN')
print(f"Crawler {crawler_name} has stopped (last crawl status: {last_crawl_status})")

### 4. Delete Crawler

In [None]:
# Remove the single-use crawler created above.
# NOTE(review): Glue refuses to delete a crawler that is still running - make
# sure the crawl has finished before executing this cell.
glue_client.delete_crawler(Name=crawler_name)

---
# Continue with Glue Jobs in Spark Clusters

In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Reuse the running SparkContext if there is one, otherwise start it, and wrap
# it in a GlueContext for catalog access.
spark_context = SparkContext.getOrCreate()
glueContext = GlueContext(spark_context)

### 1. Getting Catalogue Content

In [None]:
# Load the three catalog tables of this delivery as Glue dynamic frames.
# NOTE(review): the table names below ('clientdata', 'controlfile') differ from
# the raw folder names created earlier in this notebook ('ClaimtData',
# 'ControlData') - confirm they match what the crawler actually registered.
claim_data = glueContext.create_dynamic_frame.from_catalog(
    database=database,
    table_name='clientdata',
)
member_data = glueContext.create_dynamic_frame.from_catalog(
    database=database,
    table_name='membershipdata',
)
control_data = glueContext.create_dynamic_frame.from_catalog(
    database=database,
    table_name='controlfile',
)

### 2. Exploring Schema

In [None]:
# Print the schema of the claim dynamic frame loaded from the catalog.
claim_data.printSchema()

In [None]:
# Print the schema of the membership dynamic frame loaded from the catalog.
member_data.printSchema()

In [None]:
# Print the schema of the control dynamic frame loaded from the catalog.
control_data.printSchema()

### Row Count

In [None]:
# Count claims first, then members, and print one thousands-separated figure
# per line.
claim_rows = claim_data.count()
member_rows = member_data.count()
row_count_lines = [
    f'Claim Rows: {claim_rows:2,.0f}',
    f'Member Rows: {member_rows:2,.0f}',
]
print('\n'.join(row_count_lines))

In [None]:
# Convert the control dynamic frame to a Spark DataFrame and display its rows.
control_df = control_data.toDF()
control_df.show()

## Membership

### Schema

### Row Count

## Control

### Schema

### Row Count