In [1]:
# Stretch the notebook container to the full browser width.
# `IPython.core.display` is a deprecated import path for these names;
# `IPython.display` is the public module that exports them.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Data Ingestion From External AWS Sources using Python

## Setup

You need the [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html#installation) library to interact with AWS using Python.

Install boto3 using the following command :

In [None]:
# pip install boto3

You also need to have your IAM credentials set up using the **aws configure** command.

In order to interact with an AWS service, you need to create a client/resource object for that service. (Services are S3, Kinesis, EMR, etc. - the services listed in the AWS console.)

[List of services](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/index.html) can be found here in boto3 documentation.

In [None]:
import boto3
# Template: create either a client or a resource for the service you need.
# client = boto3.client("<service name here>")
# resource = boto3.resource("<service name here>")

In [None]:
# use pprint to pretty-print JSON/dictionary objects in Python
from pprint import pprint
import boto3
import pandas as pd
import random

**All responses from clients are low-level responses: either a Python dict (JSON-like) or None.**

# S3

All methods for S3 buckets are listed [here](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html)

* List buckets for an account
* List objects in a bucket
* Download a file from Bucket
* Upload file to bucket

### Create Client

In [None]:
# Client used by every S3 cell below.
s3_client = boto3.client("s3")

### List all s3 buckets

In [None]:
# List every bucket owned by the configured account and pretty-print the raw response.
# In the response, the "Name" key of each bucket entry is the bucket name.
result = s3_client.list_buckets()
pprint(result)

### List all objects in s3 bucket

In [None]:
# List the objects stored in one bucket and pretty-print the raw response.
# `list_objects_v2` is the current revision of the listing API; `list_objects`
# is only kept for backward compatibility.
# In the response, the "Key" field of each entry is the object (file) name.
# A single call returns at most 1000 objects - use a paginator for larger buckets.
result = s3_client.list_objects_v2(Bucket = "datasparksahil")
pprint(result)

### Download file from s3

In [None]:
# Download one object from the bucket to the local disk.
# Key      = name of the object inside the bucket
# Filename = local path the object is written to
# A path relative to the notebook is used (like the upload cell) so the cell
# does not depend on one particular user's home directory.
response = s3_client.download_file(Bucket='datasparksahil', Key='pipeline_demo.py', Filename="./second_script.py")
# download_file has no return payload, so this prints None on success.
pprint(response)

### Upload object to s3

In [None]:
# Upload a local file into the bucket.
# Filename = path of the local file to send
# Key      = name the object gets inside the bucket
response = s3_client.upload_file(
    Bucket="datasparksahil",
    Key="mydog.jpg",
    Filename="./mydog.jpg",
)
pprint(response)

# Dynamo DB

The code is referenced from [here](https://sysadmins.co.za/interfacing-amazon-dynamodb-with-python-using-boto3/)

The complete list of available methods can be found [here](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb.html)

### Create client/resource

In [None]:
# Client used by the table-level DynamoDB cells below.
dynamodb_client = boto3.client(service_name="dynamodb")
# Resource variant, if you prefer it:
# dynamodb_resource = boto3.resource("dynamodb")

### List all tables

In [None]:
# Show the DynamoDB tables visible to the configured account; the raw
# response dict is displayed as the cell output.
dynamodb_client.list_tables()

### Create table 

In [None]:
# Table definition: a single numeric partition key and fixed provisioned throughput.
params = {
    'TableName' : "taxi_fleet_dynamodb",
    'KeySchema': [
        { 'AttributeName': "car_id", 'KeyType': "HASH"},    # Partition key
    ],
    'AttributeDefinitions': [
        { 'AttributeName': "car_id", 'AttributeType': "N" }, # column type for partition key - N means number
    ],
    'ProvisionedThroughput': {
        'ReadCapacityUnits': 5,
        'WriteCapacityUnits': 5
    }
}
response = dynamodb_client.create_table(**params)
pprint(response)

# create_table returns while the table is still being provisioned. Block until
# it exists so the put_item cells below also work under "Run All".
dynamodb_client.get_waiter("table_exists").wait(TableName=params['TableName'])

### Describe Table

It takes a couple of minutes to provision a table after you have created it. You can see the status with the **describe_table()** command

In [None]:
# Fetch the table description (which includes its current status) and pretty-print it.
response = dynamodb_client.describe_table(TableName='taxi_fleet_dynamodb')
pprint(response)

### Put Item : Using Client (Longer Method)

In [None]:
# Insert one row through the low-level client. This is the verbose form:
# every attribute is wrapped in a type descriptor ("N" for the numeric
# attributes, "S" for the string one) and every value is passed as a string.
client_item = {
    'car_id':       {"N": str(random.randint(1,5))},
    'pick_up_time': {"S": str(pd.to_datetime('now'))},
    'fuel_level':   {"N": str(random.randint(1,100))},
    'car_health':   {"N": str(random.randint(0,100))},
    'warning':      {"N": str(random.randint(0,10))},
}
response = dynamodb_client.put_item(TableName="taxi_fleet_dynamodb", Item=client_item)
pprint(response)

### Put Item : Using Resource (Easier Method)

In [None]:
# Insert one row through the Table resource, which takes plain Python values.
# The item is identified by its primary key (plus the sort key, if one is set);
# if an item with that key already exists, its values are updated.

# handle on the table
dynamo_db_table = boto3.resource('dynamodb').Table("taxi_fleet_dynamodb")

# row to write
resource_item = {
    'car_id': random.randint(1,5),
    'pick_up_time': str(pd.to_datetime('now')),
    'fuel_level': random.randint(1,100),
    'car_health': random.randint(0,100),
    'warning': random.randint(0,10),
}
response = dynamo_db_table.put_item(Item=resource_item)
pprint(response)

### Put Item - Multiple in a loop

In [None]:
# Write 20 random rows one at a time. The table is provisioned with 5 write
# capacity units, so the loop pauses 0.3 s after every write to stay below
# 5 items per second.
import time

# handle on the table
dynamo_db_table = boto3.resource('dynamodb').Table("taxi_fleet_dynamodb")

for item_number in range(20):
    loop_item = {
        'car_id': random.randint(1,5),
        'pick_up_time': str(pd.to_datetime('now')),
        'fuel_level': random.randint(1,100),
        'car_health': random.randint(0,100),
        'warning': random.randint(0,10),
    }
    response = dynamo_db_table.put_item(Item=loop_item)
    time.sleep(0.3)

### BatchWrite - TODO

### Delete item : Provide both Primary Key and Sort Key(if sort key is set)

In [None]:
# Remove the item whose partition key car_id equals 1.
# This table has no sort key, so the partition key alone identifies the item.
dynamo_db_table.delete_item(Key={'car_id': 1})

### Query Table : Based on Primary Key and Sort Key

A query operation searches only primary key attribute values and supports a subset of comparison operators on key attribute values to refine the search process.

In [None]:
from boto3.dynamodb.conditions import Key, Attr

# Key condition: only items whose partition key car_id equals 4.
car_id_condition = Key('car_id').eq(4)
response = dynamo_db_table.query(KeyConditionExpression=car_id_condition)

pprint(response)

### Scan Based on Non-Key Items

**AVOID USING SCANS** 

A scan operation scans the entire table. You can specify filters to apply to the results to refine the values returned to you, after the complete scan.

In [None]:
# Filter on a non-key attribute: keep only items whose car_health equals 61.
# `Attr` comes from the boto3.dynamodb.conditions import in the query cell.
car_health_filter = Attr('car_health').eq(61)
response = dynamo_db_table.scan(FilterExpression=car_health_filter)
pprint(response)

### Difference between Scan and Query

[what-is-the-difference-between-scan-and-query-in-dynamodb-when-use-scan-query](https://stackoverflow.com/questions/43452219/what-is-the-difference-between-scan-and-query-in-dynamodb-when-use-scan-query)

### Deleting Table

In [None]:
# Delete the table created earlier in this notebook. The name must match the
# one passed to create_table ("taxi_fleet_dynamodb"); asking DynamoDB to delete
# a table that does not exist fails.
response = dynamodb_client.delete_table(TableName="taxi_fleet_dynamodb")
pprint(response)

# Postgres RDS

Boto3 is used only for creating/configuring/destroying the database instances for RDS.
For all other purposes, we use the other libraries mentioned below.

Option 1 : To connect with any AWS RDS, you need to install **SQLAlchemy**

Option 2 : Use only **psycopg2** to connect to postgres only

I prefer option 1 (SQLAlchemy), since the same method can be used to connect to all of the following databases;
otherwise you would have to learn a separate library for each database.
* PostgreSQL
* MySQL
* SQLite
* Oracle
* Microsoft SQL Server
* Firebird
* Sybase


In [None]:
pip install sqlalchemy

### Create Client/Resource

In [None]:
# Client for the RDS service (used by the create-database cell below).
rds_client = boto3.client(service_name="rds")

### Create database (TODO)

In [None]:
# rds_client.create_db_instance(DBInstanceIdentifier="demo_postgres_db", # unique name for ur AWS account and region
#                                AllocatedStorage=20,                    # storage capacity 20GB
#                                DBName='taxi_fleet_db',                 # name of database to create after instance has been initialized, if none then 'postgres' will be DBNAME
#                                Engine='postgres',                      # type of database
#                                MultiAZ=False,
#                                MasterUsername='sahil',
#                                MasterUserPassword='charlie_bravo',
#                                DBInstanceClass='db.t2.micro')

### Connect to database

We will use SQLAlchemy to connect to the database and execute SQL statements against it.

In [None]:
from sqlalchemy import create_engine, inspect, text

In [None]:
# Connection settings - replace the placeholders with your own values.
POSTGRES_USERNAME = "username"
POSTGRES_PASSWORD = "password"
POSTGRES_DBNAME = "postgres"
POSTGRES_HOST = "ip of machine"

# Assemble the SQLAlchemy URL: postgresql://user:password@host:port/dbname
url = 'postgresql://{user}:{password}@{host}:{port}/{dbname}'.format(
    user=POSTGRES_USERNAME,
    password=POSTGRES_PASSWORD,
    host=POSTGRES_HOST,
    port=5432,
    dbname=POSTGRES_DBNAME,
)
print(url)
engine = create_engine(url)

### List all table names

In [None]:
# `Engine.table_names()` was removed in SQLAlchemy 2.0; the inspector returns
# the same list of table names and works on 1.x and 2.x.
inspect(engine).get_table_names()

### Create table

In [None]:
# `Engine.execute()` and raw-string SQL were removed in SQLAlchemy 2.0.
# `engine.begin()` opens a transaction that commits when the block exits
# without an error, and `text()` wraps the raw SQL statement.
with engine.begin() as conn:
    result = conn.execute(text("CREATE TABLE IF NOT EXISTS films (title text, director text, year text);"))

### Insert into table

In [None]:
# `Engine.execute()` was removed in SQLAlchemy 2.0. Run both inserts in one
# transaction; `engine.begin()` commits when the block exits without an error.
with engine.begin() as conn:
    conn.execute(text("INSERT INTO films (title, director, year) VALUES ('Doctor Strange', 'Scott Derrickson', '2016');"))
    conn.execute(text("INSERT INTO films (title, director, year) VALUES ('Titanic', 'Richard Brown', '2010');"))

### Read from table

In [None]:
# `Engine.execute()` was removed in SQLAlchemy 2.0. Read through an explicit
# connection and consume the rows while that connection is still open.
with engine.connect() as conn:
    result_set = conn.execute(text("SELECT * FROM films"))
    for r in result_set:
        print(r)

### Read from table - into a pandas dataframe

In [None]:
import pandas as pd

# Run the query through the engine and load the whole result into a DataFrame.
films_query = "select * from films;"
df = pd.read_sql_query(films_query, con=engine)
df.head()

### Update values in table

In [None]:
# `Engine.execute()` was removed in SQLAlchemy 2.0; `engine.begin()` commits
# the UPDATE when the block exits without an error.
with engine.begin() as conn:
    conn.execute(text("UPDATE films SET title='Some2016Film' WHERE year='2016'"))

### Delete values from table

In [None]:
# `Engine.execute()` was removed in SQLAlchemy 2.0; `engine.begin()` commits
# the DELETE when the block exits without an error.
with engine.begin() as conn:
    conn.execute(text("DELETE FROM films WHERE year='2016'"))

## MySQL

We need the **sqlalchemy** library to interact with MySQL, plus the **pymysql** driver, because the connection URL below uses `mysql+pymysql://`.

In [None]:
# !pip install sqlalchemy

In [None]:
from sqlalchemy import create_engine

### Create connection to mysql

In [None]:
# Connection settings - fill these in before running the cell.
USERNAME = ""
PASSWORD = ""
DBNAME = ""
HOST = ""

# Assemble the SQLAlchemy URL: mysql+pymysql://user:password@host:port/dbname
url = 'mysql+pymysql://{user}:{password}@{host}:{port}/{dbname}'.format(
    user=USERNAME,
    password=PASSWORD,
    host=HOST,
    port=3306,
    dbname=DBNAME,
)
print(url)
engine = create_engine(url)

### List all tables

In [None]:
# `Engine.table_names()` was removed in SQLAlchemy 2.0; the inspector returns
# the same list of table names and works on 1.x and 2.x.
inspect(engine).get_table_names()

### Create Table

In [None]:
# `Engine.execute()` and raw-string SQL were removed in SQLAlchemy 2.0.
# `engine.begin()` opens a transaction that commits when the block exits
# without an error, and `text()` wraps the raw SQL statement.
with engine.begin() as conn:
    result = conn.execute(text("CREATE TABLE IF NOT EXISTS films (title text, director text, year text);"))
print(result)

### Insert into table

In [None]:
# `Engine.execute()` was removed in SQLAlchemy 2.0. Run both inserts in one
# transaction; `engine.begin()` commits when the block exits without an error.
with engine.begin() as conn:
    conn.execute(text("INSERT INTO films (title, director, year) VALUES ('Doctor Strange', 'Scott Derrickson', '2016');"))
    conn.execute(text("INSERT INTO films (title, director, year) VALUES ('Titanic', 'Richard Brown', '2010');"))

### Show values from table

In [None]:
# `Engine.execute()` was removed in SQLAlchemy 2.0. Read through an explicit
# connection and consume the rows while that connection is still open.
with engine.connect() as conn:
    result_set = conn.execute(text("SELECT * FROM films"))
    for r in result_set:
        print(r)

### Read from table - into a pandas dataframe

In [None]:
import pandas as pd

# Run the query through the engine and load the whole result into a DataFrame.
films_query = "select * from films;"
df = pd.read_sql_query(films_query, con=engine)
df.head()

### Update values in table

In [None]:
# `Engine.execute()` was removed in SQLAlchemy 2.0; `engine.begin()` commits
# the UPDATE when the block exits without an error.
with engine.begin() as conn:
    conn.execute(text("UPDATE films SET title='Doctor Strange 2' WHERE year='2016'"))

### Delete values from table

In [None]:
# `Engine.execute()` was removed in SQLAlchemy 2.0; `engine.begin()` commits
# the DELETE when the block exits without an error.
with engine.begin() as conn:
    conn.execute(text("DELETE FROM films WHERE year='2016'"))

### Delete Table

In [None]:
# `Engine.execute()` was removed in SQLAlchemy 2.0; run the DROP inside an
# explicit transaction instead.
with engine.begin() as conn:
    conn.execute(text("DROP TABLE films;"))

## Microsoft SQL Server (TODO)

We need **pyodbc** library to interact with SQL Server

In [None]:
# !pip3 install pyodbc

In [None]:
# import sys
# sys.path.insert(0, "/usr/local/lib/python3.5/")
import pyodbc

In [None]:
import pyodbc
# An ODBC connection string is a list of `key=value` pairs separated by ";".
# Python joins the adjacent string literals below into one string, so every
# piece needs its own trailing ";" - without it the Server, Database and uid
# entries run together into one malformed value.
# NOTE(review): the credentials are hard-coded here; move them out of the
# notebook (environment variable / getpass) before sharing it.
cnxn = pyodbc.connect("Driver={SQL Server};"
                        "Server=sqlserverdemodb.cznthudneeub.eu-west-1.rds.amazonaws.com;"
                        "Database=master;"
                        "uid=sahil;pwd=abcdef1234")
# df = pd.read_sql_query('select * from table', cnxn)

In [None]:
# Open a cursor on the connection, run the query and print each row's repr.
cursor = cnxn.cursor()
cursor.execute('SELECT * FROM Table')

for row in cursor:
    print('row = {!r}'.format(row))