In [None]:
import os
from urllib.parse import quote

import boto3
import pandas as pd
import psycopg2
import sqlalchemy

In [None]:
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import psycopg2

links
[sqlalchemy](http://docs.sqlalchemy.org/en/latest/core/selectable.html)

[securitygroups](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html#vpc-security-groups)



# Setting up an RDS instance

## Contents
<h3>
    <ol>
        <li>Setting up the database instance using boto3</li>
        <li>Connecting to the instance with sqlalchemy</li>
        <li>Loading Data with sqlalchemy</li>
        <li>Testing out various sql needed for next steps</li>
    </ol>
</h3>

In [None]:
# Create the boto3 client for the 'rds' service; the RDS calls in the cells below go through it.
# Make sure you have configured your aws credentials in your environment.
client = boto3.client('rds')


In [None]:
# Sanity check: print the client object to confirm it was created.
print(client)

I will build the configuration step by step. In practice you can pass everything as keyword arguments in a single call to `create_db_instance`, or unpack one dict literal with `**`. The RDS API reference is here: [rds api](http://docs.aws.amazon.com/AmazonRDS/latest/APIReference/API_CreateDBInstance.html). From this API we can see that `create_db_instance` accepts:

    response = client.create_db_instance(
        DBName='string',
        DBInstanceIdentifier='string',
        AllocatedStorage=123,
        DBInstanceClass='string',
        Engine='string',
        MasterUsername='string',
        MasterUserPassword='string',
        DBSecurityGroups=[
            'string',
        ],
        VpcSecurityGroupIds=[
            'string',
        ],
        AvailabilityZone='string',
        DBSubnetGroupName='string',
        PreferredMaintenanceWindow='string',
        DBParameterGroupName='string',
        BackupRetentionPeriod=123,
        PreferredBackupWindow='string',
        Port=123,
        MultiAZ=True|False,
        EngineVersion='string',
        AutoMinorVersionUpgrade=True|False,
        LicenseModel='string',
        Iops=123,
        OptionGroupName='string',
        CharacterSetName='string',
        PubliclyAccessible=True|False,
        Tags=[
            {
                'Key': 'string',
                'Value': 'string'
            },
        ],
        DBClusterIdentifier='string',
        StorageType='string',
        TdeCredentialArn='string',
        TdeCredentialPassword='string',
        StorageEncrypted=True|False,
        KmsKeyId='string',
        CopyTagsToSnapshot=True|False
    )

In [None]:
# Step 1: start from an empty dict. The cells below add one entry per
# create_db_instance keyword argument, and the dict is later unpacked with **.
db_configuration = {}


Why are we using a dictionary when the function will take only keyword arguments?  Quick recap on why.

In [None]:
def printKwargs(first, second):
    """Print the two arguments on one line."""
    print(first, second)

aDict = {}
aDict['first'] = 1
aDict['second'] = 2

# Passing the dict itself binds it to `first` and leaves `second` unfilled, so
# Python raises TypeError. The error is the point of the demonstration, so it
# is caught and shown instead of aborting a Restart & Run All.
try:
    printKwargs(aDict)
except TypeError as err:
    print('TypeError:', err)

Right, we can't pass a dictionary in to the function as a single argument.  However, Python lets you unpack your dictionary into keyword arguments with `**`!

In [None]:
# ** unpacks the dict into keyword arguments, so each key of aDict is matched
# to the parameter of the same name (first, second).
printKwargs(**aDict)

In [None]:
# Back to the actual work: we build the dict of keyword arguments one key at a time.
# First off, let's set the database name.
db_configuration['DBName'] = 'cfsdb'

In [None]:
# Next we need the db instance identifier, which is just a way to identify the db.
# Here it is the same string as DBName.
db_configuration['DBInstanceIdentifier'] = 'cfsdb'

In [None]:
# The allocated storage accepts an integer argument between 5 and 6144; we'll use 5.
# NOTE(review): the unit is presumably gigabytes, and the allowed range may depend
# on engine and storage type - confirm against the API reference.
db_configuration['AllocatedStorage'] = 5

In [None]:
# Next we need to choose the compute and memory capacity of the instance.
# We will choose db.t2.micro as it is free.
# NOTE(review): "free" presumably refers to the AWS free tier - confirm it applies to your account.
db_configuration['DBInstanceClass'] = 'db.t2.micro'

In [None]:
# Next we choose the engine to run. This is just the flavor of sql to run; we'll choose postgres
# since a lot of python developers like postgres (this claim is unsubstantiated).
db_configuration['Engine'] = 'postgres'

In [None]:
# Choose the version of the engine. We're using 9.4.1 since it is available in all regions.
# NOTE(review): confirm this version is still offered by RDS before running.
db_configuration['EngineVersion'] = '9.4.1'

In [None]:
# Master user name and password are read from the environment variables
# AWS_DB_USER and AWS_DB_PASSWORD. Indexing os.environ raises KeyError when a
# variable is missing, so the instance (which is configured as publicly
# accessible further down) is never created with a built-in, guessable login.
db_configuration['MasterUsername'] = os.environ['AWS_DB_USER']

# the password must have at least 8 characters
db_configuration['MasterUserPassword'] = os.environ['AWS_DB_PASSWORD']


Next up are the security groups. Hopefully you already have a default security group ready to go;
if not, we'll create one right now.
`DBSecurityGroups` takes a list of strings as its argument.

In [None]:
# First let's see what DB security groups exist (the response is shown as the
# cell output); you can use any of these.
client.describe_db_security_groups()

In [None]:
# Also list the DB subnet groups; a subnet group name is set in the configuration further down.
client.describe_db_subnet_groups()

In [None]:
# Create a dedicated DB security group for this example by calling
# create_db_security_group on the rds client with plain keyword arguments.
# API reference: http://docs.aws.amazon.com/AmazonRDS/latest/APIReference/API_CreateDBSecurityGroup.html
client.create_db_security_group(
    DBSecurityGroupName='cfs_data_group',
    DBSecurityGroupDescription='Security group for my dataset',
)

In [None]:
# Set the security groups list in the configuration: a one-element list holding
# the 'cfs_data_group' name.
db_configuration['DBSecurityGroups'] = ['cfs_data_group']

In [None]:
# Set the availability zone.
# NOTE(review): 'us-west-1c' presumably has to belong to the region your client
# is configured for - confirm and adjust.
db_configuration['AvailabilityZone'] = 'us-west-1c'

In [None]:
# Use the subnet group named 'default'.
db_configuration['DBSubnetGroupName'] = 'default'

In [None]:
# VPC security group ids to attach to the instance (a list of strings).
# NOTE(review): this value carries a 'vpc-' prefix, which looks like a VPC id
# rather than a security group id (those normally start with 'sg-'). It is also
# specific to one AWS account - confirm and replace with an id from your own.
db_configuration['VpcSecurityGroupIds'] = ['vpc-bba470de']

In [None]:
# we need to provide a window in which to let aws do maintenance.
# we'll just use monday from 06:00-10:00
db_configuration['PreferredMaintenanceWindow'] = 'Mon:06:00-Mon:10:00'

In [None]:
# We need to provide a limit to how long aws keeps our db backups; values are from 0 to 35.
# NOTE(review): the unit is presumably days - confirm against the API reference.
db_configuration['BackupRetentionPeriod'] = 5

In [None]:
# Provide the port number to connect to; postgres uses 5432.
db_configuration['Port'] = 5432

In [None]:
# Tell it that we do not want a multi-AZ deployment.
db_configuration['MultiAZ'] = False

In [None]:
    # Give the license model for the instance; we're using postgresql-license.
db_configuration['LicenseModel'] = 'postgresql-license'

In [None]:
# Indicate that we will publicly access the data.
# NOTE(review): together with the master login this presumably exposes the
# database endpoint outside the VPC - make sure the credentials are strong.
db_configuration['PubliclyAccessible'] = True


In [None]:
# Indicate the storage type; we use 'standard'.
db_configuration['StorageType'] = 'standard'

In [None]:
# We do not want to encrypt the storage.
db_configuration['StorageEncrypted'] = False

In [None]:
# Review the finished configuration. The master password is masked so that the
# secret does not end up in the saved cell output.
{key: ('********' if key == 'MasterUserPassword' else value)
 for key, value in db_configuration.items()}

Create the database instance

In [None]:
# First attempt: create the instance from the full configuration and keep the response.
# NOTE(review): the notes that follow suggest this call was rejected while both
# kinds of security group were in the configuration - confirm.
created_db = client.create_db_instance(**db_configuration)

If that call is rejected, remove the `DBSecurityGroups` entry from the configuration (keeping `VpcSecurityGroupIds`) and try again.

In [None]:
# Drop the DBSecurityGroups entry from the configuration. The membership check
# makes the cell safe to re-run: a second run finds nothing to delete instead
# of raising KeyError.
if 'DBSecurityGroups' in db_configuration:
    del db_configuration['DBSecurityGroups']

In [None]:
# Create the instance from the current db_configuration; the response is kept in created_db.
created_db = client.create_db_instance(**db_configuration)

In [None]:
# Block until RDS reports the new instance as available, so that the endpoint
# is only read once the instance is ready to accept connections.
db_identifier = db_configuration['DBInstanceIdentifier']
client.get_waiter('db_instance_available').wait(DBInstanceIdentifier=db_identifier)

# Ask for this instance by identifier instead of taking whichever instance is
# listed first, and read the endpoint fields by key instead of relying on the
# order (and number) of the dict's values.
endpoint = client.describe_db_instances(
    DBInstanceIdentifier=db_identifier)['DBInstances'][0]['Endpoint']
add, port = endpoint['Address'], endpoint['Port']  # host address and port of the instance

In [None]:
# Hard-coded endpoint: these two assignments overwrite any `add` / `port`
# values set earlier in the session.
# NOTE(review): presumably the author's own instance - substitute your own
# endpoint or skip this cell.
add = 'cfsdb.cs4yyoqmq4gl.us-west-1.rds.amazonaws.com'
port = 5432

## Using sqlalchemy

In [None]:
# creating engine
# The user name and password are percent-encoded (quote with safe='') before
# they are placed in the URL, so characters such as '@', ':' or '/' in the
# credentials cannot be mistaken for URL delimiters.
engine = create_engine(
    'postgresql+psycopg2://{username}:{password}@{add}:{port}/{db_name}'.format(
        username=quote(db_configuration['MasterUsername'], safe=''),
        password=quote(db_configuration['MasterUserPassword'], safe=''),
        add=add,
        port=port,
        db_name=db_configuration['DBName']),
    echo=True)  # echo=True makes the engine log the SQL statements it issues

In [None]:
# Display the database name that was put into the connection URL.
db_configuration['DBName']

In [None]:
# import sqlalchemy api
from sqlalchemy import Table, Column, Integer, String, MetaData, ForeignKey


In [None]:
# Open a connection on the engine; the raw SQL queries below run through it.
con = engine.connect()

In [None]:
# Load the transport-mode CSV into a DataFrame, then write it to the
# 'transport_mode' table through the engine. if_exists='replace' recreates the
# table on every run; index=False leaves the DataFrame index out of the table.
transport_df = pd.read_csv("./csv/transport_mode.csv")
transport_df.to_sql('transport_mode', con=engine, if_exists='replace', index=False)

In [None]:
# Check the load: select the "Mode Description" column from the new table over
# the open connection and fetch all rows.
con.execute('SELECT "Mode Description" FROM transport_mode').fetchall()

In [None]:
# read in all our files using the conversion pd-->sql
def get_db_name(f):
    """Return the table name for a data file path.

    The name is the file's base name with the directory part and the final
    extension removed, e.g. './csv/FIPS_States.csv' -> 'FIPS_States'. Names
    without an extension are returned unchanged, and dots inside the name are
    kept ('a.b.csv' -> 'a.b').
    """
    return os.path.splitext(os.path.basename(f))[0]

get_db_name('./csv/FIPS_States.csv')

In [None]:
[pd.read_csv(f).to_sql(get_db_name(f), con=engine, if_exists='ignore', index=False) for f in 
 ['./csv/state_latlon.csv', './csv/sctg.csv', './csv/cfs_areas.csv', './csv/FIPS_States.csv', './csv/naics.csv']]

In [None]:
pd.read_csv('./csv/naics.csv').to_sql(get_db_name('./csv/naics.csv'), con=engine, if_exists='fail', index=False)

In [None]:
# Read the shipment sample into a DataFrame (presumably the first 50k rows of
# the CFS 2012 public-use file, going by the file name).
df = pd.read_csv('./csv/cfs_2012_pumf_first_50k.csv')

In [None]:
#df[:1].to_sql('transactions', con=engine, if_exists='fail', index=False, chunksize=10000)

In [None]:
#pd.read_csv('./cfs_2012_pumf_csv.txt').to_sql('transactions', con=engine, if_exists='fail', index=False, chunksize=10000)

In [None]:
#con.execute('DROP TABLE transactions')

In [None]:
#con.execute('SELECT "SHIPMT_ID" FROM transactions ORDER BY 1 DESC LIMIT 1').fetchall()

In [None]:
#con.execute('SELECT "QUARTER", SUM("WGT_FACTOR") from transactions GROUP BY 1').fetchall()