# SQL Querying

This notebook can be used to query tables in the Congressional Data database. To use it, set the environment variable `CD_DWH` to the database connection string. If you do not have the credentials, please message us in the #datasci-congressdata Slack channel and/or talk to a project lead.

**It is best practice not to hard-code database URI strings directly in notebooks or code: once we push to GitHub, the credentials would be public for anyone to see.**

In [None]:
import os
import sys
import math

import pandas as pd
pd.options.display.max_columns = 999
import sqlalchemy as sqla
from sqlalchemy import create_engine

from plotnine import *
import plotly
import urllib, json

# Read the connection string from the environment so that credentials never
# live in the notebook itself.
DB_URI = os.getenv('CD_DWH')
if not DB_URI:
    # os.getenv returns None when the variable is unset; fail fast with an
    # actionable message instead of the opaque error create_engine would raise.
    raise RuntimeError(
        "Environment variable 'CD_DWH' is not set. Set it to the database "
        "connection string before starting the notebook kernel."
    )
# Engine shared by every query cell in this notebook.
engine = create_engine(DB_URI)

In [None]:
# Check that the kernel is using the Conda environment "datasci-congressional-data".
# The path displayed below should look something like
# '/Users/Username/anaconda3/envs/datasci-congressional-data/bin/python'.
# If "datasci-congressional-data" does NOT appear in it, the notebook is running in the
# wrong Python environment: go through the onboarding docs and/or talk to a project lead.
sys.executable

Below are the tables that currently exist in the database!

## Query table & visualize frequency plots

In [None]:
# Load every column of the candidate-contributions table into a DataFrame.
QUERRY = """
SELECT
    *
  FROM trg_analytics.candidate_contributions """

# The connection is only needed for the duration of the read.
with engine.begin() as db_connection:
    results = pd.read_sql(sql=QUERRY, con=db_connection)

In [None]:
# Summary statistics for the contribution amounts.
# To inspect or export the raw rows instead, use results.head() or
# results.to_csv("trg_analytics-candidate_contrib.csv").
results.loc[:, 'transaction_amount'].describe()

In [None]:
# Trim the contribution amounts to the open interval (0, 99th percentile) so that
# non-positive amounts and the extreme top 1% do not dominate the summary.
transactions = results['transaction_amount']
# Upper cut-off: the 99th percentile, floored to a whole number.
ninety_ninth_quantile = math.floor(transactions.quantile(0.99))
# Explicit strict comparisons instead of Series.between(..., inclusive=False):
# the boolean form of `inclusive` was deprecated in pandas 1.3 and is rejected by
# pandas 2.x, whereas these comparisons exclude both bounds on every version.
contribs = transactions[(transactions > 0) & (transactions < ninety_ninth_quantile)]
contribs.describe()

In [None]:
# Histogram of the trimmed contribution amounts, labelled so the figure can be
# read on its own when the notebook is skimmed.
ax = contribs.hist(bins=50)
ax.set(
    title='Contribution amounts (0 < amount < 99th percentile)',
    xlabel='transaction_amount',
    ylabel='Number of transactions',
);

In [None]:
# Total contribution amount per candidate office, drawn as horizontal bars
# sorted from the smallest total to the largest.
candidate_contrib = (
    results
    .groupby('recipient_candidate_office')
    .agg({'transaction_amount': ['sum']})
)
# The aggregation produces two-level column labels; select the single
# ('transaction_amount', 'sum') column as a Series.
candidate_contrib_vals = candidate_contrib[('transaction_amount', 'sum')]
candidate_contrib_vals.sort_values(ascending=True).plot.barh()

# Frequency polygon of contributions over transaction_date, one line per
# candidate office.
# aes() is given column *names*, which plotnine resolves against `data`; passing
# Series objects bypasses `data` and is not the idiomatic mapping form.
# NOTE(review): binwidth is expressed in the units of the transaction_date
# scale -- confirm that 2000.0 is the intended bin width.
(
    ggplot(data=results)
    + geom_freqpoly(
        aes(x='transaction_date', color='recipient_candidate_office'),
        binwidth=2000.0,
    )
)

In [None]:
# Keep only State Assembly races in the 2015 election cycle
# (the cycle is compared as the string '2015').
is_state_assembly = results['recipient_candidate_office'].eq('State Assembly')
is_2015_cycle = results['election_cycle'].eq('2015')
st_assem_results = results[is_state_assembly & is_2015_cycle]

# Total amount received per candidate; chart the 30 largest totals.
st_assem_candid = (
    st_assem_results
    .groupby('recipient_candidate_name')
    .agg({'transaction_amount': ['sum']})
)
st_assem_candid_vals = st_assem_candid[('transaction_amount', 'sum')]
st_assem_candid_vals.sort_values(ascending=False).iloc[:30].plot.bar()

In [None]:
# Total amount contributed per donor organization within st_assem_results,
# charted from the largest total to the smallest.
donor_contrib = (
    st_assem_results
    .groupby('donor_organization')
    .agg({'transaction_amount': ['sum']})
)
donor_contrib_vals = donor_contrib[('transaction_amount', 'sum')]
donor_contrib_vals.sort_values(ascending=False).plot.bar()

In [None]:
# Fraction of the 'OTH' donor-organization rows in st_assem_results whose
# donor name contains 'PAC'.
# The mask is built from st_assem_results itself: a mask taken from the full
# `results` frame has a different index, so pandas would have to realign it
# (emitting a UserWarning) before it could filter the subset.
oth_results = st_assem_results[st_assem_results['donor_organization'] == 'OTH']
# na=False treats rows with a missing donor_name as non-matching; without it
# the mask would contain NaN and the boolean indexing would raise.
oth_pac = oth_results[oth_results['donor_name'].str.contains('PAC', na=False)]
(len(oth_pac) / len(oth_results))

In [None]:
# Preview the first five 'OTH' rows.
# Disabled follow-up check, kept for reference:
#   len(oth_results[oth_results['donor_industry']!='0']) / len(oth_results)
oth_results.head(n=5)

In [None]:
# Fraction of the 'SCC' donor-organization rows in st_assem_results whose
# donor name contains 'PAC'.
# The mask is built from st_assem_results itself: a mask taken from the full
# `results` frame has a different index, so pandas would have to realign it
# (emitting a UserWarning) before it could filter the subset.
scc_results = st_assem_results[st_assem_results['donor_organization'] == 'SCC']
# na=False treats rows with a missing donor_name as non-matching; without it
# the mask would contain NaN and the boolean indexing would raise.
scc_pac = scc_results[scc_results['donor_name'].str.contains('PAC', na=False)]
(len(scc_pac) / len(scc_results))

In [None]:
# Fraction of the 'RCP' donor-organization rows in st_assem_results whose
# donor name contains 'PAC'.
# The mask is built from st_assem_results itself: a mask taken from the full
# `results` frame has a different index, so pandas would have to realign it
# (emitting a UserWarning) before it could filter the subset.
rcp_results = st_assem_results[st_assem_results['donor_organization'] == 'RCP']
# na=False treats rows with a missing donor_name as non-matching; without it
# the mask would contain NaN and the boolean indexing would raise.
rcp_pac = rcp_results[rcp_results['donor_name'].str.contains('PAC', na=False)]
(len(rcp_pac) / len(rcp_results))

In [None]:
# Distinct election-cycle values present in the data.
results.loc[:, 'election_cycle'].unique()

In [None]:
# Group the contribution rows by (election cycle, donor zip code, recipient
# district). Only the GroupBy object is built here; nothing is aggregated yet.
flow_columns = [
    'election_cycle',
    'transaction_amount',
    'donor_zip_code',
    'recipient_candidate_district',
]
flow_keys = [
    'election_cycle',
    'donor_zip_code',
    'recipient_candidate_district',
]
contrib_flow = results[flow_columns].groupby(flow_keys)

In [None]:
# Contribution amounts for one example group. The key follows the grouping
# order: (election_cycle, donor_zip_code, recipient_candidate_district).
# contrib_flow.groups lists every available key.
example_flow_key = ('2013', '95814-3963', '14')
contrib_flow.get_group(example_flow_key).loc[:, 'transaction_amount']

### For instructions on setting up Plotly for producing charts see the following link
https://plot.ly/python/getting-started/

I'm using credentials stored in `~/.plotly/.credentials`

#### For an explanation of the chart below see the following links
https://plot.ly/~alishobeiri/1257/plotly-sankey-diagrams/

#### Data for the diagram can be found here.
https://github.com/plotly/dash-app-datasets/blob/master/scottish-votes.csv

**Note:** this is a temporary dataset taken from the example, used here as a placeholder.

In [None]:
# Placeholder data for the Sankey diagram, read from the working directory.
scottish_df = pd.read_csv('scottish-votes.csv')

# Node appearance and labels; missing labels are dropped.
node_spec = {
    'pad': 10,
    'thickness': 30,
    'line': {'color': "black", 'width': 0},
    'label': scottish_df['Node, Label'].dropna(),
    # NOTE(review): 'Color' is the only column passed without dropna() --
    # confirm that this is intended.
    'color': scottish_df['Color'],
}

# Link endpoints, weights and colours, each with missing values dropped.
link_spec = {
    'source': scottish_df['Source'].dropna(),
    'target': scottish_df['Target'].dropna(),
    'value': scottish_df['Value'].dropna(),
    'color': scottish_df['Link Color'].dropna(),
}

# Horizontal Sankey trace that fills the whole figure domain.
data_trace = {
    'type': 'sankey',
    'domain': {'x': [0, 1], 'y': [0, 1]},
    'orientation': "h",
    'valueformat': ".0f",
    'node': node_spec,
    'link': link_spec,
}

layout = {
    'title': "Campaign Contributions from Donor Zip Code to Candidate District",
    'height': 772,
    'width': 950,
    'font': {'size': 10},
}

fig = {'data': [data_trace], 'layout': layout}
# NOTE(review): plotly.plotly is the online plotting module; newer plotly
# releases moved it into the separate chart_studio package -- confirm which
# plotly version this environment pins.
plotly.plotly.iplot(fig, validate=False)