In [None]:
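# this demo reads the file "Data/stage_expenses_cleaned.csv" -- presumably the export of the
# "stage_expenses_cleaned" table created in "demo_SQL_DataCleanup" -- and loads it into an
# in-memory SQLite table of the same name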
import pandas as pd
import sqlite3
import os
import matplotlib.pyplot as plt

# Create an in-memory SQLite database. It exists only for the lifetime of this
# connection, so this cell has to be re-run after every kernel restart.
db = sqlite3.connect(":memory:")

# Read the cleaned expenses CSV into a dataframe (the path is relative to the
# notebook's working directory; the header row is inferred from the file).
df1 = pd.read_csv("Data/stage_expenses_cleaned.csv", header="infer")

# Load the dataframe into the SQLite staging table, replacing any previous copy
# so the cell can be re-run safely.
# index=False: the dataframe's automatic row index is not part of the data, so
# it must not be written as an extra "index" column (and matching SQLite index)
# that would then show up in SELECT *, PRAGMA_TABLE_INFO and sqlite_master.
df1.to_sql("stage_expenses_cleaned", db, if_exists="replace", index=False)

def run_query(query):
    """Execute a SQL statement against the notebook's database connection.

    Parameters
    ----------
    query : str
        SQL text to run on the module-level ``db`` connection.

    Returns
    -------
    pandas.DataFrame
        The result set of the statement, one dataframe row per result row.
    """
    result_frame = pd.read_sql_query(sql=query, con=db)
    return result_frame

# Display the current working directory as the cell's output -- presumably a
# sanity check for the relative "Data/..." CSV path read in this cell.
os.getcwd()

In [None]:
# DataFrame.describe() returns summary statistics for the dataframe
# (count, mean, std, min, quartiles, max); by default only numeric columns are included
df1.describe()

In [None]:
# List everything recorded in the database schema. sqlite_master is SQLite's
# built-in catalog: one row per schema object (tables, but also indexes, views
# and triggers), including the SQL text that created it.
query = """
SELECT * 
FROM sqlite_master
ORDER BY name;
"""
# last expression of the cell, so the resulting dataframe is displayed
run_query(query)

In [None]:
# Get information about the staging table's columns. PRAGMA_TABLE_INFO is the
# table-valued form of SQLite's "PRAGMA table_info": it returns one row per
# column of the named table (position, name, declared type, not-null flag,
# default value, primary-key flag).
query_table_help = """
select * from PRAGMA_TABLE_INFO('stage_expenses_cleaned');
"""
run_query(query_table_help)

In [None]:
# Select every row and column of the staging table to look at the raw data.
# NOTE: this assignment overwrites the `query` variable set by the earlier
# sqlite_master cell.
query = """
SELECT * 
FROM stage_expenses_cleaned;
"""
run_query(query)

In [None]:
# Key metrics by US state: summed amount, average amount and transaction count
# per Vendor_State, plus one '(Total)' row computed over the whole table.
# The trailing ORDER BY belongs to the combined UNION ALL result (not only to
# the second SELECT), so the '(Total)' row is sorted together with the state rows.
query_state_KM = """
select 
 Vendor_State
,sum(Vendor_Amount) as "Sum Amount"
,avg(Vendor_Amount) as "Avg Amount"
,count(*) as "Trans Count"
from stage_expenses_cleaned
group by Vendor_State

union all

select 
 '(Total)'
,sum(Vendor_Amount) as "Sum Amount"
,avg(Vendor_Amount) as "Avg Amount"
,count(*) as "Trans Count"
from stage_expenses_cleaned

order by "Sum Amount" desc;
"""

# last expression of the cell, so the metrics dataframe is displayed
run_query(query_state_KM)


In [None]:
# Key metrics by vendor category: summed amount, average amount and transaction
# count per Vendor_Category, plus one '(Total)' row computed over the whole table.
# The second SELECT carries no column aliases; the result's column names are
# taken from the first SELECT of the UNION ALL, and the trailing ORDER BY sorts
# the combined result on its "Sum Amount" column.
query_category_KM = """
select 
 Vendor_Category
,sum(Vendor_Amount) as "Sum Amount"
,avg(Vendor_Amount) as "Avg Amount"
,count(*) as "Trans Count"
from stage_expenses_cleaned
group by Vendor_Category

union all

select 
 '(Total)'
,sum(Vendor_Amount) 
,avg(Vendor_Amount) 
,count(*) 
from stage_expenses_cleaned

order by "Sum Amount" desc;
"""

# last expression of the cell, so the metrics dataframe is displayed
run_query(query_category_KM)

In [None]:
# Transaction count per state, intended as the input for the state map/chart.
query_st_map = """
select 
 Vendor_State as "VendorState"
,count(*) as "TransCount"
from stage_expenses_cleaned
group by Vendor_State
order by "VendorState" asc;
"""

# Run the query (SQL returns the rows ordered by state), then re-sort on the
# pandas side: busiest states first, with a fresh 0..n-1 row index.
df2 = (
    run_query(query_st_map)
    .sort_values(by="TransCount", ascending=False)
    .reset_index(drop=True)
)

# display the re-sorted dataframe as the cell's output
df2

In [None]:
## Plotting by Shane -- 
## requires manual install of plotly library
## "conda install -c plotly plotly"
## https://anaconda.org/plotly/plotly

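# NOTE(review): disabled draft, not runnable as written -- it imports d3_plot but calls d3_bar,
# and `df` is not defined in the cells shown here. The column names used ('Vendor_State',
# 'Trans Count') match the output of query_state_KM, so presumably df = run_query(query_state_KM).
#from plotly.express import d3_plot
#df.rename(columns={'Vendor_State': 'Vendor State'}, inplace=True)
#d3_bar(df, x='Vendor State', y='Trans Count', color='Vendor State', title='Stage Expenses Cleaned')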

In [None]:
# we add the %matplotlib inline magic command to see our plot inside the jupyter notebook. 
# If you build the plot in a .py file, then remove the %matplotlib inline command as it will raise an error
%matplotlib inline

# Bar chart of the per-state transaction counts held in df2; DataFrame.plot
# returns the matplotlib Axes, which is then used to label the chart.
# legend=False: there is only a single series, so a legend adds nothing.
my_plot = df2.plot(
    x="VendorState",
    y="TransCount",
    kind="bar",
    legend=False,
    title="Transaction Count by State",
    color="green",
)
my_plot.set_xlabel("States")
my_plot.set_ylabel("Count")

# plt.show() renders the open figure(s) and does not take a figure/axes
# argument: its only parameter is the keyword-only `block`, so passing the Axes
# positionally raises a TypeError outside the notebook's inline backend.
plt.show()
