# Exploring Window Functions with SQL and Pandas
Based on http://www.helenanderson.co.nz/sql-window-functions-part-1/

In [1]:
import numpy as np
import pandas as pd
# Seed NumPy's global random generator so the randomly generated sample data
# below is the same on every fresh run.
np.random.seed(20190904)

  (fname, cnt))


In [2]:
import psycopg2
import psycopg2.extras



In [3]:
import os

# Generate sample dataset

In [4]:
N = 1000  # default number of rows generated by sample_dataset
NB_YEARS = 1  # default number of years the random sale dates span
START_DATE = '2018-01-01'  # default origin date for the random sale dates

In [5]:
# Scratch check: integer day offsets from an origin date are converted to
# datetimes, and `.dt.strftime` formats them as strings (here: the year).
pd.Series(pd.to_datetime([1,2,3], origin='2018-01-01', unit='D')).dt.strftime('%Y')

0    2018
1    2018
2    2018
dtype: object

In [6]:
def sample_dataset(start_date=START_DATE, nb_years=NB_YEARS, size=N):
    """Build a DataFrame of randomly generated sales orders.

    All random values are drawn from NumPy's global generator
    (``np.random``), so the result depends on the generator state at call
    time.

    Parameters
    ----------
    start_date : str
        Origin date; every sale date is this date plus a random whole
        number of days.
    nb_years : int
        Day offsets are drawn from ``[0, 365 * nb_years)``.
    size : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        One row per order with the columns ``salesorderid``, ``sale_date``,
        ``ordermonth``, ``territoryid``, ``subtotal``, ``taxamt``,
        ``freight``, ``totaldue`` and ``customerid``.
    """
    # Draw everything random up front, in a fixed order (day offsets,
    # territories, subtotals, customers), so a given seed always maps to the
    # same dataset.
    day_offsets = np.random.randint(0, 365 * nb_years, size=size)
    territories = np.random.randint(0, 10, size)
    amounts = np.random.uniform(500, 2500, size=size)
    customers = np.random.randint(1, size//10 + 1, size) + 2000

    sale_date = pd.Series(pd.to_datetime(day_offsets, unit='D', origin=start_date))
    subtotal = pd.Series(amounts)
    taxamt = 0.075 * subtotal
    freight = 0.025 * subtotal

    return pd.DataFrame({
        # Order ids are consecutive integers starting at 4000.
        'salesorderid': np.arange(4000, 4000 + size),
        'sale_date': sale_date,
        'ordermonth': sale_date.dt.strftime('%Y-%m'),
        'territoryid': pd.Series(territories),
        'subtotal': subtotal,
        'taxamt': taxamt,
        'freight': freight,
        # NOTE(review): totaldue is subtotal + tax only; freight is not
        # included -- confirm this is intended.
        'totaldue': subtotal + taxamt,
        'customerid': pd.Series(customers),
    })

In [7]:
# Generate the sample data once with the default parameters and preview the
# first rows.
dataset = sample_dataset()
dataset.head()

Unnamed: 0,salesorderid,sale_date,ordermonth,territoryid,subtotal,taxamt,freight,totaldue,customerid
0,4000,2018-10-15,2018-10,2,1867.940043,140.095503,46.698501,2008.035546,2079
1,4001,2018-08-27,2018-08,7,1037.805109,77.835383,25.945128,1115.640492,2058
2,4002,2018-08-19,2018-08,3,2318.164763,173.862357,57.954119,2492.02712,2016
3,4003,2018-08-30,2018-08,1,2104.229867,157.81724,52.605747,2262.047107,2042
4,4004,2018-03-25,2018-03,6,2370.179348,177.763451,59.254484,2547.942799,2067


In [8]:
# Write the sample data to sales.csv (without the DataFrame index column).
dataset.to_csv('sales.csv', index=False)

In [9]:
# Show the current working directory, i.e. where sales.csv was written.
pwd

'/Users/sinayoks/dev/scratch-sql/sql'

In [10]:
# Peek at the first lines of the CSV file via a shell command.
!head sales.csv

salesorderid,sale_date,ordermonth,territoryid,subtotal,taxamt,freight,totaldue,customerid
4000,2018-10-15,2018-10,2,1867.9400426683374,140.0955032001253,46.69850106670844,2008.0355458684626,2079
4001,2018-08-27,2018-08,7,1037.8051087290623,77.83538315467968,25.94512771822656,1115.640491883742,2058
4002,2018-08-19,2018-08,3,2318.1647625406504,173.86235719054878,57.95411906351626,2492.027119731199,2016
4003,2018-08-30,2018-08,1,2104.22986723532,157.817240042649,52.605746680883,2262.0471072779687,2042
4004,2018-03-25,2018-03,6,2370.179347712942,177.76345107847067,59.254483692823555,2547.9427987914128,2067
4005,2018-05-09,2018-05,8,1078.934365009996,80.9200773757497,26.973359125249903,1159.8544423857459,2055
4006,2018-07-15,2018-07,5,900.3227133346401,67.524203500098,22.508067833366002,967.8469168347381,2057
4007,2018-05-28,2018-05,5,1720.0294820306863,129.00221115230147,43.00073705076716,1849.0316931829877,2050
4008,2018-11-16,2018-11,7,994.5347728977076,74.59010796732807,24.8633693224426

# Create the Postgres database

In [11]:
def connect_db(dbname, user=None, **kwargs):
    """Open a psycopg2 connection to ``dbname`` with autocommit enabled.

    Parameters
    ----------
    dbname : str
        Name of the database to connect to.
    user : str, optional
        Role to connect as; falls back to ``'postgres'`` when not given
        (or otherwise falsy).
    **kwargs
        Passed through unchanged to ``psycopg2.connect``.

    Returns
    -------
    The open connection. The caller is responsible for closing it.
    """
    role = user if user else 'postgres'
    connection = psycopg2.connect(database=dbname, user=role, **kwargs)
    # Autocommit: every statement takes effect immediately, without an
    # explicit commit.
    connection.autocommit = True
    return connection

In [12]:
def create_db(dbname='sales', drop=True):
    """Create the database ``dbname`` and a role with the same name.

    Connects to the ``postgres`` database through ``connect_db`` to issue the
    statements.

    Parameters
    ----------
    dbname : str
        Name used for both the new database and the new role.
    drop : bool
        When true (the default), an existing database and role of that name
        are dropped first. When false, nothing is dropped, so the CREATE
        statements fail if the database or role already exists.
    """
    con = connect_db('postgres')
    try:
        with con.cursor() as cur:
            if drop:
                cur.execute(f'DROP DATABASE IF EXISTS {dbname};')
                cur.execute(f'DROP USER IF EXISTS {dbname};')
            cur.execute(f'CREATE DATABASE {dbname};')
            cur.execute(f'CREATE USER {dbname};')
    finally:
        # Always release the connection, even when a statement fails.
        con.close()

def create_table(dbname='sales', filename='sales.csv'):
    """Import data from csv into postgres database.

    Uses psycopg2 and sql COPY.
    Could also use pandas.to_sql but this removes dependency on pandas and is
    also more scalable.

    The table is named after ``dbname``; any existing table of that name is
    dropped and recreated before loading. SELECT on the table is then granted
    to the role named ``dbname``.

    Parameters
    ----------
    dbname : str
        Database to connect to; also used as the table name and as the role
        that receives SELECT.
    filename : str
        Path of the csv file (with a header row) to load. It is converted to
        an absolute path before being handed to COPY.

    Raises
    ------
    AssertionError
        If the csv file does not exist.
    """
    filename = os.path.abspath(filename)
    assert os.path.exists(filename), f'csv file {filename} not found'
    con = connect_db(dbname)
    try:
        with con.cursor() as cur:
            cur.execute(f'DROP TABLE IF EXISTS {dbname};')
            # NOTE(review): territoryid is declared FLOAT although the csv
            # holds whole numbers -- confirm whether INTEGER was intended.
            cur.execute(f"""
    CREATE TABLE {dbname} (
    salesorderid INTEGER UNIQUE NOT NULL,
    sale_date DATE NOT NULL,
    ordermonth VARCHAR (7) NOT NULL,
    territoryid FLOAT NOT NULL,
    subtotal FLOAT NOT NULL,
    taxamt FLOAT NOT NULL,
    freight FLOAT NOT NULL,
    totaldue FLOAT NOT NULL,
    customerid INTEGER NOT NULL
    );""")
        with con.cursor() as cur:
            # COPY ... FROM '<path>' is executed by the database server, so
            # the absolute path must be readable from the server's side.
            cur.execute(
            f"COPY {dbname} FROM '{filename}' WITH (FORMAT csv, HEADER true);"
            )
            cur.execute(f'GRANT SELECT ON {dbname} TO {dbname};')
    finally:
        con.close()

def query(query, dbname='sales', columns=dataset.columns):
    """Run a SQL statement and return all result rows as named tuples.

    Connects to ``dbname`` as the role of the same name and closes the
    connection again once the rows have been fetched.

    Parameters
    ----------
    query : str
        SQL text to execute.
    dbname : str
        Database to connect to; also used as the connecting role.
    columns
        Currently unused. Its default is taken from ``dataset.columns`` when
        the function is defined.

    Returns
    -------
    list
        The rows returned by ``cursor.fetchall()``; each row is a named tuple
        whose fields are the result column names.
    """
    con = connect_db(dbname, user=dbname)
    try:
        with con.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor) as cur:
            cur.execute(query)
            return cur.fetchall()
    finally:
        # Leaving the cursor block does not close the connection itself;
        # close it explicitly so every call does not leak one.
        con.close()

def queryframe(*args, **kwargs):
    """Run ``query`` with the given arguments and wrap the rows in a DataFrame.

    All positional and keyword arguments are forwarded to ``query`` unchanged.
    """
    return pd.DataFrame(query(*args, **kwargs))
    
    

In [13]:
# (Re)create the `sales` database and role using the defaults.
create_db()

In [14]:
# Create the `sales` table and load sales.csv into it.
create_table()

In [15]:
# Smoke test: fetch two rows as named tuples.
query('select * from sales limit 2')

[Record(salesorderid=4000, sale_date=datetime.date(2018, 10, 15), ordermonth='2018-10', territoryid=2.0, subtotal=1867.94004266834, taxamt=140.095503200125, freight=46.6985010667084, totaldue=2008.03554586846, customerid=2079),
 Record(salesorderid=4001, sale_date=datetime.date(2018, 8, 27), ordermonth='2018-08', territoryid=7.0, subtotal=1037.80510872906, taxamt=77.8353831546797, freight=25.9451277182266, totaldue=1115.64049188374, customerid=2058)]

In [16]:
# Same query, returned as a DataFrame.
queryframe('select * from sales limit 2')

Unnamed: 0,salesorderid,sale_date,ordermonth,territoryid,subtotal,taxamt,freight,totaldue,customerid
0,4000,2018-10-15,2018-10,2.0,1867.940043,140.095503,46.698501,2008.035546,2079
1,4001,2018-08-27,2018-08,7.0,1037.805109,77.835383,25.945128,1115.640492,2058


# Running Aggregations
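Running sum, min, max or average by day. The example below computes the running sum, min and average of `subtotal` within each `sale_date`, ordered by `salesorderid`.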


In [18]:
def running_aggs_with_sql():
    """Compute per-day running aggregates of ``subtotal`` with SQL window functions.

    For every order, the running sum, minimum and average of ``subtotal`` are
    taken over the orders of the same ``sale_date``, ordered by
    ``salesorderid``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sale_date``, ``salesorderid``, ``subtotal``,
        ``total_sales``, ``min_sales`` and ``avg_sales``, ordered by
        ``sale_date`` and then ``salesorderid``.
    """
    # The final ORDER BY includes salesorderid as a tie-breaker: ordering by
    # sale_date alone leaves the order of rows within a day unspecified,
    # which would make a positional comparison with another frame unreliable.
    running_agg_query = """
-- running total	
select
  sale_date,
  salesorderid,
  subtotal,
  sum(subtotal) 
  over(
  	partition by sale_date 
	order by salesorderid
  ) as total_sales,
  min(subtotal)
  over(
    partition by sale_date 
    order by salesorderid 
  ) min_sales, 
  avg(subtotal)
  over(
    partition by sale_date 
    order by salesorderid 
  ) avg_sales
from 
  sales
where 
  sale_date between 
'2018-01-01' and '2018-12-31'
order by sale_date, salesorderid;
    """
    res = queryframe(running_agg_query)
    return res

In [38]:
# Run the SQL version and display the result.
aggs_sql = running_aggs_with_sql()
aggs_sql

Unnamed: 0,sale_date,salesorderid,subtotal,total_sales,min_sales,avg_sales
0,2018-01-01,4065,1551.147359,1551.147359,1551.147359,1551.147359
1,2018-01-01,4572,1622.415676,3173.563035,1551.147359,1586.781517
2,2018-01-01,4625,1693.465470,4867.028504,1551.147359,1622.342835
3,2018-01-02,4131,1658.014062,1658.014062,1658.014062,1658.014062
4,2018-01-02,4157,623.691745,2281.705807,623.691745,1140.852904
...,...,...,...,...,...,...
995,2018-12-30,4883,2497.886501,8096.963085,742.371161,1619.392617
996,2018-12-31,4301,1118.252026,1118.252026,1118.252026,1118.252026
997,2018-12-31,4434,1984.526544,3102.778571,1118.252026,1551.389285
998,2018-12-31,4507,1625.339854,4728.118425,1118.252026,1576.039475


In [154]:
# Pandas equivalent of the SQL window functions.
# Sort once by (sale_date, salesorderid) so that every running aggregate is
# computed in the same order the SQL windows use, instead of relying on the
# row order `dataset` happens to have.
sales_sorted = dataset.sort_values(['sale_date', 'salesorderid'])
subtotal_by_day = sales_sorted.groupby('sale_date')['subtotal']

aggs_pandas = (
    sales_sorted[['sale_date', 'salesorderid', 'subtotal']]
    .assign(
        total_sales=subtotal_by_day.cumsum(),
        min_sales=subtotal_by_day.cummin(),
        # expanding() prepends the group key as an index level; drop it so
        # the values align with the original row index again.
        avg_sales=subtotal_by_day.expanding().mean().reset_index(level=0, drop=True),
    )
    .reset_index(drop=True)
    .astype({'sale_date': 'str'})
)
aggs_pandas

Unnamed: 0,sale_date,salesorderid,subtotal,total_sales,min_sales,avg_sales
0,2018-01-01,4065,1551.147359,1551.147359,1551.147359,1551.147359
1,2018-01-01,4572,1622.415676,3173.563035,1551.147359,1586.781517
2,2018-01-01,4625,1693.465470,4867.028504,1551.147359,1622.342835
3,2018-01-02,4131,1658.014062,1658.014062,1658.014062,1658.014062
4,2018-01-02,4157,623.691745,2281.705807,623.691745,1140.852904
...,...,...,...,...,...,...
995,2018-12-30,4883,2497.886501,8096.963085,742.371161,1619.392617
996,2018-12-31,4301,1118.252026,1118.252026,1118.252026,1118.252026
997,2018-12-31,4434,1984.526544,3102.778571,1118.252026,1551.389285
998,2018-12-31,4507,1625.339854,4728.118425,1118.252026,1576.039475


In [155]:
# Check that the SQL and pandas results agree (approximately, since the
# values are floats) for every aggregate column.
# Uses the public `pd.testing` API; `pd.util.testing` and the
# `check_less_precise` argument are deprecated.
for col in ['total_sales', 'min_sales', 'avg_sales']:
    pd.testing.assert_series_equal(
        aggs_sql[col],
        aggs_pandas[col],
        check_exact=False,
        check_dtype=False,
    )