## Monthly sales report by creating analytics tables (SQL)!


In [1]:
import pandas as pd
import numpy as np

import sqlalchemy as sq
from sqlalchemy.orm import Session
from sqlalchemy_utils import database_exists, create_database

import csv
from io import StringIO

In [2]:
#connect to postgres db
engine = sq.create_engine('postgresql://jupyterdb_user:jupyterdb_user_123@db:5432/analytics_tables')
if not database_exists(engine.url):
    create_database(engine.url)

print(database_exists(engine.url), engine.url)

OperationalError: (psycopg2.OperationalError) could not translate host name "db" to address: nodename nor servname provided, or not known

(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [None]:
insp = sq.inspect(engine)
schemas = insp.get_schema_names()
for schema in schemas:
    print("schema: ", schema)
    for table_name in insp.get_table_names(schema=schema):
        print("table_name: ",table_name)
        for column in insp.get_columns(table_name, schema=schema):
            print("Column: ", column)

schema:  information_schema
table_name:  sql_parts
Column:  {'name': 'feature_id', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': False, 'comment': None}
Column:  {'name': 'feature_name', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': False, 'comment': None}
Column:  {'name': 'is_supported', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': False, 'comment': None}
Column:  {'name': 'is_verified_by', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': False, 'comment': None}
Column:  {'name': 'comments', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': False, 'comment': None}
table_name:  sql_implementation_info
Column:  {'name': 'implementation_info_id', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': False, 'comment': None}
Column:  {'name': 'implementation_info_name', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': False, 'comment

### Define functions

In [80]:
def select(sql: str, engine = engine) -> pd.DataFrame:
    with engine.connect() as connection:
        df_sql = pd.DataFrame(connection.execute(sq.text(sql)))
    return df_sql

In [6]:
# from https://stackoverflow.com/questions/23103962/how-to-write-dataframe-to-postgres-table

def psql_insert_copy(table, conn, keys, data_iter):
    # gets a DBAPI connection that can provide a cursor
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)

        columns = ', '.join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = '{}.{}'.format(table.schema, table.name)
        else:
            table_name = table.name

        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
            table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)

### Prepare Data from csv

In [7]:
df = pd.read_csv('data/data.csv', encoding= 'cp1252' , header=0)

In [8]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [9]:
df.shape

(541909, 8)

In [10]:
df.dtypes

InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID     float64
Country         object
dtype: object

In [11]:
df.columns.str.lower()

Index(['invoiceno', 'stockcode', 'description', 'quantity', 'invoicedate',
       'unitprice', 'customerid', 'country'],
      dtype='object')

In [12]:
df.columns = ['invoice_no', 'stock_code', 'description', 'quantity', 'invoice_date',
       'unitprice', 'customer_id', 'country']

In [13]:
df.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unitprice,customer_id,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [14]:
df['invoice_date'] = pd.to_datetime(df['invoice_date'], format = '%m/%d/%Y %H:%M')

In [15]:
df.dtypes

invoice_no              object
stock_code              object
description             object
quantity                 int64
invoice_date    datetime64[ns]
unitprice              float64
customer_id            float64
country                 object
dtype: object

### Create table in postgres and upload data

In [16]:
df.to_sql('ecommerce', 
          engine, index=False,
          if_exists='replace',
          method=psql_insert_copy)

In [17]:
sql = ''' select * from ecommerce t limit 10 '''

In [18]:
select(sql, engine)

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unitprice,customer_id,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom


In [19]:
#cгенерируем диапазон дат
sql = ''' select
t.invoice_date,
date_trunc('month', t.invoice_date) as month
 from ecommerce t
 limit 100
'''

In [20]:
select(sql,engine)

Unnamed: 0,invoice_date,month
0,2010-12-01 08:26:00,2010-12-01
1,2010-12-01 08:26:00,2010-12-01
2,2010-12-01 08:26:00,2010-12-01
3,2010-12-01 08:26:00,2010-12-01
4,2010-12-01 08:26:00,2010-12-01
...,...,...
95,2010-12-01 09:37:00,2010-12-01
96,2010-12-01 09:37:00,2010-12-01
97,2010-12-01 09:37:00,2010-12-01
98,2010-12-01 09:37:00,2010-12-01


In [41]:
#узнаем период по датам
sql = ''' select
min(date_trunc('month', t.invoice_date)) as min_month,
max(date_trunc('month', t.invoice_date)) as max_month
from ecommerce t
'''

In [42]:
select(sql,engine)

Unnamed: 0,min_month,max_month
0,2010-12-01,2011-12-01


In [122]:
# генерируем месяцы по периоду
sql = ''' SELECT date_trunc('day', dd):: date as month
FROM generate_series
        (( select
min(date_trunc('month', t.invoice_date)) as min_month
from ecommerce t)
        , ( select
max(date_trunc('month', t.invoice_date)) as max_month
from ecommerce t)
        , '1 day'::interval) dd
'''

In [123]:
select(sql,engine)

Unnamed: 0,month
0,2010-12-01
1,2010-12-02
2,2010-12-03
3,2010-12-04
4,2010-12-05
...,...
361,2011-11-27
362,2011-11-28
363,2011-11-29
364,2011-11-30


### Create table user

In [113]:
sql = ''' select

t.*,
case when t.customer_id is null then -1 else t.customer_id end as customer_id,
date_trunc('month', t.invoice_date) as month

from ecommerce t
limit 100

'''

In [114]:
select(sql,engine)

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unitprice,customer_id,country,customer_id.1,month
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,17850.0,2010-12-01
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,17850.0,2010-12-01
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,17850.0,2010-12-01
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,17850.0,2010-12-01
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,17850.0,2010-12-01
...,...,...,...,...,...,...,...,...,...,...
95,536378,21559,STRAWBERRY LUNCH BOX WITH CUTLERY,6,2010-12-01 09:37:00,2.55,14688.0,United Kingdom,14688.0,2010-12-01
96,536378,22352,LUNCH BOX WITH CUTLERY RETROSPOT,6,2010-12-01 09:37:00,2.55,14688.0,United Kingdom,14688.0,2010-12-01
97,536378,21212,PACK OF 72 RETROSPOT CAKE CASES,120,2010-12-01 09:37:00,0.42,14688.0,United Kingdom,14688.0,2010-12-01
98,536378,21975,PACK OF 60 DINOSAUR CAKE CASES,24,2010-12-01 09:37:00,0.55,14688.0,United Kingdom,14688.0,2010-12-01


In [115]:
#на каждого пользователя найти минимальную дату и месяц, гроупбай
sql = ''' select

case when t.customer_id is null then -1 else t.customer_id end as customer_id,
min(t.invoice_date) as reg_date,
min(date_trunc('month', t.invoice_date)) as reg_month

from ecommerce t
group by
case when t.customer_id is null then -1 else t.customer_id end

limit 100
'''

In [116]:
select(sql)

Unnamed: 0,customer_id,reg_date,reg_month
0,-1.0,2010-12-01 11:52:00,2010-12-01
1,12346.0,2011-01-18 10:01:00,2011-01-01
2,12347.0,2010-12-07 14:57:00,2010-12-01
3,12348.0,2010-12-16 19:09:00,2010-12-01
4,12349.0,2011-11-21 09:51:00,2011-11-01
...,...,...,...
95,12461.0,2011-07-08 10:23:00,2011-07-01
96,12462.0,2011-02-09 14:44:00,2011-02-01
97,12463.0,2011-04-12 12:47:00,2011-04-01
98,12464.0,2011-02-03 13:30:00,2011-02-01


### Check our query

In [117]:
#проверяем что не потеряли пользователей
sql = ''' select
count(distinct case when t.customer_id is null then -1 else t.customer_id end)
from ecommerce t
'''

In [118]:
select(sql)

Unnamed: 0,count
0,4373


In [119]:
#проверяем полностью запрос
sql = '''
with users as (

select

case when t.customer_id is null then -1 else t.customer_id end as customer_id,
min(t.invoice_date) as reg_date,
min(date_trunc('month', t.invoice_date)) as reg_month

from ecommerce t
group by
case when t.customer_id is null then -1 else t.customer_id end)

select count(distinct t.customer_id) from users t
limit 100
'''

In [120]:
select(sql)

Unnamed: 0,count
0,4373


### Cross join tables users and dates

In [134]:
#делаем cross join
sql = '''
with users as (
    select
    case when t.customer_id is null then -1 else t.customer_id end as customer_id,
    min(t.invoice_date) as reg_date,
    min(date_trunc('month', t.invoice_date)) as reg_month

    from ecommerce t
    group by
    case when t.customer_id is null then -1 else t.customer_id end
    ),

dates as (
    SELECT date_trunc('day', dd):: date as month
    FROM generate_series
            (( select
    min(date_trunc('month', t.invoice_date)) as min_month
    from ecommerce t)
            , ( select
    max(date_trunc('month', t.invoice_date)) as max_month
    from ecommerce t)
            , '1 day'::interval) dd
)

select t.month, u.* from dates t
join users u on 1=1
where u.customer_id = 12648.0
order by t.month
limit 100

'''

In [135]:
select(sql)

Unnamed: 0,month,customer_id,reg_date,reg_month
0,2010-12-01,12648.0,2011-07-08 11:55:00,2011-07-01
1,2010-12-02,12648.0,2011-07-08 11:55:00,2011-07-01
2,2010-12-03,12648.0,2011-07-08 11:55:00,2011-07-01
3,2010-12-04,12648.0,2011-07-08 11:55:00,2011-07-01
4,2010-12-05,12648.0,2011-07-08 11:55:00,2011-07-01
...,...,...,...,...
95,2011-03-06,12648.0,2011-07-08 11:55:00,2011-07-01
96,2011-03-07,12648.0,2011-07-08 11:55:00,2011-07-01
97,2011-03-08,12648.0,2011-07-08 11:55:00,2011-07-01
98,2011-03-09,12648.0,2011-07-08 11:55:00,2011-07-01


In [130]:
#удаляем лишние месяца до даты регистрации

In [144]:
#делаем cross join
sql = '''
with users as (
    select
    case when t.customer_id is null then -1 else t.customer_id end as customer_id,
    min(t.invoice_date) as reg_date,
    min(date_trunc('month', t.invoice_date)) as reg_month

    from ecommerce t
    group by
    case when t.customer_id is null then -1 else t.customer_id end
    ),

dates as (
    SELECT date_trunc('day', dd):: date as month
    FROM generate_series
            (( select
    min(date_trunc('month', t.invoice_date)) as min_month
    from ecommerce t)
            , ( select
    max(date_trunc('month', t.invoice_date)) as max_month
    from ecommerce t)
            , '1 day'::interval) dd
)

select t.month, u.* from dates t
join users u on t.month >= u.reg_month
where u.customer_id = 12648.0
order by t.month
limit 100

'''

In [145]:
select(sql)

Unnamed: 0,month,customer_id,reg_date,reg_month
0,2011-07-01,12648.0,2011-07-08 11:55:00,2011-07-01
1,2011-07-02,12648.0,2011-07-08 11:55:00,2011-07-01
2,2011-07-03,12648.0,2011-07-08 11:55:00,2011-07-01
3,2011-07-04,12648.0,2011-07-08 11:55:00,2011-07-01
4,2011-07-05,12648.0,2011-07-08 11:55:00,2011-07-01
...,...,...,...,...
95,2011-10-04,12648.0,2011-07-08 11:55:00,2011-07-01
96,2011-10-05,12648.0,2011-07-08 11:55:00,2011-07-01
97,2011-10-06,12648.0,2011-07-08 11:55:00,2011-07-01
98,2011-10-07,12648.0,2011-07-08 11:55:00,2011-07-01


In [150]:
#делаем template
sql = '''
with users as (
    select
    case when t.customer_id is null then -1 else t.customer_id end as customer_id,
    min(t.invoice_date) as reg_date,
    min(date_trunc('month', t.invoice_date)) as reg_month

    from ecommerce t
    group by
    case when t.customer_id is null then -1 else t.customer_id end
    ),

dates as (
    SELECT date_trunc('day', dd):: date as month
    FROM generate_series
            (( select
    min(date_trunc('month', t.invoice_date)) as min_month
    from ecommerce t)
            , ( select
    max(date_trunc('month', t.invoice_date)) as max_month
    from ecommerce t)
            , '1 day'::interval) dd
),

template as (
    select t.month, u.* from dates t
    join users u on t.month >= u.reg_month
 
)

select * from template t
where t.customer_id = 12648.0
order by t.month
limit 100

'''

In [151]:
select(sql)

Unnamed: 0,month,customer_id,reg_date,reg_month
0,2011-07-01,12648.0,2011-07-08 11:55:00,2011-07-01
1,2011-07-02,12648.0,2011-07-08 11:55:00,2011-07-01
2,2011-07-03,12648.0,2011-07-08 11:55:00,2011-07-01
3,2011-07-04,12648.0,2011-07-08 11:55:00,2011-07-01
4,2011-07-05,12648.0,2011-07-08 11:55:00,2011-07-01
...,...,...,...,...
95,2011-10-04,12648.0,2011-07-08 11:55:00,2011-07-01
96,2011-10-05,12648.0,2011-07-08 11:55:00,2011-07-01
97,2011-10-06,12648.0,2011-07-08 11:55:00,2011-07-01
98,2011-10-07,12648.0,2011-07-08 11:55:00,2011-07-01


In [1]:
data = {1: 'one', '2': 'won', True: 'true'}

In [2]:
data

{1: 'true', '2': 'won'}

In [3]:
data[1]

'true'