**Goals**:
* connect to the database
* sql syntax recap


In [1]:
# import the libraries needed

import pandas as pd
import sqlite3


### Conect to the database

In [2]:
# connect to database

conn = sqlite3.connect('../data/prod_sample.db')

In [3]:
conn

<sqlite3.Connection at 0x12718f6c0>

In [4]:
# identify tables in database

cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('stock_description',), ('online_retail_history',)]


## Activity: SQL Refresher

In [5]:
# remind yourself what the data looks like
# select all columns and the first five rows from each table

query = """select *
           from online_retail_history
           limit 5
"""

pd.read_sql(query, conn)

Unnamed: 0,Invoice,StockCode,Quantity,InvoiceDate,Price,CustomerID,Country
0,536365,85123A,6,2010-12-01 08:26:00,2.55,u1785,United Kingdom
1,536365,71053,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom
2,536365,84406B,8,2010-12-01 08:26:00,2.75,u1785,United Kingdom
3,536365,84029G,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom
4,536365,84029E,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom


In [6]:
query = """select *
           from stock_description
           limit 5
"""

pd.read_sql(query, conn)

Unnamed: 0,StockCode,Description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE


In [7]:
# how many orders have been placed?

query = """select count(distinct Invoice) as number_orders
           from online_retail_history
"""

pd.read_sql(query, conn)

Unnamed: 0,number_orders
0,25900


In [8]:
# what is the min, max and average price of items sold in Germany?

query = """select min(Price) as min_price,
                  max(Price) as max_price,
                  round(avg(Price), 2) as avg_price
           from online_retail_history
           where country = 'Germany'
"""

pd.read_sql(query, conn)

Unnamed: 0,min_price,max_price,avg_price
0,0.0,599.5,3.97


In [23]:
# what is the stock code of the most expensive item sold in Germany?
# option 1

query = """select distinct StockCode
           from online_retail_history
           where country = 'Germany' and Price = 599.5
"""

pd.read_sql(query, conn)


Unnamed: 0,StockCode
0,M


In [25]:
# option 2 - pass in a sub query which identifies the most expensive price

query = """select distinct StockCode
           from online_retail_history
           where country = 'Germany' and Price = (
                                                  select max(Price)
                                                  from online_retail_history
                                                  where country = 'Germany')
"""

pd.read_sql(query, conn)


Unnamed: 0,StockCode
0,M


In [9]:
# what is the description of the most expensive stock code sold in Germany?

query = """select Description
           from stock_description
           where StockCode = 'M'
"""

pd.read_sql(query, conn)

Unnamed: 0,Description


In [10]:
# which Country has the highest average Price of items sold?

query = """select Country, 
                  round(avg(Price) , 2) as avg_price
           from online_retail_history
           group by Country
           order by avg(Price) desc
           limit 1
"""

pd.read_sql(query, conn)


Unnamed: 0,Country,avg_price
0,Singapore,109.65


In [16]:
# how many stocks in the online_retail_history table do not have a description?
# query that joins the description to the online_retail_history table

query = """select t1.StockCode,
                  t2.Description
           from online_retail_history as t1
           left join stock_description as t2 on t1.StockCode = t2.StockCode
           where Description is null
"""

pd.read_sql(query, conn)

Unnamed: 0,StockCode,Description
0,D,
1,21705,
2,46000M,
3,21134,
4,21703,
...,...,...
2374,21703,
2375,21704,
2376,M,
2377,21705,


In [24]:
# how many stocks in the online_retail_history table do not have a description?

query = """select count(*) as number_stocks_no_description
           from online_retail_history as t1
           left join stock_description as t2 on t1.StockCode = t2.StockCode
           where t2.Description is null
"""

pd.read_sql(query, conn)

Unnamed: 0,number_stocks_no_description
0,2379


### Activity: Total Number of Purchases by Day of Week

In [21]:
# query to extract the year, month and dow

query = """select InvoiceDate,
                  cast(strftime('%Y', InvoiceDate) as integer) as year,
                  cast(strftime('%m', InvoiceDate) as integer) as month,
                  cast (strftime('%w', InvoiceDate) as integer) as dow
           from online_retail_history"""

pd.read_sql(query, conn)

Unnamed: 0,InvoiceDate,year,month,dow
0,2010-12-01 08:26:00,2010,12,3
1,2010-12-01 08:26:00,2010,12,3
2,2010-12-01 08:26:00,2010,12,3
3,2010-12-01 08:26:00,2010,12,3
4,2010-12-01 08:26:00,2010,12,3
...,...,...,...,...
541905,2011-12-09 12:50:00,2011,12,5
541906,2011-12-09 12:50:00,2011,12,5
541907,2011-12-09 12:50:00,2011,12,5
541908,2011-12-09 12:50:00,2011,12,5


In [None]:
query = """select cast(strftime('%m', InvoiceDate) as integer) as month,
                  case cast (strftime('%w', InvoiceDate) as integer)
                      when 0 then 'Sunday'
                      when 1 then 'Monday'
                      when 2 then '???'
                      when 3 then '???'
                      when 4 then '???'
                      when 5 then '???'
                      else 'Saturday' end as weekday,
                   sum(Quantity) as number_purchases
                from online_retail_history
            where cast(strftime('%Y', InvoiceDate) as integer) = ????
            group by month, weekday
            order by month, number_purchases desc
"""

pd.read_sql(query, conn)

In [22]:
query = """select cast(strftime('%m', InvoiceDate) as integer) as month,
                  case cast (strftime('%w', InvoiceDate) as integer)
                      when 0 then 'Sunday'
                      when 1 then 'Monday'
                      when 2 then 'Tuesday'
                      when 3 then 'Wednesday'
                      when 4 then 'Thursday'
                      when 5 then 'Friday'
                      else 'Saturday' end as weekday,
                   sum(Quantity) as number_purchases
                from online_retail_history
            where cast(strftime('%Y', InvoiceDate) as integer) = 2011
            group by month, weekday
            order by month, number_purchases desc
"""

pd.read_sql(query, conn)

Unnamed: 0,month,weekday,number_purchases
0,1,Friday,71918
1,1,Tuesday,60489
2,1,Wednesday,56530
3,1,Thursday,52269
4,1,Monday,46837
...,...,...,...
67,12,Monday,44119
68,12,Friday,39902
69,12,Wednesday,39612
70,12,Tuesday,29026
