**Goals**:
* connect to the database
* use a CASE WHEN statement to map the day-of-week number to a weekday name
* visualise the data using plotly

In [1]:
# import the libraries needed

import pandas as pd
import sqlite3

import plotly.express as px

### Connect to the database

In [2]:
# connect to database
#
# sqlite3.connect() on a plain path silently creates a new, empty database
# file when the path is wrong, and the mistake only surfaces later as
# "no such table" errors. Opening through a URI with mode=rw keeps the
# connection read/write but raises sqlite3.OperationalError immediately if
# the file does not exist.

conn = sqlite3.connect('file:../data/prod_sample.db?mode=rw', uri=True)

In [3]:
# echo the connection object; the notebook shows its repr as the cell output
conn

<sqlite3.Connection at 0x12c305300>

In [4]:
# list the tables recorded in the database's sqlite_master catalogue

cursor = conn.cursor()
table_rows = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
print(table_rows)

[('stock_description',), ('online_retail_history',)]


### Recap: Plot the number of distinct customers per country

In [5]:
# count the distinct (non-null) customers in each country, largest count first

query = """select Country,
                  count(distinct CustomerID) as number_customers
          from online_retail_history
          where CustomerID is not null
          group by Country
          order by number_customers desc
        """

# run the query and load the result set into a DataFrame
customers_per_country = pd.read_sql(sql=query, con=conn)
customers_per_country.head(n=5)  # preview only the first five rows


Unnamed: 0,Country,number_customers
0,United Kingdom,3950
1,Germany,95
2,France,87
3,Spain,31
4,Belgium,25


In [14]:
# bar chart of the customer counts: one bar per country

fig = px.bar(
    customers_per_country,
    x='Country',
    y='number_customers',
    title='Number of distinct customers per Country',
)
fig.show()

In [7]:
# same bar chart, now with a readable y-axis label, custom fonts and red bars

axis_labels = {"number_customers": "Number of Customers"}

fig = px.bar(
    customers_per_country,
    x='Country',
    y='number_customers',
    title='Number of Customers per Country',
    labels=axis_labels,
)

# global font settings first, then the title-specific overrides; the update
# methods return the figure, so the bar colour change is chained on
fig.update_layout(
    font_family="Courier New",
    font_color="blue",
    title_font_family="Arial",
    title_font_color="black",
).update_traces(marker_color='red')

# fig.update_xaxes(title_font_family="Arial")

fig.show()

### Plot the Total Number of Purchases by Day of Week

In [8]:
# Total Quantity for 2011, grouped by month and by day of week.
# The case expression turns the strftime('%w') day number (0 = Sunday) into a
# weekday name; rows come back ordered by month, then by descending total.
query = """select strftime('%m', InvoiceDate) as month,
                  case cast (strftime('%w', InvoiceDate) as integer)
                      when 0 then 'Sunday'
                      when 1 then 'Monday'
                      when 2 then 'Tuesday'
                      when 3 then 'Wednesday'
                      when 4 then 'Thursday'
                      when 5 then 'Friday'
                      else 'Saturday' end as weekday,
                   sum(Quantity) as number_purchases
                from online_retail_history
            where cast(strftime('%Y', InvoiceDate) as integer) = 2011
            group by month, weekday
            order by month, number_purchases desc
"""

# load the aggregated rows into a DataFrame and preview the first ten
quantity_by_day = pd.read_sql(sql=query, con=conn)
quantity_by_day.head(n=10)

Unnamed: 0,month,weekday,number_purchases
0,1,Friday,71918
1,1,Tuesday,60489
2,1,Wednesday,56530
3,1,Thursday,52269
4,1,Monday,46837
5,1,Sunday,20923
6,2,Tuesday,69522
7,2,Thursday,54650
8,2,Monday,53484
9,2,Wednesday,46115


In [None]:
# Exercise: fill in the ??? placeholders, then uncomment and run.
# The template is kept as a comment because `???` is not valid Python and
# would raise a SyntaxError, stopping a "Restart & Run All".
#
# fig = px.bar(???, x="???", y="???", animation_frame="month", title = 'Number of Purchases by Weekday')
# fig.show()



In [9]:
# Animated bar chart: one frame per month, one bar per weekday.

# Keep the weekdays in calendar order on the x-axis; otherwise the categories
# follow their order of appearance in the data, which is sorted by volume.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Axes are not rescaled between animation frames, so fix the y-range to the
# overall maximum (plus a little headroom) to keep every month fully visible
# and comparable on the same scale.
y_max = quantity_by_day["number_purchases"].max() * 1.05

fig = px.bar(quantity_by_day, x="weekday", y="number_purchases", animation_frame="month",
             category_orders={"weekday": weekday_order},
             range_y=[0, y_max],
             title = 'Number of Purchases by Weekday')
fig.show()

Question: What patterns are we seeing? Which day tends to have the highest number of sales?
- Sunday has the lowest number of purchases across all months

### Number of Transactions Per Month in 2011 for top 5 markets

In [10]:
# option 1 - restrict the rows with WHERE ... IN and a hand-written list of the
# five countries that have the most customers
#
# NOTE(review): count(*) counts table rows; if a row is one invoice line rather
# than one whole invoice, number_invoices overstates the invoice count - confirm
# against the table schema.

query = """
            select Country,
                   strftime('%m', date(InvoiceDate)) AS invoice_month,
                   count(*) as number_invoices
            from online_retail_history as t1
            where strftime('%Y', date(InvoiceDate)) = '2011'
                and Country in ('United Kingdom', 'Germany', 'France', 'Spain', 'Belgium')
            group by Country, invoice_month 
"""

# load the per-country, per-month counts and preview the first ten rows
number_invoices_per_month = pd.read_sql(sql=query, con=conn)
number_invoices_per_month.head(n=10)


Unnamed: 0,Country,invoice_month,number_invoices
0,Belgium,1,63
1,Belgium,2,120
2,Belgium,3,163
3,Belgium,4,123
4,Belgium,5,149
5,Belgium,6,231
6,Belgium,7,129
7,Belgium,8,197
8,Belgium,9,185
9,Belgium,10,267


In [11]:
# option 2 - let a subquery pick the five countries with the most distinct
# customers instead of hard-coding their names

query = """
            select Country,
                   strftime('%m', date(InvoiceDate)) AS invoice_month,
                   count(*) as number_invoices
            from online_retail_history as t1
            where strftime('%Y', date(InvoiceDate)) = '2011'
                and Country in (select Country
                          from online_retail_history
                          where CustomerID is not null
                          group by Country
                          order by count(distinct CustomerID) desc
                          limit 5)
            group by Country, invoice_month 
"""

# load the per-country, per-month counts and preview the first ten rows
number_invoices_per_month = pd.read_sql(sql=query, con=conn)
number_invoices_per_month.head(n=10)


Unnamed: 0,Country,invoice_month,number_invoices
0,Belgium,1,63
1,Belgium,2,120
2,Belgium,3,163
3,Belgium,4,123
4,Belgium,5,149
5,Belgium,6,231
6,Belgium,7,129
7,Belgium,8,197
8,Belgium,9,185
9,Belgium,10,267


In [12]:
# monthly counts as a bar chart, with each country drawn in its own colour
fig = px.bar(
    number_invoices_per_month,
    x="invoice_month",
    y="number_invoices",
    color="Country",
    title='Number of Invoices per Month by Country',
)
fig.show()

In [13]:
# animated version: one frame per country, with a fixed y-range so every
# frame is drawn on the same scale
fig = px.bar(
    number_invoices_per_month,
    x="invoice_month",
    y="number_invoices",
    animation_frame="Country",
    range_y=[0, 60000],
    title='Number of Invoices per Month by Country',
)
fig.show()

What patterns are we seeing?