**Goals**:
* connect to the database
* query the tables in the database
* **Activity**: basic sql queries to explore the data
* Create one dataset which contains the descriptions of the stock


In [6]:
# import the libraries needed

import pandas as pd
import sqlite3

### Connect to the database

In [7]:
# open a connection to the sample SQLite database (relative path)

db_path = '../data/prod_sample.db'
conn = sqlite3.connect(db_path)

In [8]:
# show the connection object to confirm that it was created
conn

<sqlite3.Connection at 0x124d8c5d0>

In [9]:
# list the tables in the database by reading SQLite's sqlite_master catalogue

tables_sql = "SELECT name FROM sqlite_master WHERE type='table';"
cursor = conn.cursor()
table_rows = cursor.execute(tables_sql).fetchall()
print(table_rows)

[('online_retail_history',), ('stock_description',)]


### Query the Tables 

Table 1: Online Retail History

In [10]:
# write a query to select the first ten rows from the online_retail_history table
# (the query is only defined here; it is executed in the next cell)

query = """select *
           from online_retail_history
           limit 10"""

# echo the SQL text so the exact statement is visible
query

'select *\n           from online_retail_history\n           limit 10'

In [11]:
# execute the query with pandas and keep the result as the DataFrame online_retail

online_retail = pd.read_sql(sql=query, con=conn)

# display the frame: the first ten rows of the online_retail_history table
online_retail

Unnamed: 0,Invoice,StockCode,Quantity,InvoiceDate,Price,CustomerID,Country
0,536365,85123A,6,2010-12-01 08:26:00,2.55,u1785,United Kingdom
1,536365,71053,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom
2,536365,84406B,8,2010-12-01 08:26:00,2.75,u1785,United Kingdom
3,536365,84029G,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom
4,536365,84029E,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom
5,536365,22752,2,2010-12-01 08:26:00,7.65,u1785,United Kingdom
6,536365,21730,6,2010-12-01 08:26:00,4.25,u1785,United Kingdom
7,536366,22633,6,2010-12-01 08:28:00,1.85,u1785,United Kingdom
8,536366,22632,6,2010-12-01 08:28:00,1.85,u1785,United Kingdom
9,536368,22960,6,2010-12-01 08:34:00,4.25,u13047,United Kingdom


In [30]:
# count the rows per StockCode and keep only the stock code with the most rows

query = """select StockCode, 
                  count(*) as number_transactions
           from online_retail_history
           group by StockCode
           order by 2 desc
           limit 1
"""

pd.read_sql(sql=query, con=conn)

Unnamed: 0,StockCode,number_transactions
0,85123A,2313


Table 2: Stock Description

In [12]:
# pull the first ten rows of the stock_description table into a DataFrame

query = """select *
          from stock_description
          limit 10
"""

stock_description = pd.read_sql(sql=query, con=conn)

In [13]:
# preview the data: the ten stock_description rows loaded in the previous cell

stock_description

Unnamed: 0,StockCode,Description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE
5,10124G,ARMY CAMO BOOKCOVER TAPE
6,10125,MINI FUNKY DESIGN TAPES
7,10133,COLOURING PENCILS BROWN TUBE
8,10135,COLOURING PENCILS BROWN TUBE
9,11001,ASSTD DESIGN RACING CAR PEN


In [14]:
# How many rows (one row per purchased item line) are in online_retail_history?

# count every row of the table

query = """select count(*) as number_rows
           from online_retail_history
        """

pd.read_sql(sql=query, con=conn)

Unnamed: 0,number_rows
0,541910


In [15]:
# How many rows are in the stock_description table?

# count every row of the table

query = """select count(*) as number_rows
          from stock_description
        """

pd.read_sql(sql=query, con=conn)

Unnamed: 0,number_rows
0,3952


### Activity: Basic Exploration of the data using SQL 

In [20]:
# scratch check: number of distinct customers (the result column is left unaliased)

query = """select count(distinct CustomerID)
           from online_retail_history
            """

pd.read_sql(sql=query, con=conn)

Unnamed: 0,count(distinct CustomerID)
0,4372


In [11]:
# Q1. How many customers does the dataset contain?

# count each CustomerID once, however many rows it appears on

query = """select count(distinct CustomerID) as number_customers
          from online_retail_history
"""

pd.read_sql(sql=query, con=conn)

Unnamed: 0,number_customers
0,4372


In [12]:
# Q2. How many invoices does the table contain?

# an invoice spans several rows, so count the distinct Invoice values

query = """select count(distinct Invoice) as number_invoices
          from online_retail_history
"""

pd.read_sql(sql=query, con=conn)

Unnamed: 0,number_invoices
0,25900


In [13]:
# Q3. When was the first and last invoice?

# earliest and latest InvoiceDate in the table

query = """select min(InvoiceDate) as min_invoice_date,
                  max(InvoiceDate) as max_invoice_date
          from online_retail_history
"""

pd.read_sql(sql=query, con=conn)

# The invoices cover just over one year - from 1st Dec 2010 to 9th Dec 2011

Unnamed: 0,min_invoice_date,max_invoice_date
0,2010-12-01 08:26:00,2011-12-09 12:50:00


In [14]:
# Q4. How many different types of stocks did the customers purchase?
# (number of distinct StockCode values in online_retail_history)

query = """select count(distinct StockCode) as number_stocks
          from online_retail_history
"""

pd.read_sql(sql=query, con=conn)

Unnamed: 0,number_stocks
0,4070


In [15]:
# Q5. What was the most popular stock?
# number_stocks is the number of online_retail_history rows for each StockCode;
# sorting by it in descending order and keeping one row gives the top stock.

query = """select StockCode,
                  count(*) as number_stocks
          from online_retail_history
          group by StockCode
          order by 2 desc
          limit 1
"""

pd.read_sql(sql=query, con=conn)

Unnamed: 0,StockCode,number_stocks
0,85123A,2313


In [31]:
# Q6. What is the description of the most popular stock?
# OPTION 1
# Look the stock code up directly in the stock_description table.
# The code is bound as a query parameter (the `?` placeholder) instead of
# being hard-coded inside the SQL text, so the same query works for any stock.

most_popular_stock_code = '85123A'  # top StockCode returned by Q5

query = """select description
           from stock_description
           where StockCode = ?"""

pd.read_sql(query, conn, params=(most_popular_stock_code,))

Unnamed: 0,Description
0,CREAM HANGING HEART T-LIGHT HOLDER


In [16]:
# Q6. What is the description of the most popular stock?
# OPTION 2
# Let SQL find the most popular stock itself: the subquery returns the
# StockCode with the most rows in online_retail_history, and the outer query
# fetches the matching row(s) from stock_description.

query = """select *
           from stock_description
           where StockCode in (select StockCode
                               from online_retail_history
                               group by StockCode
                               order by count(*) desc
                               limit 1
                              )
"""

pd.read_sql(query, conn)

# NOTE: stock_description holds 3952 rows but only 3905 distinct StockCodes,
# so a stock can have more than one description (some are just "?").
# In this sample the most popular stock comes back with a single description.

Unnamed: 0,StockCode,Description
0,85123A,CREAM HANGING HEART T-LIGHT HOLDER


In [17]:
# Q7. How many Stocks does the "stock description" table contain?
# (number of distinct StockCode values in stock_description)

query = """
          select count(distinct StockCode) as number_stocks
          from stock_description
"""

pd.read_sql(sql=query, con=conn)

# online_retail_history contains 4070 distinct stock codes (Q4), which is more
# than this table holds, so some purchased stocks have no description here

Unnamed: 0,number_stocks
0,3905


In [18]:
# For comparison: the number of distinct stock codes in online_retail_history.
# If the two counts differ, joining the tables on StockCode may leave
# some rows without a matching description.

query = """
          select count(distinct StockCode) as number_stocks
          from online_retail_history
"""

pd.read_sql(sql=query, con=conn)

Unnamed: 0,number_stocks
0,4070


In [19]:
# Q8. How many Stocks contain ? in the Description
# count(*) counts the stock_description rows whose Description contains a "?"

query = """select count(*) as number_question_marks
           from stock_description
           where Description like '%?%'
"""

pd.read_sql(sql=query, con=conn)

Unnamed: 0,number_question_marks
0,48


In [20]:
# Q9. Identify all the stocks that contain ? in the Description
# same filter as Q8, but return the matching rows instead of counting them

query = """select StockCode,
                  Description
           from stock_description
           where Description like '%?%' 
"""

pd.read_sql(sql=query, con=conn)

Unnamed: 0,StockCode,Description
0,16020C,?
1,16207B,?
2,21145,?
3,21232,?
4,21368,?
5,21427,?
6,21446,?
7,21591,?
8,21622,POSSIBLE DAMAGES OR LOST?
9,21877,?


In [21]:
# same search as Q9, but skip descriptions that are exactly "?" so that only
# descriptions containing other text as well are returned

query = """select StockCode,
                  Description
           from stock_description
           where Description like '%?%' and Description <> '?'
"""

pd.read_sql(sql=query, con=conn)


Unnamed: 0,StockCode,Description
0,21622,POSSIBLE DAMAGES OR LOST?
