**Goals**:
* connect to the database
* query the tables in the database
* **Activity**: basic sql queries to explore the data
* create a dataset which contains the descriptions of the stock

In [1]:
# import the libraries needed

import pandas as pd
import psycopg2

### Conect to the database

In [2]:
# connect to database

dbname = "dev"
host = "redshift-cluster-1.c9gt5btzchps.eu-central-1.redshift.amazonaws.com"
port = 5439
user = ""
password = ""

# connect to redshift

conn = psycopg2.connect(
        dbname=dbname, host=host, port=port, user=user, password=password
    )

### Query the Tables 

Table 1: Online Retail History

In [3]:
query = """select *
            from bootcamp.online_transactions"""

query

'select *\n            from bootcamp.online_transactions'

In [4]:
online_retail = pd.read_sql(query, conn)



  online_retail = pd.read_sql(query, conn)


In [5]:
# write query to select online retail history data

query = """select *
          from bootcamp.online_transactions
"""

In [6]:
# use the read_sql function in pandas to read a query into a DataFrame

online_retail = pd.read_sql(query, conn)


  online_retail = pd.read_sql(query, conn)


In [7]:
# preview the data

online_retail.head(n = 3)

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country
0,536365,84029G,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom
1,536366,22633,6,2010-12-01 08:28:00,1.85,u1785,United Kingdom
2,536368,22912,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom


In [8]:
# structure of the data

online_retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   invoice       541910 non-null  object 
 1   stock_code    541910 non-null  object 
 2   quantity      541910 non-null  int64  
 3   invoice_date  541910 non-null  object 
 4   price         541910 non-null  float64
 5   customer_id   541910 non-null  object 
 6   country       541910 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 28.9+ MB


In [9]:
# size of the data

online_retail.shape

# 540,000 rows of of data and 7 columns

(541910, 7)

Table 2: Stock Description

In [10]:
query = """select *
          from bootcamp.stock_description
"""

stock_description = pd.read_sql(query, conn)

  stock_description = pd.read_sql(query, conn)


In [11]:
# preview the data

stock_description.head(n = 3)

Unnamed: 0,stock_code,description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER


In [12]:
# structure of the data

stock_description.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3952 entries, 0 to 3951
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   stock_code   3952 non-null   object
 1   description  3952 non-null   object
dtypes: object(2)
memory usage: 61.9+ KB


In [13]:
# size of the data

stock_description.shape

# 4,000 rows of of data and 2 columns

(3952, 2)

### Optional Activity: Basic Exploration of the data using SQL 

In [14]:
# Q1. How many rows (total number of items) does the online retail history dataset contain?

# write the query

query = """select count(*) as number_rows
          from bootcamp.online_transactions
        """

pd.read_sql(query, conn)

  pd.read_sql(query, conn)


Unnamed: 0,number_rows
0,541910


In [15]:
# Q2. How many customers does the dataset contain?

# write the query

query = """select count(distinct customer_id) as number_customers
          from bootcamp.online_transactions
"""

pd.read_sql(query, conn)

  pd.read_sql(query, conn)


Unnamed: 0,number_customers
0,4373


In [16]:
# Q3. How many invoices does the dataset contain?

# write the query

query = """select count(distinct Invoice) as number_invoices
          from bootcamp.online_transactions
"""

pd.read_sql(query, conn)

  pd.read_sql(query, conn)


Unnamed: 0,number_invoices
0,25900


In [17]:
# Q4. When was the first and last invoice?

# write the query

query = """select min(invoice_date) as min_invoice_date,
                  max(invoice_date) as max_invoice_date
          from bootcamp.online_transactions
"""

pd.read_sql(query, conn)

# We have one year of invoices - ranging from 1st Dec 2010 to 9th Dec 20111

  pd.read_sql(query, conn)


Unnamed: 0,min_invoice_date,max_invoice_date
0,2010-12-01 08:26:00,2011-12-09 12:50:00


In [18]:
# Q5. How many invoices did we have per month? 

query = """select cast(invoice_date as Datetime) as invoice_month,
                  count(*) as number_invoices
           from bootcamp.online_transactions
           group by invoice_month
        """

pd.read_sql(query, conn)

  pd.read_sql(query, conn)


Unnamed: 0,invoice_month,number_invoices
0,2010-12-01 09:02:00,16
1,2010-12-01 09:32:00,18
2,2010-12-01 09:37:00,19
3,2010-12-01 09:45:00,12
4,2010-12-01 09:59:00,14
...,...,...
23255,2011-12-02 12:17:00,1
23256,2011-12-05 10:34:00,1
23257,2011-12-05 11:36:00,1
23258,2011-12-06 12:23:00,1


In [19]:
# Q6. How many different types of stocks did the customers purchase?

query = """select count(distinct stock_code) as number_stocks
          from bootcamp.online_transactions
"""

pd.read_sql(query, conn)

  pd.read_sql(query, conn)


Unnamed: 0,number_stocks
0,4070


In [20]:
# Q7. What was the most popular stock?

query = """select stock_code,
                  count(*) as number_stocks
          from bootcamp.online_transactions
          group by 1
          order by 2 desc
"""

pd.read_sql(query, conn).head(10)

  pd.read_sql(query, conn).head(10)


Unnamed: 0,stock_code,number_stocks
0,85123A,2313
1,22423,2203
2,85099B,2159
3,47566,1727
4,20725,1639
5,84879,1502
6,22720,1477
7,22197,1476
8,21212,1385
9,20727,1350


In [21]:
query = """select stock_code,
                  count(*)
           from bootcamp.online_transactions
           group by stock_code
           order by count(*) desc
           limit 1
"""
pd.read_sql(query, conn)

  pd.read_sql(query, conn)


Unnamed: 0,stock_code,count
0,85123A,2313


In [22]:
# Q8. What is the description of the most popular stock?
# Query the table stock_description

query = """select *
           from bootcamp.stock_description
           where stock_code in (select stock_code
                              from (select stock_code,
                                    count(*) as number_stocks
                              from bootcamp.online_transactions
                              group by 1
                              order by 2 desc)
                              limit 1
                              )
"""

pd.read_sql(query, conn)

# The most popular stock has two descriptions - one being a ?

  pd.read_sql(query, conn)


Unnamed: 0,stock_code,description
0,85123A,CREAM HANGING HEART T-LIGHT HOLDER


In [23]:
# Q9. How many Stocks have ? as the Description 

query = """select count(*) as number_question_marks
           from bootcamp.stock_description
           where Description = '?'
"""

pd.read_sql(query, conn)

  pd.read_sql(query, conn)


Unnamed: 0,number_question_marks
0,47


In [24]:
# Q10. How many Stocks does the "stock description" dataset contain?

query = """
          select count(distinct stock_code) as number_stocks
          from bootcamp.stock_description
"""

pd.read_sql(query, conn)

# We identified 4070 stocks in the dataset

  pd.read_sql(query, conn)


Unnamed: 0,number_stocks
0,3905


In [25]:
query = """
          select count(distinct stock_code) as number_stocks
          from bootcamp.online_transactions
"""

pd.read_sql(query, conn)

  pd.read_sql(query, conn)


Unnamed: 0,number_stocks
0,4070


### Aggregate the Online Retail History and Stock Description Dataset

In [26]:
query = """
        select t1.*,
               t2.Description
        from bootcamp.online_transactions as t1
        /* remove stocks with ? as the description */
        left join (select *
                   from bootcamp.stock_description
                   where description <> '?') as t2 on t1.stock_code = t2.stock_code
    
"""

online_history_w_description = pd.read_sql(query, conn)
online_history_w_description.head()

  online_history_w_description = pd.read_sql(query, conn)


Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
0,536365,84406B,8,2010-12-01 08:26:00,2.75,u1785,United Kingdom,CREAM CUPID HEARTS COAT HANGER
1,536365,21730,6,2010-12-01 08:26:00,4.25,u1785,United Kingdom,GLASS STAR FROSTED T-LIGHT HOLDER
2,536368,22913,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom,RED COAT RACK PARIS FASHION
3,536367,22745,6,2010-12-01 08:34:00,2.1,u13047,United Kingdom,POPPY'S PLAYHOUSE BEDROOM
4,536367,84969,6,2010-12-01 08:34:00,4.25,u13047,United Kingdom,BOX OF 6 ASSORTED COLOUR TEASPOONS


In [27]:
# check shape of the data

online_history_w_description.shape

# 54,000 rows of data with 8 columns 

(541910, 8)

In [28]:
# confirm the shape of the data has the same number of rows as the online retail history dataset

online_retail.shape

(541910, 7)

In [30]:
# confirm there are no ? in the dataset

online_history_w_description[online_history_w_description.description == '?']

# Expected output: Blank DataFrame

Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description


In [31]:
online_history_w_description.describe()

Unnamed: 0,quantity,price
count,541910.0,541910.0
mean,9.552234,4.611138
std,218.080957,96.759765
min,-80995.0,-11062.06
25%,1.0,1.25
50%,3.0,2.08
75%,10.0,4.13
max,80995.0,38970.0


In [32]:
query = """
        select t1.*,
               t2.Description
        from bootcamp.online_transactions as t1
        /* remove stocks with ? as the description */
        left join (select *
                   from bootcamp.stock_description
                   where description <> '?') as t2 on t1.stock_code = t2.stock_code
         /*known data error where items have negative quantity*/
         where quantity > 0
"""

online_history_w_description = pd.read_sql(query, conn)
online_history_w_description.head()

  online_history_w_description = pd.read_sql(query, conn)


Unnamed: 0,invoice,stock_code,quantity,invoice_date,price,customer_id,country,description
0,536365,85123A,6,2010-12-01 08:26:00,2.55,u1785,United Kingdom,CREAM HANGING HEART T-LIGHT HOLDER
1,536365,84029E,6,2010-12-01 08:26:00,3.39,u1785,United Kingdom,RED WOOLLY HOTTIE WHITE HEART.
2,536366,22632,6,2010-12-01 08:28:00,1.85,u1785,United Kingdom,HAND WARMER RED POLKA DOT
3,536368,22914,3,2010-12-01 08:34:00,4.95,u13047,United Kingdom,BLUE COAT RACK PARIS FASHION
4,536367,22749,8,2010-12-01 08:34:00,3.75,u13047,United Kingdom,FELTCRAFT PRINCESS CHARLOTTE DOLL


In [33]:
# check shape of data

online_history_w_description.shape

(531286, 8)

In [None]:
# close any connections to the DB

conn.close()