# Grouping Data with SQL

In [1]:
import sqlite3
import pandas as pd

In [2]:
conn = sqlite3.Connection('data.sqlite')

In [4]:
q= """SELECT country,
COUNT(*)
FROM customers
GROUP BY country;
"""

pd.read_sql(q,conn).head(10)

Unnamed: 0,country,COUNT(*)
0,Australia,5
1,Austria,2
2,Belgium,2
3,Canada,3
4,Denmark,2
5,Finland,3
6,France,12
7,Germany,13
8,Hong Kong,1
9,Ireland,2


In [7]:
#WE can also do the above using an alternative syntax

q1="""
SELECT country,COUNT(*)
FROM customers
GROUP BY 1;"""
pd.read_sql(q1,conn).head(10)

Unnamed: 0,country,COUNT(*)
0,Australia,5
1,Austria,2
2,Belgium,2
3,Canada,3
4,Denmark,2
5,Finland,3
6,France,12
7,Germany,13
8,Hong Kong,1
9,Ireland,2


### Aliasing

In [8]:
q= """
SELECT country,COUNT(*) AS customer_count
FROM customers
GROUP BY country;"""

pd.read_sql(q,conn).head(10)

Unnamed: 0,country,customer_count
0,Australia,5
1,Austria,2
2,Belgium,2
3,Canada,3
4,Denmark,2
5,Finland,3
6,France,12
7,Germany,13
8,Hong Kong,1
9,Ireland,2


Other aggregation include MIN(),SUM(),SUM(),AVG() which can be used to perform summary statistics

In [10]:
q="""
SELECT
      customerNumber,
      COUNT(*) AS number_payments,
      MIN(amount) AS min_purchase,
      MAX(amount) AS max_purchase,
      AVG(amount) AS avg_purchase,
      SUM(amount) AS total_spent
FROM payments
GROUP BY customerNumber;"""

pd.read_sql(q,conn)

Unnamed: 0,customerNumber,number_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,103,3,14571.44,6066.78,7438.120000,22314.36
1,112,3,14191.12,33347.88,26726.993333,80180.98
2,114,4,44894.74,82261.22,45146.267500,180585.07
3,119,3,19501.82,49523.67,38983.226667,116949.68
4,121,4,1491.38,50218.95,26056.197500,104224.79
...,...,...,...,...,...,...
93,486,3,25833.14,5899.38,25908.863333,77726.59
94,487,2,12573.28,29997.09,21285.185000,42570.37
95,489,2,22275.73,7310.42,14793.075000,29586.15
96,495,2,59265.14,6276.60,32770.870000,65541.74


### Filtered Payment Summary Statistics with `WHERE`

we can use the WHERE to filter the stistics anin this case we can decide to filter the data by obtaing only data which is in 2004 using the `strftime`

In [11]:
q = """
SELECT
    customerNumber,
    COUNT(*) AS number_payments,
    MIN(amount) AS min_purchase,
    MAX(amount) AS max_purchase,
    AVG(amount) AS avg_purchase,
    SUM(amount) AS total_spent
FROM payments
WHERE strftime('%Y', paymentDate) = '2004'
GROUP BY customerNumber
;
"""
pd.read_sql(q, conn)

Unnamed: 0,customerNumber,number_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,103,2,1676.14,6066.78,3871.460,7742.92
1,112,2,14191.12,33347.88,23769.500,47539.00
2,114,2,44894.74,82261.22,63577.980,127155.96
3,119,2,19501.82,47924.19,33713.005,67426.01
4,121,2,17876.32,34638.14,26257.230,52514.46
...,...,...,...,...,...,...
83,486,2,45994.07,5899.38,25946.725,51893.45
84,487,1,12573.28,12573.28,12573.280,12573.28
85,489,1,7310.42,7310.42,7310.420,7310.42
86,495,1,6276.60,6276.60,6276.600,6276.60


## The `HAVING` Clause

Finally, we can also filter our aggregated views with the `HAVING` clause. The `HAVING` clause works similarly to the `WHERE` clause, except it is used to filter data selections on conditions **after** the `GROUP BY` clause

In [12]:
q = """
SELECT
    customerNumber,
    COUNT(*) AS number_payments,
    MIN(amount) AS min_purchase,
    MAX(amount) AS max_purchase,
    AVG(amount) AS avg_purchase,
    SUM(amount) AS total_spent
FROM payments
GROUP BY customerNumber
HAVING avg_purchase > 50000
;
"""
pd.read_sql(q, conn)

Unnamed: 0,customerNumber,number_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,124,9,101244.59,85410.87,64909.804444,584188.24
1,141,13,116208.4,65071.26,55056.844615,715738.98
2,239,1,80375.24,80375.24,80375.24,80375.24
3,298,2,47375.92,61402.0,54388.96,108777.92
4,321,2,46781.66,85559.12,66170.39,132340.78
5,450,1,59551.38,59551.38,59551.38,59551.38


Difference between the `WHERE` and the `HAVING` is that where is used before the `GROUP BY` and the `HAVING` is used after

An example of using the both of them is:

In [13]:
q = """
SELECT
    customerNumber,
    COUNT(*) AS number_payments,
    MIN(amount) AS min_purchase,
    MAX(amount) AS max_purchase,
    AVG(amount) AS avg_purchase,
    SUM(amount) AS total_spent
FROM payments
WHERE amount > 50000
GROUP BY customerNumber
HAVING number_payments >= 2
;
"""
pd.read_sql(q, conn)

Unnamed: 0,customerNumber,number_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,103,3,14571.44,6066.78,7438.120000,22314.36
1,112,3,14191.12,33347.88,26726.993333,80180.98
2,114,4,44894.74,82261.22,45146.267500,180585.07
3,119,3,19501.82,49523.67,38983.226667,116949.68
4,121,4,1491.38,50218.95,26056.197500,104224.79
...,...,...,...,...,...,...
89,486,3,25833.14,5899.38,25908.863333,77726.59
90,487,2,12573.28,29997.09,21285.185000,42570.37
91,489,2,22275.73,7310.42,14793.075000,29586.15
92,495,2,59265.14,6276.60,32770.870000,65541.74


In [14]:
q = """
SELECT
    customerNumber,
    COUNT(*) AS number_payments,
    MIN(amount) AS min_purchase,
    MAX(amount) AS max_purchase,
    AVG(amount) AS avg_purchase,
    SUM(amount) AS total_spent
FROM payments
WHERE amount > 50000
GROUP BY customerNumber
HAVING number_payments >= 2
ORDER BY total_spent
LIMIT 1
;
"""
pd.read_sql(q, conn)

Unnamed: 0,customerNumber,number_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,219,2,3452.75,4465.85,3959.3,7918.6
