# Problem 1

This diagram shows the steps needed to produce the requested queries:

![Data Pipeline](assets/Problem_1_data_pipeline.png)

The following cells implement the diagram as SQL queries for a SQLite database.

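**Note**: Since the database is stored in a zipped archive, we need a wrapper that temporarily extracts the zip file so that the database can be loaded.
The wrapper is a context manager, so the temporary directory is removed as soon as the queries finish. Keep in mind that using a `sqlite3` connection in a `with` block only commits or rolls back the transaction; it does not close the connection, which requires an explicit `close()`.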

In [1]:
import pandas as pd
import sqlite3
import zipfile
import tempfile
import os

# Zip archive that holds the SQLite database (path relative to the working directory).
zip_path = 'mock_resq.db.zip'
# Database file name that is joined onto the extraction directory after unzipping.
db_name = 'mock_resq.db'

class TempDirWrapper:
    """Context manager that unpacks a zipped database into a temporary directory.

    Entering the context extracts the whole archive into a fresh temporary
    directory and returns the path of the database file inside it. Leaving
    the context deletes that directory together with everything extracted.

    Args:
        zip_path: Path of the zip archive that contains the database.
        db_name: Path of the database file relative to the archive root.

    Raises:
        FileNotFoundError: On entering, if the archive does not exist or does
            not contain ``db_name``.
        zipfile.BadZipFile: On entering, if ``zip_path`` is not a valid zip file.
    """

    def __init__(self, zip_path:str, db_name:str):
        self.zip_path = zip_path
        self.db_name = db_name
        self.temp_dir = None
        self.db_path = None

    def __enter__(self) -> str:
        self.temp_dir = tempfile.TemporaryDirectory()
        try:
            with zipfile.ZipFile(self.zip_path, 'r') as zip_ref:
                zip_ref.extractall(self.temp_dir.name)
            db_path = os.path.join(self.temp_dir.name, self.db_name)
            # Fail early: sqlite3.connect() would otherwise silently create an
            # empty database at a path that does not exist.
            if not os.path.isfile(db_path):
                raise FileNotFoundError(
                    f"'{self.db_name}' was not found in archive '{self.zip_path}'"
                )
            self.db_path = db_path
            return self.db_path
        except BaseException:
            # __exit__ is not invoked when __enter__ fails, so clean up here.
            self._cleanup()
            raise

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returning None lets any exception from the with-body propagate.
        self._cleanup()

    def _cleanup(self):
        """Delete the temporary directory (if any) and reset the stored paths."""
        if self.temp_dir is not None:
            self.temp_dir.cleanup()
        self.temp_dir = None
        self.db_path = None

In [2]:
# Top 10 providers ranked by the sum of their order sales.
query = """
SELECT
    orders.'providerId',
    SUM(orders.sales) AS sales_total
FROM orders
GROUP BY providerId
ORDER BY SUM(sales) DESC
LIMIT 10;
"""

with TempDirWrapper(zip_path=zip_path, db_name=db_name) as db_path:
    # `with sqlite3.connect(...)` only wraps a transaction and never closes the
    # connection, so close it explicitly before the temporary directory that
    # holds the database file is removed.
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(sql=query, con=conn)
    finally:
        conn.close()

df

Unnamed: 0,providerId,sales_total
0,7198110370745783236,10917800
1,8312310143652755348,7467750
2,8097235958083241788,2383700
3,3865474760205653333,2223400
4,8084884958338058541,1868140
5,4734853230275691017,1702100
6,5305286819167536850,1690500
7,1066258454353124935,1568100
8,7642201963087705313,1472000
9,4014236829817167297,1457000


In [3]:
# Number of orders per provider default offer type, most frequent first.
# The LEFT JOIN keeps orders whose provider has no match in `providers`.
query = """
SELECT 
    providers.'defaultOfferType' AS offer_type,
    COUNT(orders.'id') AS count_orders
FROM orders
LEFT JOIN providers ON providers.'id' = orders.'providerId'
GROUP BY offer_type
ORDER BY count_orders DESC;
"""

with TempDirWrapper(zip_path=zip_path, db_name=db_name) as db_path:
    # `with sqlite3.connect(...)` only wraps a transaction and never closes the
    # connection, so close it explicitly before the temporary directory that
    # holds the database file is removed.
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(sql=query, con=conn)
    finally:
        conn.close()

df

Unnamed: 0,offer_type,count_orders
0,meal,219764
1,snack,49861
2,grocery-bag,27192
3,dessert,1910
4,ingredients,1091
5,flowers,153


In [4]:
# Classify every user that placed at least two orders by the number of days
# between their first and their second order:
#   M0  -> at most 30 days
#   M1  -> more than 30 and fewer than 62 days
#   M2+ -> everything else
query ="""
-- Number each user's orders chronologically (1 = first order).
DROP TABLE IF EXISTS tempOrder;
CREATE TEMP TABLE tempOrder AS
SELECT *,
    ROW_NUMBER() OVER (PARTITION BY userId ORDER BY createdAt) AS order_row
FROM orders;

-- Keep only the date part so that differences are counted in days.
UPDATE tempOrder
SET createdAt = DATE(createdAt);

-- Attach each user's first order date to all of their orders.
DROP TABLE IF EXISTS tempOrderCount;
CREATE TEMP TABLE tempOrderCount AS
SELECT *,
    MIN(createdAt) OVER (PARTITION BY userId) AS first_order_date
FROM tempOrder;

-- Keep the second order of each user and its distance to the first one.
DROP TABLE IF EXISTS tempOrder;
CREATE TEMP TABLE tempOrder AS
SELECT *,
    JULIANDAY(createdAt) - JULIANDAY(first_order_date) AS days_difference
FROM tempOrderCount
WHERE (order_row = 2);

-- Map the distance to a retention bucket.
DROP TABLE IF EXISTS userRetention;
CREATE TEMP TABLE userRetention AS
SELECT userId,
    (
    CASE
        WHEN (days_difference <= 30) THEN 'M0'
        WHEN (days_difference > 30) AND (days_difference < 62) THEN 'M1'
        ELSE 'M2+'
    END
    ) AS retention
FROM tempOrder;

"""

with TempDirWrapper(zip_path=zip_path, db_name=db_name) as db_path:
    # `with sqlite3.connect(...)` only wraps a transaction and never closes the
    # connection, so close it explicitly before the temporary directory that
    # holds the database file is removed.
    conn = sqlite3.connect(db_path)
    try:
        conn.executescript(query)
        conn.commit()
        print("Customer Retention classification completed.")

        # TEMP tables live only as long as this connection, so read the
        # result before closing it.
        df = pd.read_sql_query(sql="SELECT * FROM userRetention", con=conn)
        print(df)
    except sqlite3.Error as e:
        print(f"An error occurred: {e}")
        # Re-raise: otherwise the line below would silently run against a
        # stale `df` left over from an earlier cell.
        raise
    finally:
        conn.close()

df.retention.value_counts()

Customer Retention classification completed.
                    userId retention
0          219034330643057       M2+
1          406087302631582        M0
2          472654236140424       M2+
3          494869758008189       M2+
4          857319303011182        M1
...                    ...       ...
56742  9223090445069821952        M1
56743  9223097207332177627        M1
56744  9223113765943846674       M2+
56745  9223115497170789549        M0
56746  9223201399781576886       M2+

[56747 rows x 2 columns]


retention
M2+    24244
M0     22475
M1     10028
Name: count, dtype: int64