https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html

data types https://www.w3schools.com/sql/sql_dates.asp

Kaggle SQL https://www.kaggle.com/jsab16/sql-and-python-analysis-on-world-indicators

DATE - format YYYY-MM-DD

I've got the raw data in some csv, needs some cleaning. Place in pandas dataframe
Create tables using sqlalchemy and insert data.

In [1]:
import pandas as pd
from sqlalchemy import create_engine, inspect
from sqlalchemy import MetaData, Column, Table, ForeignKey
from sqlalchemy import Integer, String, Date, Float
from datetime import datetime, timedelta

We will need to create datetimes

In [2]:
now = datetime.now()
print(now.strftime("%Y-%m-%d"))

2018-01-01


In [3]:
(now - timedelta(days=5))

datetime.datetime(2017, 12, 27, 12, 16, 18, 372940)

Lets create the database

In [4]:
engine = create_engine('sqlite:///data.db', echo=True)

In [5]:
conn = engine.connect()

2018-01-01 12:16:18,433 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2018-01-01 12:16:18,477 INFO sqlalchemy.engine.base.Engine ()
2018-01-01 12:16:18,484 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2018-01-01 12:16:18,486 INFO sqlalchemy.engine.base.Engine ()


In [6]:
metadata = MetaData(bind=engine)
 
product_table = Table('product', metadata,
                    Column('product_id', Integer, primary_key=True),
                    Column('name', String(128)),
                    Column('rrp', Float),
                    Column('available_from', Date),
                    )
 
orders_table = Table('orders', metadata,
                     Column('order_id', Integer, primary_key=True),
                     Column('product_id', Integer, ForeignKey('product.product_id')),
                     Column('quantity', Integer),
                     Column('order_price', Float),
                     Column('dispatch_date', Date),            
                    )
 
# create tables in database
metadata.create_all()

2018-01-01 12:16:18,582 INFO sqlalchemy.engine.base.Engine PRAGMA table_info("product")
2018-01-01 12:16:18,584 INFO sqlalchemy.engine.base.Engine ()
2018-01-01 12:16:18,591 INFO sqlalchemy.engine.base.Engine PRAGMA table_info("orders")
2018-01-01 12:16:18,596 INFO sqlalchemy.engine.base.Engine ()
2018-01-01 12:16:18,599 INFO sqlalchemy.engine.base.Engine 
CREATE TABLE product (
	product_id INTEGER NOT NULL, 
	name VARCHAR(128), 
	rrp FLOAT, 
	available_from DATE, 
	PRIMARY KEY (product_id)
)


2018-01-01 12:16:18,604 INFO sqlalchemy.engine.base.Engine ()
2018-01-01 12:16:18,617 INFO sqlalchemy.engine.base.Engine COMMIT
2018-01-01 12:16:18,623 INFO sqlalchemy.engine.base.Engine 
CREATE TABLE orders (
	order_id INTEGER NOT NULL, 
	product_id INTEGER, 
	quantity INTEGER, 
	order_price FLOAT, 
	dispatch_date DATE, 
	PRIMARY KEY (order_id), 
	FOREIGN KEY(product_id) REFERENCES product (product_id)
)


2018-01-01 12:16:18,626 INFO sqlalchemy.engine.base.Engine ()
2018-01-01 12:16:18,633 INF

All looks good

### Product
Need to do some formatting on csv data

In [7]:
product = pd.read_csv('product.csv')
product

Unnamed: 0,product_id,name,rrp,available_from_text
0,101,\t Bayesian Methods for Nonlinear Classificati...,94.95,\t (last thursday)
1,102,\t (next year) in Review (preorder),21.95,\t (next year)
2,103,\t Learn Python in Ten Minutes,2.15,(three months ago)
3,104,\t sports almanac (1999-2049),3.38,\t (2 years ago)
4,105,\t finance for dummies,84.99,\t (1 year ago)


Lets turn available_from_text into available_from in datetime strings

In [8]:
product['available_from'] = "NULL"  # Create the available_from column

In [9]:
product.at[0, 'available_from'] = (now - timedelta(days=6)).strftime("%Y-%m-%d")
product.at[1, 'available_from'] = (now + timedelta(days=365)).strftime("%Y-%m-%d")
product.at[2, 'available_from'] = (now - timedelta(days=3*30)).strftime("%Y-%m-%d")
product.at[3, 'available_from'] = (now - timedelta(days=365*2)).strftime("%Y-%m-%d")
product.at[4, 'available_from'] = (now - timedelta(days=365*1)).strftime("%Y-%m-%d")

In [10]:
product

Unnamed: 0,product_id,name,rrp,available_from_text,available_from
0,101,\t Bayesian Methods for Nonlinear Classificati...,94.95,\t (last thursday),2017-12-26
1,102,\t (next year) in Review (preorder),21.95,\t (next year),2019-01-01
2,103,\t Learn Python in Ten Minutes,2.15,(three months ago),2017-10-03
3,104,\t sports almanac (1999-2049),3.38,\t (2 years ago),2016-01-02
4,105,\t finance for dummies,84.99,\t (1 year ago),2017-01-01


In [11]:
product.columns = [i.strip('\t') for i in product.columns.tolist()]

In [12]:
product = product.drop(columns='available_from_text')

In [13]:
product['name'] = product['name'].apply(lambda x: x.strip('\t'))

In [14]:
product

Unnamed: 0,product_id,name,rrp,available_from
0,101,Bayesian Methods for Nonlinear Classification...,94.95,2017-12-26
1,102,(next year) in Review (preorder),21.95,2019-01-01
2,103,Learn Python in Ten Minutes,2.15,2017-10-03
3,104,sports almanac (1999-2049),3.38,2016-01-02
4,105,finance for dummies,84.99,2017-01-01


Insert product into database

In [15]:
product.to_sql(con=engine, name="product", if_exists='append', index=False)

2018-01-01 12:16:18,845 INFO sqlalchemy.engine.base.Engine PRAGMA table_info("product")
2018-01-01 12:16:18,853 INFO sqlalchemy.engine.base.Engine ()
2018-01-01 12:16:18,858 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2018-01-01 12:16:18,863 INFO sqlalchemy.engine.base.Engine INSERT INTO product (product_id, name, rrp, available_from) VALUES (?, ?, ?, ?)
2018-01-01 12:16:18,865 INFO sqlalchemy.engine.base.Engine ((101, ' Bayesian Methods for Nonlinear Classification and Regression', 94.95, '2017-12-26'), (102, ' (next year) in Review (preorder)', 21.95, '2019-01-01'), (103, ' Learn Python in Ten Minutes', 2.15, '2017-10-03'), (104, ' sports almanac (1999-2049)', 3.38, '2016-01-02'), (105, ' finance for dummies', 84.99, '2017-01-01'))
2018-01-01 12:16:18,871 INFO sqlalchemy.engine.base.Engine COMMIT


### Orders
Again need to do some cleaning

In [16]:
orders = pd.read_csv('orders.csv')

For some reason, the indexes have unwanted whitespace so strip out

In [17]:
orders.columns = [i.strip() for i in orders.columns.tolist()]

In [18]:
orders

Unnamed: 0,order_id,product_id,quantity,order_price,dispatch_date_text
0,1000,101,1,90.0,(two months ago)
1,1001,103,1,1.15,(40 days ago)
2,1002,101,10,90.0,(11 months ago)
3,1003,104,11,3.38,(6 months ago)
4,1004,105,11,501.33,(two years ago)


In [19]:
orders['dispatch_date'] = "NULL"  # Create the available_from column

In [20]:
orders.at[0, 'dispatch_date'] = (now - timedelta(days=2*30)).strftime("%Y-%m-%d")
orders.at[1, 'dispatch_date'] = (now - timedelta(days=40)).strftime("%Y-%m-%d")
orders.at[2, 'dispatch_date'] = (now - timedelta(days=11*30)).strftime("%Y-%m-%d")
orders.at[3, 'dispatch_date'] = (now - timedelta(days=6*30)).strftime("%Y-%m-%d")
orders.at[4, 'dispatch_date'] = (now - timedelta(days=2*365)).strftime("%Y-%m-%d")

In [21]:
orders = orders.drop(columns='dispatch_date_text')
orders

Unnamed: 0,order_id,product_id,quantity,order_price,dispatch_date
0,1000,101,1,90.0,2017-11-02
1,1001,103,1,1.15,2017-11-22
2,1002,101,10,90.0,2017-02-05
3,1003,104,11,3.38,2017-07-05
4,1004,105,11,501.33,2016-01-02


In [22]:
orders.to_sql(con=engine, name="orders", if_exists='append', index=False)

2018-01-01 12:16:19,089 INFO sqlalchemy.engine.base.Engine PRAGMA table_info("orders")
2018-01-01 12:16:19,093 INFO sqlalchemy.engine.base.Engine ()
2018-01-01 12:16:19,096 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2018-01-01 12:16:19,099 INFO sqlalchemy.engine.base.Engine INSERT INTO orders (order_id, product_id, quantity, order_price, dispatch_date) VALUES (?, ?, ?, ?, ?)
2018-01-01 12:16:19,101 INFO sqlalchemy.engine.base.Engine ((1000, 101, 1, 90.0, '2017-11-02'), (1001, 103, 1, 1.15, '2017-11-22'), (1002, 101, 10, 90.0, '2017-02-05'), (1003, 104, 11, 3.38, '2017-07-05'), (1004, 105, 11, 501.33, '2016-01-02'))
2018-01-01 12:16:19,105 INFO sqlalchemy.engine.base.Engine COMMIT


### Pandas dataframe analysis
Task is to find books that have sold fewer than 10 copies in the last year, excluding books that have been available for less than a month. Create copies as dataframes and set DATE to datetime 

In [23]:
orders_df = orders # .set_index('order_id')
orders_df['dispatch_date'] = pd.to_datetime(orders_df['dispatch_date'])
product_df = product #.set_index('product_id')
product_df['available_from'] = pd.to_datetime(product_df['available_from'])

In [24]:
orders_df

Unnamed: 0,order_id,product_id,quantity,order_price,dispatch_date
0,1000,101,1,90.0,2017-11-02
1,1001,103,1,1.15,2017-11-22
2,1002,101,10,90.0,2017-02-05
3,1003,104,11,3.38,2017-07-05
4,1004,105,11,501.33,2016-01-02


In [25]:
product_df

Unnamed: 0,product_id,name,rrp,available_from
0,101,Bayesian Methods for Nonlinear Classification...,94.95,2017-12-26
1,102,(next year) in Review (preorder),21.95,2019-01-01
2,103,Learn Python in Ten Minutes,2.15,2017-10-03
3,104,sports almanac (1999-2049),3.38,2016-01-02
4,105,finance for dummies,84.99,2017-01-01


Lets join these two tables on product_id

In [26]:
joined_df = orders_df.join(product_df.set_index('product_id'), on='product_id')
joined_df

Unnamed: 0,order_id,product_id,quantity,order_price,dispatch_date,name,rrp,available_from
0,1000,101,1,90.0,2017-11-02,Bayesian Methods for Nonlinear Classification...,94.95,2017-12-26
1,1001,103,1,1.15,2017-11-22,Learn Python in Ten Minutes,2.15,2017-10-03
2,1002,101,10,90.0,2017-02-05,Bayesian Methods for Nonlinear Classification...,94.95,2017-12-26
3,1003,104,11,3.38,2017-07-05,sports almanac (1999-2049),3.38,2016-01-02
4,1004,105,11,501.33,2016-01-02,finance for dummies,84.99,2017-01-01


Only interested in last year but exclude books available for less than 30 days

In [27]:
filtered_df = joined_df[(joined_df['dispatch_date'] > now-timedelta(days=365)) & (joined_df['available_from'] < now-timedelta(days=30))]
filtered_df

Unnamed: 0,order_id,product_id,quantity,order_price,dispatch_date,name,rrp,available_from
1,1001,103,1,1.15,2017-11-22,Learn Python in Ten Minutes,2.15,2017-10-03
3,1003,104,11,3.38,2017-07-05,sports almanac (1999-2049),3.38,2016-01-02


Since we may have multiple orders in the same year for a product, lets to a groupby to be sure we sum the number sold

In [28]:
summed_orders_ds = filtered_df.groupby(['product_id'])['quantity'].sum()
summed_orders_ds

product_id
103     1
104    11
Name: quantity, dtype: int64

Finally we only want products with less than 10 orders

In [29]:
summed_orders_ds[summed_orders_ds < 10]

product_id
103    1
Name: quantity, dtype: int64

So only product 103 qualifies

### Database analysis
Lets repeat this analysis process using SQL. 
Get equivalent of suitable_products_df

In [30]:
sql = """
SELECT *
FROM product
INNER JOIN orders ON orders.product_id=product.product_id;
"""
print(sql)
pd.read_sql_query(sql, conn)


SELECT *
FROM product
INNER JOIN orders ON orders.product_id=product.product_id;

2018-01-01 12:16:19,281 INFO sqlalchemy.engine.base.Engine 
SELECT *
FROM product
INNER JOIN orders ON orders.product_id=product.product_id;

2018-01-01 12:16:19,283 INFO sqlalchemy.engine.base.Engine ()


Unnamed: 0,product_id,name,rrp,available_from,order_id,product_id.1,quantity,order_price,dispatch_date
0,101,Bayesian Methods for Nonlinear Classification...,94.95,2017-12-26,1000,101,1,90.0,2017-11-02
1,103,Learn Python in Ten Minutes,2.15,2017-10-03,1001,103,1,1.15,2017-11-22
2,101,Bayesian Methods for Nonlinear Classification...,94.95,2017-12-26,1002,101,10,90.0,2017-02-05
3,104,sports almanac (1999-2049),3.38,2016-01-02,1003,104,11,3.38,2017-07-05
4,105,finance for dummies,84.99,2017-01-01,1004,105,11,501.33,2016-01-02


SOLUTION

In [31]:
sql = """
SELECT orders.product_id, SUM(orders.quantity)
FROM orders
INNER JOIN product ON orders.product_id=product.product_id
WHERE product.available_from < '2017-11-27'
AND orders.dispatch_date > '2016-12-27'
GROUP BY orders.product_id
HAVING SUM(orders.quantity) < 10
;
"""
print(sql)
pd.read_sql_query(sql, conn)


SELECT orders.product_id, SUM(orders.quantity)
FROM orders
INNER JOIN product ON orders.product_id=product.product_id
WHERE product.available_from < '2017-11-27'
AND orders.dispatch_date > '2016-12-27'
GROUP BY orders.product_id
HAVING SUM(orders.quantity) < 10
;

2018-01-01 12:16:19,312 INFO sqlalchemy.engine.base.Engine 
SELECT orders.product_id, SUM(orders.quantity)
FROM orders
INNER JOIN product ON orders.product_id=product.product_id
WHERE product.available_from < '2017-11-27'
AND orders.dispatch_date > '2016-12-27'
GROUP BY orders.product_id
HAVING SUM(orders.quantity) < 10
;

2018-01-01 12:16:19,315 INFO sqlalchemy.engine.base.Engine ()


Unnamed: 0,product_id,SUM(orders.quantity)
0,103,1
