# Database Setup

- Connect to database
- Create tables and schema
- Insert table data from csv files

## 0 | Import libraries

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine # allows table data to be written from pandas to SQL

## 1 | Connect to database

### Load the ipython-sql extension

In [2]:
# Load the SQL extension so the %sql / %%sql magics used in the cells below are available.
%load_ext sql

### Connect to database

In [3]:
# Open the connection that every later %sql / %%sql cell in this notebook uses.
# NOTE(review): the database password is hardcoded in this URL; consider supplying it
# from an environment variable before sharing or committing the notebook.
%sql postgresql://postgres:12345@localhost/ecomm_cleanse

## 2 | Create tables

In [4]:
%%sql

-- Schema for the raw CSV exports in ../data/raw: one table per csv file.
-- Remove any previous copies first so this cell can be re-run from a clean slate.
-- NOTE(review): the load cell in section 3 writes with if_exists='replace', which
-- recreates these tables from the DataFrames; confirm the column types below are
-- the ones that should end up in the database.
DROP TABLE IF EXISTS
    all_sessions,
    analytics,
    sales_report,
    sales_by_sku,
    products
CASCADE;

-- all_sessions.csv
CREATE TABLE IF NOT EXISTS all_sessions (
    fullvisitorid           TEXT,
    channelgrouping         TEXT,
    time                    TEXT,
    country                 TEXT,
    city                    TEXT,
    totaltransactionrevenue TEXT,
    transactions            TEXT,
    timeonsite              TEXT,
    pageviews               TEXT,
    sessionqualitydim       TEXT,
    date                    DATE,
    visitid                 TEXT,
    type                    TEXT,
    productrefundamount     REAL,
    productquantity         TEXT,
    productprice            TEXT,
    productrevenue          TEXT,
    productsku              TEXT,
    v2productname           TEXT,
    v2productcategory       TEXT,
    productvariant          TEXT,
    currencycode            TEXT,
    itemquantity            TEXT,
    itemrevenue             TEXT,
    transactionrevenue      TEXT,
    transactionid           TEXT,
    pagetitle               TEXT,
    searchkeyword           TEXT,
    pagepathlevel1          TEXT,
    ecommerceactiontype     TEXT,
    ecommerceactionstep     TEXT,
    ecommerceactionoption   TEXT
);

-- analytics.csv
CREATE TABLE IF NOT EXISTS analytics (
    visitnumber          TEXT,
    visitid              TEXT,
    visitstarttime       TEXT,
    date                 DATE,
    fullvisitorid        TEXT,
    userid               TEXT,
    channelgrouping      TEXT,
    socialengagementtype TEXT,
    unitssold            TEXT,
    pageviews            TEXT,
    timeonsite           TEXT,
    bounces              TEXT,
    revenue              TEXT,
    unitprice            TEXT
);

-- sales_report.csv
CREATE TABLE IF NOT EXISTS sales_report (
    productsku         TEXT,
    totalordered       TEXT,
    name               TEXT,
    stocklevel         TEXT,
    restockingleadtime TEXT,
    sentimentscore     TEXT,
    sentimentmagnitude TEXT,
    ratio              TEXT
);

-- sales_by_sku.csv
CREATE TABLE IF NOT EXISTS sales_by_sku (
    productsku   TEXT,
    totalordered TEXT
);

-- products.csv
CREATE TABLE IF NOT EXISTS products (
    sku                TEXT,
    name               TEXT,
    orderedquantity    TEXT,
    stocklevel         TEXT,
    restockingleadtime TEXT,
    sentimentscore     TEXT,
    sentimentmagnitude TEXT
);

 * postgresql://postgres:***@localhost/ecomm_cleanse
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.


[]

## 3 | Import data into tables

### Import raw csv data as DataFrame

In [5]:
# Raw CSV exports, listed in the same order as the names they are unpacked into below.
raw_csv_paths = [
    '../data/raw/all_sessions.csv',
    '../data/raw/analytics.csv',
    '../data/raw/products.csv',
    '../data/raw/sales_by_sku.csv',
    '../data/raw/sales_report.csv',
]

# dtype=str keeps every column as text so pandas does no type inference on the raw values.
all_sessions, analytics, products, sales_by_sku, sales_report = [
    pd.read_csv(csv_path, dtype=str) for csv_path in raw_csv_paths
]

In [6]:
# (rows, columns) of each raw DataFrame, in load order.
tuple(frame.shape for frame in (all_sessions, analytics, products, sales_by_sku, sales_report))

((15134, 32), (4301122, 14), (1092, 7), (462, 2), (454, 8))

### Fill database tables using dataframes

In [7]:

# Create the SQLAlchemy engine that pandas uses to write to the PostgreSQL database.
# NOTE(review): the password is hardcoded in this URL (the %sql connection cell has the
# same string); consider reading it from an environment variable before sharing.
alchemyEngine = create_engine('postgresql://postgres:12345@localhost/ecomm_cleanse')

# Table names and the DataFrames that populate them, paired by position.
table_names = ['all_sessions', 'analytics', 'products', 'sales_by_sku', 'sales_report']
dataframes = [all_sessions, analytics, products, sales_by_sku, sales_report]

# Rows sent per batch; without a chunksize pandas writes all rows of a table at once,
# which is heavy for the multi-million-row analytics table.
CHUNK_SIZE = 10_000

# engine.begin() runs all five loads in one transaction: it commits when the block
# completes, rolls back if any write raises, and always releases the connection.
# (Previously the connection stayed open whenever to_sql raised, because close() was
# only reached on success.)
with alchemyEngine.begin() as dbConnection:
    for table_name, dataframe in zip(table_names, dataframes):
        # NOTE(review): if_exists='replace' drops the table created in section 2 and
        # recreates it from the DataFrame, so the DATE / REAL column types declared
        # there are not kept (the frames were read with dtype=str). Left unchanged
        # because later notebooks may rely on the text columns; confirm before
        # switching to if_exists='append'.
        dataframe.to_sql(
            table_name,
            dbConnection,
            if_exists='replace',
            index=False,
            schema='public',
            chunksize=CHUNK_SIZE,
        )

### Test query for each table

In [18]:
# Test query: fetch a single row from all_sessions to confirm the table can be read.
%sql SELECT * FROM all_sessions LIMIT 1;

 * postgresql://postgres:***@localhost/ecomm_cleanse
1 rows affected.


fullvisitorid,channelgrouping,time,country,city,totaltransactionrevenue,transactions,timeonsite,pageviews,sessionqualitydim,date,visitid,type,productrefundamount,productquantity,productprice,productrevenue,productsku,v2productname,v2productcategory,productvariant,currencycode,itemquantity,itemrevenue,transactionrevenue,transactionid,pagetitle,searchkeyword,pagepathlevel1,ecommerceactiontype,ecommerceactionstep,ecommerceactionoption
2817722496551184128,Direct,122213,Taiwan,(not set),,,142,7,,20160913,1473757371,PAGE,,,2990000,,GGOEGAAX0074,Google 22 oz Water Bottle,Home/Drinkware/,(not set),USD,,,,,Drinkware,,/google+redesign/,0,1,


In [19]:
# Test query: fetch a single row from analytics to confirm the table can be read.
%sql SELECT * FROM analytics LIMIT 1;

 * postgresql://postgres:***@localhost/ecomm_cleanse
1 rows affected.


visitnumber,visitid,visitstarttime,date,fullvisitorid,userid,channelgrouping,socialengagementtype,unitssold,pageviews,timeonsite,bounces,revenue,unitprice
7,1498424366,1498424366,20170625,9444016982622091039,,Display,Not Socially Engaged,,1,,1,,8990000


In [20]:
# Test query: fetch a single row from products to confirm the table can be read.
%sql SELECT * FROM products LIMIT 1;

 * postgresql://postgres:***@localhost/ecomm_cleanse
1 rows affected.


sku,name,orderedquantity,stocklevel,restockingleadtime,sentimentscore,sentimentmagnitude
GGADFBSBKS42347,PC gaming speakers,0,100,1,,


In [23]:
# Test query: fetch a single row from sales_by_sku to confirm the table can be read.
%sql SELECT * FROM sales_by_sku LIMIT 1;

 * postgresql://postgres:***@localhost/ecomm_cleanse
1 rows affected.


productsku,totalordered
GGOEGOAQ012899,456


In [22]:
# Test query: fetch a single row from sales_report to confirm the table can be read.
%sql SELECT * FROM sales_report LIMIT 1;

 * postgresql://postgres:***@localhost/ecomm_cleanse
1 rows affected.


productsku,totalordered,name,stocklevel,restockingleadtime,sentimentscore,sentimentmagnitude,ratio
GGOEGAAX0081,42,Recycled Paper Journal Set,740,5,0.3,0.5,0.0567567567567567


## 4 | Create backup tables

- This cleaning and analysis will involve many future table alterations, so let's create a backup of each table in its original form. That way a table can easily be restored after a mistaken, irreversible alteration.

In [24]:
%%sql
-- Snapshot each freshly loaded table into a <table>_backup copy.
-- Old snapshots are removed first so re-running this cell refreshes every backup.
DROP TABLE IF EXISTS
    all_sessions_backup,
    analytics_backup,
    products_backup,
    sales_by_sku_backup,
    sales_report_backup;

-- "AS TABLE src" copies every row and column of src (same as SELECT * FROM src).
CREATE TABLE IF NOT EXISTS all_sessions_backup AS TABLE all_sessions;
CREATE TABLE IF NOT EXISTS analytics_backup    AS TABLE analytics;
CREATE TABLE IF NOT EXISTS products_backup     AS TABLE products;
CREATE TABLE IF NOT EXISTS sales_by_sku_backup AS TABLE sales_by_sku;
CREATE TABLE IF NOT EXISTS sales_report_backup AS TABLE sales_report;

 * postgresql://postgres:***@localhost/ecomm_cleanse
Done.
15134 rows affected.
Done.
4301122 rows affected.
Done.
1092 rows affected.
Done.
462 rows affected.
Done.
454 rows affected.


[]

## 5 | Conclusion + next steps

- Database created, connection established
- Tables created and filled with csv data

Continue to [02_data_cleaning.ipynb](./02_data_cleaning.ipynb)

### 