# Database Setup

- Connect to database
- Create tables and schema
- Insert table data from csv files

## 0 | Import libraries

In [7]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine # allows table data to be written from pandas to SQL

## 1 | Connect to database

### Load ipython-sql extension

In [1]:
# Load the ipython-sql extension; the %sql / %%sql magics used in later cells come from it
%load_ext sql

### Connect to database

In [2]:
# Open the connection used by every later %sql / %%sql cell (local PostgreSQL, database ecomm_cleanse).
# NOTE(review): the password is embedded in this URL and is therefore stored with the notebook;
# consider supplying it from an environment variable before sharing or committing.
%sql postgresql://postgres:12345@localhost/ecomm_cleanse

## 2 | Create tables

In [None]:
%%sql 

-- Create a table for each csv file in ../data/raw folder.
-- Every column is TEXT except `date` (DATE) and `productrefundamount` (REAL).
-- DROP ... CASCADE also removes objects that depend on the table (e.g. views), so this cell
-- can be re-run from a clean slate; the IF NOT EXISTS on CREATE is redundant after the DROP.
-- NOTE(review): the load step later in this notebook calls to_sql(..., if_exists='replace'),
-- which drops these tables and recreates them from the DataFrame dtypes, so the column
-- types declared here do not survive that step -- confirm which schema is intended.
DROP TABLE IF EXISTS all_sessions CASCADE;
CREATE TABLE IF NOT EXISTS all_sessions (
    fullvisitorid TEXT,
    channelgrouping TEXT,
    time TEXT,
    country TEXT,
    city TEXT,
    totaltransactionrevenue TEXT,
    transactions TEXT,
    timeonsite TEXT,
    pageviews TEXT,
    sessionqualitydim TEXT,
    date DATE,
    visitid TEXT,
    type TEXT,
    productrefundamount REAL,
    productquantity TEXT,
    productprice TEXT,
    productrevenue TEXT,
    productsku TEXT,
    v2productname TEXT,
    v2productcategory TEXT,
    productvariant TEXT,
    currencycode TEXT,
    itemquantity TEXT,
    itemrevenue TEXT,
    transactionrevenue TEXT,
    transactionid TEXT,
    pagetitle TEXT,
    searchkeyword TEXT,
    pagepathlevel1 TEXT,
    ecommerceactiontype TEXT,
    ecommerceactionstep TEXT,
    ecommerceactionoption TEXT
);

-- analytics: 14 columns, all TEXT except `date` (DATE).
DROP TABLE IF EXISTS analytics CASCADE;
CREATE TABLE IF NOT EXISTS analytics (
    visitnumber TEXT, 
    visitid TEXT, 
    visitstarttime TEXT, 
    date DATE, 
    fullvisitorid TEXT, 
    userid TEXT, 
    channelgrouping TEXT,
    socialengagementtype TEXT,
    unitssold TEXT,
    pageviews TEXT,
    timeonsite TEXT,
    bounces TEXT,
    revenue TEXT,
    unitprice TEXT
);

## 3 | Import data into tables

### Import raw csv data as DataFrame

In [8]:
# Load both raw extracts with dtype=str so pandas does no type inference:
# every column stays text and long numeric IDs are kept verbatim.
all_sessions, analytics = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in ('../data/raw/all_sessions.csv', '../data/raw/analytics.csv')
)

In [9]:
# (rows, columns) of each raw frame, as a quick sanity check on the load
tuple(frame.shape for frame in (all_sessions, analytics))

((15134, 32), (4301122, 14))

### Fill database tables using dataframes

In [7]:

# Write each raw DataFrame into its PostgreSQL table.
# NOTE(review): the connection URL embeds the database password; consider reading it
# from an environment variable before sharing or committing this notebook.
alchemyEngine = create_engine('postgresql://postgres:12345@localhost/ecomm_cleanse')

# list of tables and dataframes to be written to the PostgreSQL database
table_names = ['all_sessions', 'analytics']
dataframes = [all_sessions, analytics]

# The context manager closes the connection even when a write fails part-way,
# so a failed load no longer leaves an open connection behind.
with alchemyEngine.connect() as dbConnection:
    # Loop through each table name and corresponding DataFrame
    for table_name, dataframe in zip(table_names, dataframes):
        # NOTE(review): if_exists='replace' drops the existing table and recreates it from
        # the DataFrame dtypes (all strings here), so the column types declared in the
        # CREATE TABLE cell are discarded. Kept as-is because the sample outputs below show
        # the text form (e.g. date = 20160913); switch to 'append' to keep the declared schema.
        dataframe.to_sql(
            table_name,
            dbConnection,
            if_exists='replace',
            index=False,
            schema='public',
            chunksize=50_000,  # insert in batches instead of materialising every row at once
        )

### Test query for each table

In [18]:
# Smoke test: fetch a single row to confirm all_sessions exists and was populated
%sql SELECT * FROM all_sessions LIMIT 1;

 * postgresql://postgres:***@localhost/ecomm_cleanse
1 rows affected.


fullvisitorid,channelgrouping,time,country,city,totaltransactionrevenue,transactions,timeonsite,pageviews,sessionqualitydim,date,visitid,type,productrefundamount,productquantity,productprice,productrevenue,productsku,v2productname,v2productcategory,productvariant,currencycode,itemquantity,itemrevenue,transactionrevenue,transactionid,pagetitle,searchkeyword,pagepathlevel1,ecommerceactiontype,ecommerceactionstep,ecommerceactionoption
2817722496551184128,Direct,122213,Taiwan,(not set),,,142,7,,20160913,1473757371,PAGE,,,2990000,,GGOEGAAX0074,Google 22 oz Water Bottle,Home/Drinkware/,(not set),USD,,,,,Drinkware,,/google+redesign/,0,1,


In [19]:
# Smoke test: fetch a single row to confirm analytics exists and was populated
%sql SELECT * FROM analytics LIMIT 1;

 * postgresql://postgres:***@localhost/ecomm_cleanse
1 rows affected.


visitnumber,visitid,visitstarttime,date,fullvisitorid,userid,channelgrouping,socialengagementtype,unitssold,pageviews,timeonsite,bounces,revenue,unitprice
7,1498424366,1498424366,20170625,9444016982622091039,,Display,Not Socially Engaged,,1,,1,,8990000


## 4 | Create backup tables

- This cleaning and analysis will involve many table alterations, so let's create a backup of each table in its original form, for easy restoration in the event of a mistaken, irreversible alteration.

In [None]:
%%sql
-- Snapshot all_sessions as loaded. Any previous backup is dropped first so the cell can be
-- re-run; the IF NOT EXISTS on CREATE is therefore redundant.
-- CREATE TABLE ... AS SELECT copies the rows and column types of the source table.
DROP TABLE IF EXISTS all_sessions_backup;
CREATE TABLE IF NOT EXISTS all_sessions_backup AS SELECT * FROM all_sessions;

-- Same snapshot for analytics.
DROP TABLE IF EXISTS analytics_backup;
CREATE TABLE IF NOT EXISTS analytics_backup AS SELECT * FROM analytics;


## 5 | Conclusion + next steps

- Database created, connection established
- Tables created and filled with csv data

Continue to [02_initial_data_prep.ipynb](./02_initial_data_prep.ipynb)

### 