## Connect

In [43]:
import json
from urllib.parse import quote_plus

import boto3

def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and decode its JSON payload.

    Args:
        secret_name: Identifier passed to Secrets Manager as ``SecretId``.
        region_name: AWS region used when creating the Secrets Manager client.

    Returns:
        The object produced by ``json.loads`` on the ``SecretString`` field
        of the response.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])

# Read the database credentials from the "wysde" secret rather than
# hard-coding them in the notebook.
creds = get_secret("wysde")
USERNAME = creds["RDS_MYSQL_USERNAME"]
PASSWORD = creds["RDS_MYSQL_PASSWORD"]
HOST = creds["RDS_MYSQL_HOST"]
DATABASE = 'store'

# Percent-encode the user name and password so that characters such as
# "@", ":" or "/" are not mistaken for URL delimiters when the connection
# string is parsed. HOST and DATABASE are inserted unchanged.
conn_str = 'mysql+mysqlconnector://{0}:{1}@{2}/{3}'.format(
    quote_plus(USERNAME), quote_plus(PASSWORD), HOST, DATABASE)

%config SqlMagic.autopandas=True
%config SqlMagic.displaycon=False
%config SqlMagic.feedback=False
%config SqlMagic.displaylimit=5
%reload_ext sql
%sql {conn_str}

Exception during reset or similar
Traceback (most recent call last):
  File "/Users/sparshagarwal/anaconda3/envs/env-spacy/lib/python3.9/site-packages/mysql/connector/network.py", line 159, in send_plain
    self.sock.sendall(packet)
  File "/Users/sparshagarwal/anaconda3/envs/env-spacy/lib/python3.9/ssl.py", line 1204, in sendall
    v = self.send(byte_view[count:])
  File "/Users/sparshagarwal/anaconda3/envs/env-spacy/lib/python3.9/ssl.py", line 1173, in send
    return self._sslobj.write(data)
BrokenPipeError: [Errno 32] Broken pipe

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/sparshagarwal/anaconda3/envs/env-spacy/lib/python3.9/site-packages/sqlalchemy/pool/base.py", line 739, in _finalize_fairy
    fairy._reset(pool)
  File "/Users/sparshagarwal/anaconda3/envs/env-spacy/lib/python3.9/site-packages/sqlalchemy/pool/base.py", line 988, in _reset
    pool._dialect.do_rollback(self)
  File "/Users/sparshagarwal

## Design the Schema

![](https://user-images.githubusercontent.com/62965911/216779523-3b3e5bc8-f12a-49c0-9bca-2b5b6bc2f163.png)

## Implement the Schema

In [41]:
%%sql

-- One row per product, keyed by the auto-generated product_id.
-- unit_price is decimal(4,2), so it holds at most 99.99.
CREATE TABLE `products` (
  `product_id` int(11) NOT NULL AUTO_INCREMENT,
  `name` varchar(50) NOT NULL,
  `quantity_in_stock` int(11) NOT NULL,
  `unit_price` decimal(4,2) NOT NULL,
  PRIMARY KEY (`product_id`)
);


-- One row per shipper, keyed by the auto-generated shipper_id.
-- shipper_id is smallint(6), the same type as orders.shipper_id which references it.
CREATE TABLE `shippers` (
  `shipper_id` smallint(6) NOT NULL AUTO_INCREMENT,
  `name` varchar(50) NOT NULL,
  PRIMARY KEY (`shipper_id`)
);


-- One row per customer, keyed by the auto-generated customer_id.
-- birth_date and phone are optional, every other column is required.
-- points defaults to 0 when it is not supplied.
CREATE TABLE `customers` (
  `customer_id` int(11) NOT NULL AUTO_INCREMENT,
  `first_name` varchar(50) NOT NULL,
  `last_name` varchar(50) NOT NULL,
  `birth_date` date DEFAULT NULL,
  `phone` varchar(50) DEFAULT NULL,
  `address` varchar(50) NOT NULL,
  `city` varchar(50) NOT NULL,
  `state` char(2) NOT NULL,
  `points` int(11) NOT NULL DEFAULT '0',
  PRIMARY KEY (`customer_id`)
);


-- Lookup table referenced by orders.status.
-- order_status_id has no AUTO_INCREMENT, so every insert must supply the id.
CREATE TABLE `order_statuses` (
  `order_status_id` tinyint(4) NOT NULL,
  `name` varchar(50) NOT NULL,
  PRIMARY KEY (`order_status_id`)
);


-- One row per order, keyed by the auto-generated order_id.
-- Must be created after customers, order_statuses and shippers because it
-- declares foreign keys to all three.
-- status defaults to 1 and references order_statuses.order_status_id.
-- shipped_date and shipper_id are nullable.
-- Each foreign key cascades updates of the referenced id and declares no
-- ON DELETE action.
CREATE TABLE `orders` (
  `order_id` int(11) NOT NULL AUTO_INCREMENT,
  `customer_id` int(11) NOT NULL,
  `order_date` date NOT NULL,
  `status` tinyint(4) NOT NULL DEFAULT '1',
  `comments` varchar(2000) DEFAULT NULL,
  `shipped_date` date DEFAULT NULL,
  `shipper_id` smallint(6) DEFAULT NULL,
  PRIMARY KEY (`order_id`),
  KEY `fk_orders_customers_idx` (`customer_id`),
  KEY `fk_orders_shippers_idx` (`shipper_id`),
  KEY `fk_orders_order_statuses_idx` (`status`),
  CONSTRAINT `fk_orders_customers` FOREIGN KEY (`customer_id`) REFERENCES `customers` (`customer_id`) ON UPDATE CASCADE,
  CONSTRAINT `fk_orders_order_statuses` FOREIGN KEY (`status`) REFERENCES `order_statuses` (`order_status_id`) ON UPDATE CASCADE,
  CONSTRAINT `fk_orders_shippers` FOREIGN KEY (`shipper_id`) REFERENCES `shippers` (`shipper_id`) ON UPDATE CASCADE
);


-- Line items of an order, keyed by the pair (order_id, product_id).
-- order_id is a plain foreign key to orders.order_id. It is not AUTO_INCREMENT,
-- because an id generated here would not match any existing order.
-- unit_price is decimal(4,2), so it holds at most 99.99.
-- Must be created after orders and products because of the foreign keys.
CREATE TABLE `order_items` (
  `order_id` int(11) NOT NULL,
  `product_id` int(11) NOT NULL,
  `quantity` int(11) NOT NULL,
  `unit_price` decimal(4,2) NOT NULL,
  PRIMARY KEY (`order_id`,`product_id`),
  KEY `fk_order_items_products_idx` (`product_id`),
  CONSTRAINT `fk_order_items_orders` FOREIGN KEY (`order_id`) REFERENCES `orders` (`order_id`) ON UPDATE CASCADE,
  CONSTRAINT `fk_order_items_products` FOREIGN KEY (`product_id`) REFERENCES `products` (`product_id`) ON UPDATE CASCADE
);

## Ingest the data

In [26]:
import pandas as pd
from sqlalchemy import inspect, create_engine

In [37]:
def establish_connection(user, password, host, database):
    """Create a SQLAlchemy engine for a MySQL database using the PyMySQL driver.

    The user name and password are percent-encoded before they are placed in
    the URL, so credentials containing characters such as "@", ":" or "/"
    are parsed correctly. Pass the raw, un-encoded credentials.

    Args:
        user: Database user name.
        password: Database password.
        host: Host part of the URL, inserted unchanged.
        database: Database name, inserted unchanged.

    Returns:
        The engine returned by ``create_engine``.

    NOTE(review): per the SQLAlchemy documentation ``create_engine`` does not
    open a connection until the engine is first used, so the printed message
    does not prove that the server is reachable - confirm whether an explicit
    connectivity check is wanted here.
    """
    path = 'mysql+pymysql://{0}:{1}@{2}/{3}'.format(
        quote_plus(user), quote_plus(password), host, database)
    engine = create_engine(path)
    print('Connection sucessfully established with engine', engine)
    return engine

In [38]:
# Engine used by the ingestion cells below, built from the credentials loaded in the Connect section.
engine = establish_connection(USERNAME, PASSWORD, HOST, DATABASE)

Connection sucessfully established with engine Engine(mysql+pymysql://admin:***@database-2.cy8ltogyfgas.us-east-1.rds.amazonaws.com/store)


In [39]:
def sql_table_column(table, engine_name):
    """Return the column names of a database table, in inspector order.

    Args:
        table: Name of the table to inspect.
        engine_name: Object handed to ``inspect`` (the notebook passes its engine).

    Returns:
        A list with the ``"name"`` entry of every column description returned
        by ``get_columns``. The list is also printed.
    """
    inspector = inspect(engine_name)
    col_names = []
    for column in inspector.get_columns(table):
        col_names.append(column["name"])
    print('column names are %s for table %s' % (col_names, table))
    return col_names

def transform_table(table_name, file_path, engine_name):
    """Read ``<file_path><table_name>.csv`` into a DataFrame labelled with the table's columns.

    Args:
        table_name: Table whose column names label the CSV columns; also the
            CSV file name without its extension.
        file_path: Prefix concatenated directly in front of the file name, so
            a directory must end with a path separator.
        engine_name: Passed through to ``sql_table_column``.

    Returns:
        The CSV contents with columns that are entirely empty removed.
    """
    column_names = sql_table_column(table_name, engine_name)
    csv_path = file_path + table_name + ".csv"
    print("file path is ", csv_path)
    # The table's column names are supplied through names=, so pandas does not
    # take them from the file.
    frame = pd.read_csv(csv_path, sep=',', names=column_names, index_col=False)
    return frame.dropna(how='all', axis='columns')

In [42]:
# Load order matters: list the tables that have no foreign keys first, so the
# rows referenced by orders and order_items already exist when those are loaded.
sql_table = ["products", "shippers", "customers", "order_statuses", 
             "orders", "order_items"]

# Prefix of the CSV files. transform_table concatenates it directly with the
# table name, so the trailing slash is required.
path = './data/'

# Read each table's CSV and append its rows to the table of the same name.
# NOTE(review): if_exists='append' means re-running this cell inserts the same
# rows again - presumably this fails on the primary keys; confirm before re-running.
for table in sql_table:
    df = transform_table(table, path, engine)
    df.to_sql(table, engine, if_exists='append', index=False, index_label=False)

column names are ['product_id', 'name', 'quantity_in_stock', 'unit_price'] for table products
file path is  ./data/products.csv
column names are ['shipper_id', 'name'] for table shippers
file path is  ./data/shippers.csv


  return func(*args, **kwargs)


column names are ['customer_id', 'first_name', 'last_name', 'birth_date', 'phone', 'address', 'city', 'state', 'points'] for table customers
file path is  ./data/customers.csv
column names are ['order_status_id', 'name'] for table order_statuses
file path is  ./data/order_statuses.csv
column names are ['order_id', 'customer_id', 'order_date', 'status', 'comments', 'shipped_date', 'shipper_id'] for table orders
file path is  ./data/orders.csv
column names are ['order_id', 'product_id', 'quantity', 'unit_price'] for table order_items
file path is  ./data/order_items.csv


## De-Normalized Table

In this OLTP system, the `customers` table is normalized but still contains data that is redundant for analysis. Redundant here means rows that never take part in a transaction: for example, customer id 1 never ordered anything in the given period of time. Other tables such as `products`, `shippers`, and `order_statuses` are also normalized but contain the same kind of redundant rows. The staging table below therefore joins every table to `orders`, so only rows that belong to an order are carried forward.

In [44]:
%%sql
-- Denormalised staging table with one row per order item, joined to its
-- order, customer, shipper, order status and product.
-- All joins are inner joins and orders.shipper_id is nullable, so orders
-- without a shipper do not appear in this table.
-- Columns are listed explicitly instead of using c.* and p.*, in the same
-- order those wildcards produced, so the staging layout does not change
-- silently when a source table gains a column.
DROP TABLE IF EXISTS stage_table;
CREATE TABLE stage_table (pri_key INT NOT NULL PRIMARY KEY AUTO_INCREMENT) AS
SELECT o.order_id,
    o.order_date,
    YEAR(o.order_date) AS order_year,
    MONTH(o.order_date) AS order_month,
    DAY(o.order_date) AS order_day,
    o.shipped_date,
    c.customer_id,
    c.first_name,
    c.last_name,
    c.birth_date,
    c.phone,
    c.address,
    c.city,
    c.state,
    c.points,
    s.shipper_id,
    s.name AS shipper_name,
    oi.quantity,
    oi.unit_price AS item_unit_price,
    os.order_status_id,
    os.name AS order_status,
    p.product_id,
    p.name,
    p.quantity_in_stock,
    p.unit_price
FROM orders o
    JOIN customers c ON c.customer_id = o.customer_id
    JOIN shippers s ON s.shipper_id = o.shipper_id
    JOIN order_items oi ON oi.order_id = o.order_id
    JOIN order_statuses os ON os.order_status_id = o.status
    JOIN products p ON p.product_id = oi.product_id;

## Dimensional Modeling

DIM_CUSTOMERS

In [45]:
%%sql
-- Customer dimension. The table is dropped and recreated each time this cell runs.
-- dim_customer_id is the auto-generated surrogate key and customer_id keeps
-- the id from the source customers table.
-- address, city, state and points are nullable here although they are
-- NOT NULL in the source customers table.
DROP TABLE IF EXISTS dim_customers;
CREATE TABLE dim_customers (
    `dim_customer_id` INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,
    `customer_id` INTEGER NOT NULL,
    `first_name` VARCHAR(50) NOT NULL,
    `last_name` VARCHAR(50) NOT NULL,
    `birth_date` DATE DEFAULT NULL,
    `phone` VARCHAR(50) DEFAULT NULL,
    `address` VARCHAR(50) DEFAULT NULL,
    `city` VARCHAR(50) DEFAULT NULL,
    `state` CHAR(2) DEFAULT NULL,
    `points` INTEGER DEFAULT NULL
);

-- Fill dim_customers with every distinct combination of customer attributes
-- found in stage_table. dim_customer_id is left to AUTO_INCREMENT.
INSERT INTO dim_customers
    (`customer_id`, `first_name`, `last_name`, `birth_date`, `phone`,
     `address`, `city`, `state`, `points`)
SELECT DISTINCT
    src.customer_id,
    src.first_name,
    src.last_name,
    src.birth_date,
    src.phone,
    src.address,
    src.city,
    src.state,
    src.points
FROM stage_table AS src;

DIM_ORDER_STATUSES

In [46]:
%%sql
-- Order status dimension. The table is dropped and recreated each time this cell runs.
-- dim_order_statuses_id is the auto-generated surrogate key.
-- order_status_id is INTEGER and name is TEXT here, while the source
-- order_statuses table uses tinyint(4) and varchar(50).
DROP TABLE IF EXISTS dim_order_statuses;
CREATE TABLE dim_order_statuses (
    `dim_order_statuses_id` INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,
    `order_status_id` INTEGER NOT NULL,
    `name` TEXT DEFAULT NULL
);

In [47]:
%%sql
-- Copy each distinct pair of order_status_id and order_status from stage_table.
-- The staged order_status text is stored in the name column.
INSERT INTO dim_order_statuses
    (`order_status_id`, `name`)
SELECT DISTINCT
    src.order_status_id,
    src.order_status
FROM stage_table AS src;

DIM_ORDER_DATE

In [48]:
%%sql
-- Order date dimension. The table is dropped and recreated each time this cell runs.
-- dim_order_date_id is the auto-generated surrogate key.
-- The year, month and day parts of order_date are stored as separate
-- SMALLINT columns next to the date itself.
DROP TABLE IF EXISTS dim_order_date;
CREATE TABLE dim_order_date (
    `dim_order_date_id` INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,
    `order_date` DATE NOT NULL,
    `order_year` SMALLINT NOT NULL,
    `order_month` SMALLINT NOT NULL,
    `order_day` SMALLINT NOT NULL
);

In [49]:
%%sql
-- Copy each distinct order date and its staged year, month and day parts
-- from stage_table. dim_order_date_id is left to AUTO_INCREMENT.
INSERT INTO dim_order_date
    (`order_date`, `order_year`, `order_month`, `order_day`)
SELECT DISTINCT
    src.order_date,
    src.order_year,
    src.order_month,
    src.order_day
FROM stage_table AS src;

DIM_PRODUCTS

In [50]:
%%sql
-- Product dimension. The table is dropped and recreated each time this cell runs.
-- dim_product_id is the auto-generated surrogate key and product_id keeps
-- the id from the source products table.
-- unit_price is DECIMAL(4, 2) like products.unit_price, so it holds at most 99.99.
-- quantity_in_stock is nullable here although it is NOT NULL in products.
DROP TABLE IF EXISTS dim_products;
CREATE TABLE dim_products (
    `dim_product_id` INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,
    `product_id` INTEGER NOT NULL,
    `name` TEXT NOT NULL,
    `quantity_in_stock` INTEGER DEFAULT NULL,
    `unit_price` DECIMAL(4, 2) NOT NULL
);

In [51]:
%%sql
-- Copy each distinct product from stage_table.
-- name and unit_price are the columns staged from the products table.
-- The price of the order item is staged separately as item_unit_price.
INSERT INTO dim_products
    (`product_id`, `name`, `quantity_in_stock`, `unit_price`)
SELECT DISTINCT
    src.product_id,
    src.name,
    src.quantity_in_stock,
    src.unit_price
FROM stage_table AS src;

DIM_SHIPPERS

In [52]:
%%sql
-- Shipper dimension. The table is dropped and recreated each time this cell runs.
-- dim_shippers_id is the auto-generated surrogate key and shipper_id keeps
-- the id from the source shippers table.
DROP TABLE IF EXISTS dim_shippers;
CREATE TABLE dim_shippers (
    `dim_shippers_id` INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,
    `shipper_id` INTEGER NOT NULL,
    `shipper_name` TEXT NOT NULL
);

In [53]:
%%sql
-- Copy each distinct pair of shipper_id and shipper_name from stage_table.
-- dim_shippers_id is left to AUTO_INCREMENT.
INSERT INTO dim_shippers
    (`shipper_id`, `shipper_name`)
SELECT DISTINCT
    src.shipper_id,
    src.shipper_name
FROM stage_table AS src;

FACT_SALES

In [54]:
%%sql
-- Sales fact table. The table is dropped and recreated each time this cell runs.
-- fact_sales_id is the auto-generated surrogate key.
-- quantity is INTEGER to match order_items.quantity, which is int(11).
-- A DECIMAL(4, 2) column would cap quantities at 99.99.
-- item_unit_price keeps DECIMAL(4, 2), the type of order_items.unit_price.
DROP TABLE IF EXISTS fact_sales;
CREATE TABLE fact_sales (
    `fact_sales_id` INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,
    `customer_id` INTEGER NOT NULL,
    `order_status_id` INTEGER NOT NULL,
    `product_id` INTEGER NOT NULL,
    `shipper_id` INTEGER NOT NULL,
    `quantity` INTEGER NOT NULL,
    `item_unit_price` DECIMAL(4, 2) NOT NULL,
    `order_date` DATE NOT NULL
);

In [55]:
%%sql
-- Load one fact row per stage_table row, which is one row per order item.
-- DISTINCT is deliberately not used. The selected columns do not include
-- order_id, so DISTINCT would merge separate order items that happen to share
-- customer, status, product, shipper, quantity, price and date.
-- NOTE(review) the fact stores the natural ids, so the joins to the dimension
-- tables only require a matching dimension row to exist - confirm whether the
-- surrogate dimension ids were intended instead.
-- The ORDER BY is presumably there so that fact_sales_id is assigned in
-- order_date order.
INSERT INTO fact_sales (
        `customer_id`,
        `order_status_id`,
        `product_id`,
        `shipper_id`,
        `quantity`,
        `item_unit_price`,
        `order_date`
    )
SELECT dc.customer_id,
    dos.order_status_id,
    dp.product_id,
    ds.shipper_id,
    st.quantity,
    st.item_unit_price,
    dod.order_date
FROM stage_table st
    JOIN dim_customers dc ON dc.customer_id = st.customer_id
    JOIN dim_order_statuses dos ON dos.order_status_id = st.order_status_id
    JOIN dim_order_date dod ON dod.order_date = st.order_date
    JOIN dim_products dp ON dp.product_id = st.product_id
    JOIN dim_shippers ds ON ds.shipper_id = st.shipper_id
ORDER BY dod.order_date;