## Add a row to products

In [1]:
%load_ext sql

In [2]:
import sqlalchemy

In [3]:
# Point the sql magic at the local DataWarehouseX PostgreSQL database.
# NOTE(review): the password is embedded in this URL and ends up in the saved
# notebook; consider building the URL from an environment variable instead.
%sql postgresql://postgres:postgresql@localhost/DataWarehouseX

In [4]:
# Values for the single test row inserted into public.products by the next cell.
product_id = "P0755"
product_name = "test product name B(test brand B)"
category = "test category B"
subcategory = "test subcategory B"

In [8]:
%%sql

-- Insert one row into public.products using the Python variables set in the previous cell.
-- NOTE for review - the values are spliced into the statement as quoted text, so a value
-- containing a single quote would break it. Prefer bind parameters if the installed
-- sql extension supports them.
-- target columns
INSERT INTO public.products(product_id, product_name, category, subcategory)
-- values to insert
VALUES('{product_id}', '{product_name}', '{category}', '{subcategory}')

 * postgresql://postgres:***@localhost/DataWarehouseX
1 rows affected.


[]

## ETL

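+ Step 1: truncate the staging table `"Staging".dim_product`.
+ Step 2: load the rows of `products` into the staging table.
+ Step 3: transform the staged rows (clean the product name, extract the brand) and insert them into `core.dim_product`.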

In [9]:
import os
import re
import sys
from io import StringIO

import numpy as np
import pandas as pd
import pandas.io.sql as sqlio
import psycopg2
import psycopg2.extras as extras
import sqlalchemy

########################################################################################
"""
Here you want to change your database, username & password according to your own values
"""
# Connection settings handed to psycopg2.connect(). Each entry can be
# overridden with the matching PG* environment variable so that credentials do
# not have to live in the notebook; the fallbacks are the original
# local-development values.
param_dic = {
    "host"      : os.environ.get("PGHOST", "localhost"),
    "database"  : os.environ.get("PGDATABASE", "DataWarehouseX"),
    "user"      : os.environ.get("PGUSER", "postgres"),
    "password"  : os.environ.get("PGPASSWORD", "postgresql"),
}

########################################################################################
"""
Function to connect to the database
"""
def connect(params_dic):
    """Open a connection to the PostgreSQL database server.

    Parameters
    ----------
    params_dic : dict
        Keyword arguments forwarded unchanged to ``psycopg2.connect``.

    Returns
    -------
    The connection object returned by ``psycopg2.connect``.

    Raises
    ------
    SystemExit
        With exit code 1 when the connection attempt fails; the underlying
        error is printed first.
    """
    print('Connecting to the PostgreSQL database...')
    try:
        conn = psycopg2.connect(**params_dic)
    except Exception as error:
        # psycopg2's own errors derive from Exception, so one clause covers
        # them. ``sys`` must be imported for the exit below to work; without it
        # the NameError would hide the real connection error.
        print(error)
        sys.exit(1)
    print("Connection successful")
    return conn
# Single connection shared by every ETL step below.
conn = connect(param_dic)

########################################################################################
"""
Staging Layer

1st Step
"""
def truncate(conn, table):
    """
    Empty ``table`` with a TRUNCATE statement and commit the change.

    ``table`` is placed into the statement text as given, so it must already
    be a valid (and, where needed, quoted) table reference.

    Returns None on success. On failure the error is printed, the transaction
    is rolled back and 1 is returned.
    """
    statement = "TRUNCATE TABLE %s" % table
    cur = conn.cursor()
    try:
        cur.execute(statement)
        conn.commit()
    except Exception as error:
        print("Error: %s" % error)
        conn.rollback()
        cur.close()
        return 1
    else:
        print("truncate done")
        cur.close()

# Step 1: clear the staging table before it is reloaded.
truncate(conn, '"Staging".dim_product')

"""
Staging Layer

2nd Step
"""
def stage(conn):
    """
    Copy the rows of ``products`` into ``"Staging".dim_product``.

    Runs a single INSERT ... SELECT with ``cursor.execute()`` and commits it.

    Returns None on success. On failure the error is printed, the transaction
    is rolled back and 1 is returned. The cursor is closed in every case.
    """
    # The source columns are named explicitly rather than via ``products.*`` so
    # the load does not depend on the physical column order or count of the
    # source table matching the target column list.
    query = (
        'INSERT INTO "Staging".dim_product(product_id, product_name, category, subcategory) '
        'SELECT product_id, product_name, category, subcategory FROM products;'
    )
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        conn.commit()
    except Exception as error:
        print("Error: %s" % error)
        conn.rollback()
        return 1
    finally:
        cursor.close()
    print("stage() done")
    
# Step 2: load the current contents of products into the staging table.
stage(conn)

##################################################################################################
"""
Transform
"""
def get_stage(conn):
    """
    Read the whole ``"Staging".dim_product`` table into a DataFrame.

    Returns the DataFrame on success. On failure the error is printed, the
    transaction is rolled back and 1 is returned.
    """
    query = 'SELECT * FROM "Staging".dim_product'
    try:
        # read_sql_query executes the statement itself, so no separate cursor
        # (and no second execution of the same query) is needed.
        data = sqlio.read_sql_query(query, conn)
        conn.commit()
    except Exception as error:
        print("Error: %s" % error)
        conn.rollback()
        return 1
    return data
    
# Read the staged rows once (the earlier extra call discarded its result) and
# transform a copy so the raw extract stays available for comparison.
dim_product = get_stage(conn)
dim_product_00 = dim_product.copy()

"""    
Remove tab characters from the product name
"""
# Remove every tab character from the product name in one vectorised pass
# (this also covers leading/trailing tabs). The column is addressed by name
# because positional ``row[2]`` lookups on a label-indexed row are deprecated
# in pandas.
# NOTE(review): assumes position 2 of the staged frame is product_name - confirm.
dim_product_00['product_name'] = dim_product_00['product_name'].str.replace('\t', '', regex=False)


"""
Define some functions
"""

def _product_name_field(x):
    """Return the value stored at position 2 of a row."""
    # pandas rows need ``.iloc`` for positional access (integer keys on a
    # label-indexed Series are deprecated); plain sequences index directly.
    return getattr(x, 'iloc', x)[2]


def split_brand(x):
    """
    Extract the brand from a row: the text between the first '(' and the
    following ')' of the field at position 2.

    Returns 'none' when that field has no '(' or is missing or not a string.
    """
    try:
        return _product_name_field(x).split('(', 1)[1].split(')')[0]
    except (AttributeError, IndexError, KeyError, TypeError):
        return 'none'


def split_product_name(x):
    """
    Return the part of the field at position 2 that precedes the first '('
    (the whole value when there is no '(').

    Returns 'none' when that field is missing or not a string.
    """
    try:
        return _product_name_field(x).split('(', 1)[0]
    except (AttributeError, IndexError, KeyError, TypeError):
        return 'none'


"""    
Apply functions to dataframe
"""

# The brand is extracted first: the second line overwrites product_name with
# only the text before '(' and the brand could no longer be recovered from it.
dim_product_00['brand'] = dim_product_00.apply(split_brand, axis=1)
dim_product_00['product_name'] = dim_product_00.apply(split_product_name, axis=1)

###########################################################################################
"""
Core Layer
"""

def update_many(conn, df, table):
    """
    Insert every row of ``df`` into ``table`` using ``cursor.executemany()``.

    The DataFrame's column names form the target column list and one ``%s``
    placeholder is generated per column, so frames of any width are accepted.
    Rows that conflict with existing ones are skipped (ON CONFLICT DO NOTHING).

    Returns None on success. On failure the error is printed, the transaction
    is rolled back and 1 is returned. The cursor is closed in every case.
    """
    # One tuple of values per DataFrame row
    tuples = [tuple(x) for x in df.to_numpy()]
    # Comma-separated column names and a matching number of placeholders
    cols = ','.join(map(str, df.columns))
    placeholders = ','.join(['%s'] * len(df.columns))
    # SQL query to execute
    query = "INSERT INTO %s(%s) VALUES(%s) ON CONFLICT DO NOTHING" % (table, cols, placeholders)
    cursor = conn.cursor()
    try:
        cursor.executemany(query, tuples)
        conn.commit()
    except Exception as error:
        print("Error: %s" % error)
        conn.rollback()
        return 1
    finally:
        cursor.close()
    print("execute_many() done")
    
# Step 3: load the transformed frame into the core layer; the INSERT uses
# ON CONFLICT DO NOTHING, so rows that conflict with existing ones are skipped.
update_many(conn, dim_product_00, 'core.dim_product')

Connecting to the PostgreSQL database...
Connection successful
truncate done
stage() done
execute_many() done


