In [1]:
import sys
sys.path.append('/path/to/utils')

In [2]:
import pandas as pd
from zipfile import ZipFile
import os
import requests
import numpy as np
from data_ingestion import DataIngestion
from datetime import datetime
from data_preprocessing import *
import psycopg2

In [3]:
# Source database (xtage_std) connection settings.
# Host, port, user and password are read from environment variables so that real
# secrets never have to be written into the notebook. When a variable is unset,
# the value that was previously hard-coded in this cell is used as the fallback.
db_credentials = {
    'host': os.environ.get('XTAGE_DB_HOST', 'rds-endpoint'),
    'port': int(os.environ.get('XTAGE_DB_PORT', 5432)),  # env values are strings
    'database': 'xtage_std',
    'user': os.environ.get('XTAGE_DB_USER', 'postgres'),
    'password': os.environ.get('XTAGE_DB_PASSWORD', '******')
}


In [4]:
# Initialize DataIngestion class
# xtp_con is the handle to the source database described by db_credentials;
# the cells below call xtp_con.fetch_query_results(...) on it to load each table.
xtp_con = DataIngestion(db_credentials)

Database engine created successfully


In [5]:
# Pull every source table in full, one query per table, in the order listed.
# Each result is bound to its own <table>_prem variable for the steps below.
(
    customers_prem,
    products_prem,
    transactions_prem,
    sales_data_prem,
    exchange_rates_prem,
) = [
    xtp_con.fetch_query_results(f'select * from {source_table}')
    for source_table in (
        'customers',
        'products',
        'transactions',
        'sales_data',
        'exchange_rates',
    )
]


# Data Preprocessing Pipeline

In [6]:
## Filling missing values

In [7]:
# Pass a per-column fill value ('unknown') for gender and location to the
# fill_missing_values helper (its exact behaviour is defined in data_preprocessing).
customers_prem = fill_missing_values(
    customers_prem,
    dict(gender='unknown', location='unknown'),
)

In [8]:
## Handle Abnormal values

In [9]:
# Per-column validity rules for the products table.
# NOTE(review): the rule types 'minrange_include' / 'range_exclude' are interpreted
# by handle_abnormal_values in data_preprocessing — presumably inclusive vs.
# exclusive bounds; confirm there before changing any range.
products_prem = handle_abnormal_values(
    products_prem,
    dict(
        price=dict(type='minrange_include', range=(0, np.inf)),
        stock_available=dict(type='range_exclude', range=(0, np.inf)),
    ),
)

In [10]:
# Validity rule for transactions.quantity.
# NOTE(review): the lower bound here is 1, while the sales_data cell uses 0 with
# the same 'range_exclude' rule type for quantity. If the bounds are exclusive,
# quantity == 1 would be treated as abnormal here — confirm this is intended.
transactions_prem = handle_abnormal_values(
    transactions_prem,
    dict(
        quantity=dict(type='range_exclude', range=(1, np.inf)),
    ),
)

In [11]:
# Per-column validity rules for the sales_data table (same rule types as products:
# 'minrange_include' for price, 'range_exclude' for quantity).
# NOTE(review): rule-type semantics are defined in data_preprocessing — verify there.
sales_data_prem = handle_abnormal_values(
    sales_data_prem,
    dict(
        price=dict(type='minrange_include', range=(0, np.inf)),
        quantity=dict(type='range_exclude', range=(0, np.inf)),
    ),
)

In [12]:
### Recalculating total_amount in transactions using the price from products.
### Cross-checked the product prices within the transaction data, but the values vary for the same product.
### Imputing only the missing values of total_amount in transactions from the product data would be inefficient,
### hence the whole column is recalculated.

In [13]:
# Rebuild total_amount as (product price * quantity) in one re-runnable step.
# The helper 'price' column is added and removed inside the same chain, so this
# cell never leaves transactions_prem in a half-processed state, and a partial
# re-run cannot produce price_x / price_y columns.
# - how='left' keeps every transaction; a product_id with no match in
#   products_prem gets a NaN price and therefore a NaN total_amount.
# - validate='many_to_one' makes pandas raise MergeError if product_id is not
#   unique in products_prem, instead of silently duplicating transaction rows.
transactions_prem = (
    transactions_prem
    .merge(
        products_prem[['product_id', 'price']],
        how='left',
        on='product_id',
        validate='many_to_one',
    )
    .assign(total_amount=lambda frame: frame['price'] * frame['quantity'])
    .drop(columns=['price'])
)

In [16]:
## Duplicates Check

In [17]:
# exchange_rates is the only frame passed through handle_duplicates. Its target
# table is also the only one created without a PRIMARY KEY, so the database
# itself would not reject repeated rows at insert time.
exchange_rates_prem = handle_duplicates(exchange_rates_prem)

## Inserting data into a new database - this is the prod database that the APIs read from

In [19]:
# Connection settings for the server's default 'postgres' database, used only to
# issue the create_database call for the new target database.
# Host, port, user and password are read from environment variables so that real
# secrets never have to be written into the notebook. When a variable is unset,
# the value that was previously hard-coded in this cell is used as the fallback.
db_credentials_create = {
    'host': os.environ.get('XTAGE_DB_HOST', 'rds-endpoint'),
    'port': int(os.environ.get('XTAGE_DB_PORT', 5432)),  # env values are strings
    'database': 'postgres',
    'user': os.environ.get('XTAGE_DB_USER', 'postgres'),
    'password': os.environ.get('XTAGE_DB_PASSWORD', '******')
}


In [20]:
# Initialize DataIngestion class
# db_ing connects using db_credentials_create (database 'postgres'); it is the
# handle on which create_database is called in the next cell.
db_ing = DataIngestion(db_credentials_create)

Database engine created successfully


In [21]:
db_ing.create_database("xtage_cld")  # target database for the processed tables (not raw files)

Database 'xtage_cld' created successfully.


## Storing processed data into tables inside xtage_cld

In [22]:
# DDL for the five target tables. Every statement uses CREATE TABLE IF NOT EXISTS,
# so executing it against a database that already has the table is a no-op.

# customers: keyed by customer_id; only customer_name is mandatory.
create_customers_cld_table_sql = """
CREATE TABLE IF NOT EXISTS customers (
    customer_id INT PRIMARY KEY,
    customer_name VARCHAR(255) NOT NULL,
    age INT,
    gender VARCHAR(50),
    location VARCHAR(255),
    date_joined DATE
);
"""

# products: keyed by product_id; name and category are mandatory.
create_products_cld_table_sql = """
CREATE TABLE IF NOT EXISTS products (
    product_id INT PRIMARY KEY,
    product_name VARCHAR(255) NOT NULL,
    category VARCHAR(255) NOT NULL,
    price FLOAT,
    stock_available INT
);
"""

# transactions: keyed by transaction_id. customer_id / product_id are NOT NULL
# but no FOREIGN KEY constraints are declared, so referential integrity is not
# enforced by the database.
create_transactions_cld_table_sql = """
CREATE TABLE IF NOT EXISTS transactions (
    transaction_id INT PRIMARY KEY,
    customer_id INT NOT NULL,
    product_id INT NOT NULL,
    quantity INT,
    transaction_date DATE,
    total_amount FLOAT
);
"""

# sales_data: keyed by transaction_id; product_id is NOT NULL, no FOREIGN KEY.
create_sales_cld_table_sql = """
CREATE TABLE IF NOT EXISTS sales_data (
    transaction_id INT PRIMARY KEY,
    product_id INT NOT NULL,
    quantity INT,
    price FLOAT,
    transaction_date DATE
);
"""
    
# exchange_rates: no PRIMARY KEY or UNIQUE constraint is declared, so the
# database accepts repeated (currency_code, date) rows.
create_exchng_cld_table_sql = """
CREATE TABLE IF NOT EXISTS exchange_rates (
    currency_code VARCHAR(3) NOT NULL,
    exchange_rate FLOAT,
    date DATE
);
"""

In [23]:
# Target database (xtage_cld) connection settings.
# Host, port, user and password are read from environment variables so that real
# secrets never have to be written into the notebook. When a variable is unset,
# the value that was previously hard-coded in this cell is used as the fallback.
# NOTE: this rebinds db_credentials, which earlier in the notebook held the
# xtage_std source settings. Re-running the xtp_con cell after this one would
# therefore connect to xtage_cld instead of the source database.
db_credentials = {
    'host': os.environ.get('XTAGE_DB_HOST', 'rds-endpoint'),
    'port': int(os.environ.get('XTAGE_DB_PORT', 5432)),  # env values are strings
    'database': 'xtage_cld',
    'user': os.environ.get('XTAGE_DB_USER', 'postgres'),
    'password': os.environ.get('XTAGE_DB_PASSWORD', '******')
}

# Initialize DataIngestion class
# xtp2_con is the handle to the xtage_cld target database; the cells below use it
# to run the CREATE TABLE statements and to insert the processed DataFrames.
xtp2_con = DataIngestion(db_credentials)

Database engine created successfully


In [24]:
# Run each CREATE TABLE statement against xtage_cld, one query per table,
# in this order: sales_data, exchange_rates, customers, products, transactions.
for table_ddl in (
    create_sales_cld_table_sql,
    create_exchng_cld_table_sql,
    create_customers_cld_table_sql,
    create_products_cld_table_sql,
    create_transactions_cld_table_sql,
):
    xtp2_con.execute_query(table_ddl)

Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.


In [25]:
# Load each processed DataFrame into its same-named table in xtage_cld.
# The (table, frame) pairs are inserted one after another in the order listed.
for target_table, processed_frame in (
    ('customers', customers_prem),
    ('sales_data', sales_data_prem),
    ('exchange_rates', exchange_rates_prem),
    ('products', products_prem),
    ('transactions', transactions_prem),
):
    xtp2_con.insert_data_from_df(df=processed_frame, table_name=target_table)

Data from DataFrame inserted into customers successfully.
Data from DataFrame inserted into sales_data successfully.
Data from DataFrame inserted into exchange_rates successfully.
Data from DataFrame inserted into products successfully.
Data from DataFrame inserted into transactions successfully.
