# AdventureWorks ETL Pipeline
## By Sonika Modur
[Insert description here]

### Import Necessary Libraries

In [1]:
import os
import json
import numpy
import pandas as pd
import datetime
import certifi

import pymongo
import sqlalchemy
from sqlalchemy import create_engine, text

In [2]:
print(f"Running SQL Alchemy Version: {sqlalchemy.__version__}")
print(f"Running PyMongo Version: {pymongo.__version__}")

Running SQL Alchemy Version: 1.4.7
Running PyMongo Version: 4.10.1


### Declare and Assign Connection Variables for MySQL Server and Databases

In [3]:
src_mysql_args = {
    "uid" : "root",
    "pwd" : "PASSWORD123!",
    "hostname" : "localhost",
    "dbname" : "adventureworks"
}

dst_mysql_args = {
    "uid" : "root",
    "pwd" : "PASSWORD123!",
    "hostname" : "localhost",
    "dbname" : "adventureworks_dw"
}

mongodb_args = {
    "user_name" : "",
    "password" : "password",
    "cluster_name" : "midtermproject",
    "cluster_subnet" : "",
    "cluster_location" : "local", # "local"
    "db_name" : "adventureworks_mongodb"
}

### Declare Functions for Getting Data From and Setting Data Into Databases (MySQL)

In [4]:
def get_sql_dataframe(sql_query, **args):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{args['uid']}:{args['pwd']}@{args['hostname']}/{args['dbname']}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    '''Invoke the pd.read_sql() function to query the database, and fill a Pandas DataFrame.'''
    dframe = pd.read_sql(text(sql_query), connection);
    connection.close()
    
    return dframe

def set_dataframe(df, table_name, pk_column, db_operation, **args):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{args['uid']}:{args['pwd']}@{args['hostname']}/{args['dbname']}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    '''Invoke the Pandas DataFrame .to_sql( ) function to either create, or append to, a table'''
    if db_operation == "insert":
        df.to_sql(table_name, con=connection, index=False, if_exists='replace')
        connection.execute(text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});"))
            
    elif db_operation == "update":
        df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
    connection.close()

### Declare Functions for Getting Data From and Setting Data Into Databases (MongoDB)

In [5]:
def get_mongo_client(**args):
    '''Validate proper input'''
    if args["cluster_location"] not in ['atlas', 'local']:
        raise Exception("You must specify either 'atlas' or 'local' for the cluster_location parameter.")
    
    else:
        if args["cluster_location"] == "atlas":
            connect_str = f"mongodb+srv://{args['user_name']}:{args['password']}@"
            connect_str += f"{args['cluster_name']}.{args['cluster_subnet']}.mongodb.net"
            client = pymongo.MongoClient(connect_str, tlsCAFile=certifi.where())
            
        elif args["cluster_location"] == "local":
            client = pymongo.MongoClient("mongodb://localhost:27017/")
        
    return client


def get_mongo_dataframe(mongo_client, db_name, collection, query):
    '''Query MongoDB, and fill a python list with documents to create a DataFrame'''
    db = mongo_client[db_name]
    dframe = pd.DataFrame(list(db[collection].find(query)))
    dframe.drop(['_id'], axis=1, inplace=True)
    mongo_client.close()
    
    return dframe


def set_mongo_collections(mongo_client, db_name, data_directory, json_files):
    db = mongo_client[db_name]
    
    for file in json_files:
        db.drop_collection(file)
        json_file = os.path.join(data_directory, json_files[file])
        with open(json_file, 'r') as openfile:
            json_object = json.load(openfile)
            file = db[file]
            result = file.insert_many(json_object)
        
    mongo_client.close()

### Create the AdventureWorks Data Warehouse

In [6]:
conn_str = f"mysql+pymysql://{dst_mysql_args['uid']}:{dst_mysql_args['pwd']}@{dst_mysql_args['hostname']}"
sqlEngine = create_engine(conn_str, pool_recycle=3600)
connection = sqlEngine.connect()

connection.execute(text(f"DROP DATABASE IF EXISTS `{dst_mysql_args['dbname']}`;"))
connection.execute(text(f"CREATE DATABASE `{dst_mysql_args['dbname']}`;"))
connection.execute(text(f"USE {dst_mysql_args['dbname']};"))

connection.close()

### ETL Process for MySQL

#### Write a SQL query for the product dimension

In [33]:
sql_products = """
SELECT
    p.ProductID,
    p.Name AS ProductName,
    p.ProductNumber,
    p.MakeFlag,
    p.FinishedGoodsFlag,
    p.Color,
    p.SafetyStockLevel,
    p.ReorderPoint,
    p.StandardCost,
    p.ListPrice,
    p.Size,
    p.SizeUnitMeasureCode,
    p.WeightUnitMeasureCode,
    p.Weight,
    p.DaysToManufacture,
    p.ProductLine,
    p.Class,
    p.Style,
    p.ProductSubcategoryID,
    p.ProductModelID,
    p.SellStartDate,
    p.SellEndDate,
    p.DiscontinuedDate,
    p.rowguid,
    p.ModifiedDate,
    psc.Name AS ProductSubcategoryName,
    pm.Name AS ProductModelName
    
FROM product p
LEFT JOIN productsubcategory psc ON p.ProductSubcategoryID = psc.ProductSubcategoryID
LEFT JOIN productmodel pm ON p.ProductModelID = pm.ProductModelID;
"""

#### Extract step: Create & populate product dimension table

In [34]:
df_products = get_sql_dataframe(sql_products, **src_mysql_args)
df_products.head(2)

Unnamed: 0,ProductID,ProductName,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,...,Style,ProductSubcategoryID,ProductModelID,SellStartDate,SellEndDate,DiscontinuedDate,rowguid,ModifiedDate,ProductSubcategoryName,ProductModelName
0,1,Adjustable Race,AR-5381,b'\x00',b'\x00',,1000,750,0.0,0.0,...,,,,1998-06-01,NaT,,b'\xb7\x15Bi\xf7\x08\rL\xac\xb1\xd74\xbaD\xc0\...,2004-03-11 10:01:36,,
1,2,Bearing Ball,BA-8327,b'\x00',b'\x00',,1000,750,0.0,0.0,...,,,,1998-06-01,NaT,,b' <\xaeX:OIG\xa7\xd4\xd5h\x80l\xc57',2004-03-11 10:01:36,,


#### Transform step: drop, convert types, and reorder columns in product dimension table

In [35]:
# Drop redundant/irrelevant columns
df_products.drop(columns=[
    'DiscontinuedDate', # all null
    'rowguid', # useless
    'ModifiedDate', # all same value
    'ProductSubcategoryID', # redundant foreign key
    'ProductModelID' # redundant foreign key
], inplace=True)

# Reorder columns 
ordered_cols = [
    'ProductID',
    'ProductName',
    'ProductNumber',
    'MakeFlag',
    'FinishedGoodsFlag',
    'Color',
    'SafetyStockLevel',
    'ReorderPoint',
    'StandardCost',
    'ListPrice',
    'Size',
    'SizeUnitMeasureCode',
    'WeightUnitMeasureCode',
    'Weight',
    'DaysToManufacture',
    'ProductLine',
    'Class',
    'Style',
    'ProductSubcategoryName',
    'ProductModelName',
    'SellStartDate',
    'SellEndDate',
]

df_products = df_products[ordered_cols]


# Convert columns of type byte to type int 
def byte_to_int(val):
    if isinstance(val, bytes):
        return int.from_bytes(val, 'big')
    return val
df_products['MakeFlag'] = df_products['MakeFlag'].apply(byte_to_int)
df_products['FinishedGoodsFlag'] = df_products['FinishedGoodsFlag'].apply(byte_to_int)

# Insert a new column, with an ever-incrementing numeric value, to serve as the primary key.
df_products.insert(0, "ProductKey", range(1, df_products.shape[0] + 1))
df_products.head(2)

Unnamed: 0,ProductKey,ProductID,ProductName,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,...,WeightUnitMeasureCode,Weight,DaysToManufacture,ProductLine,Class,Style,ProductSubcategoryName,ProductModelName,SellStartDate,SellEndDate
0,1,1,Adjustable Race,AR-5381,0,0,,1000,750,0.0,...,,,0,,,,,,1998-06-01,NaT
1,2,2,Bearing Ball,BA-8327,0,0,,1000,750,0.0,...,,,0,,,,,,1998-06-01,NaT


#### Load step: create and populate date dimension 

Execute the `Create_Populate_Dim_Date.sql` script to create and populate a date dimension table (`date_dim`) in the AdventureWorks data warehouse. The SQL file must be located in the working directory for this step. 

#### Load step: populate product dimension

In [36]:
db_operation = "insert"
set_dataframe(df_products, table_name='dim_products', pk_column='ProductKey',db_operation=db_operation, **dst_mysql_args)

#### Verify success of ETL operations for date dimension

In [37]:
# Retrieve and display date dimension table from adventureworks data warehouse
sql_dim_date = "SELECT * FROM dim_date;"
df_dim_date = get_sql_dataframe(sql_date, **dst_mysql_args)
df_dim_date.head(2)

Unnamed: 0,date_key,full_date,date_name,date_name_us,date_name_eu,day_of_week,day_name_of_week,day_of_month,day_of_year,weekday_weekend,...,is_last_day_of_month,calendar_quarter,calendar_year,calendar_year_month,calendar_year_qtr,fiscal_month_of_year,fiscal_quarter,fiscal_year,fiscal_year_month,fiscal_year_qtr
0,20000101,2000-01-01,2000/01/01,01/01/2000,01/01/2000,7,Saturday,1,1,Weekend,...,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
1,20000102,2000-01-02,2000/01/02,01/02/2000,02/01/2000,1,Sunday,2,2,Weekend,...,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3


#### Verify success of ETL operations for product dimension

In [38]:
# Retrieve and display product dimension table from adventureworks data warehouse
sql_dim_products = "SELECT * FROM dim_products;"
df_dim_products = get_sql_dataframe(sql_dim_products, **dst_mysql_args)
df_dim_products.head(2)

Unnamed: 0,ProductKey,ProductID,ProductName,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,...,WeightUnitMeasureCode,Weight,DaysToManufacture,ProductLine,Class,Style,ProductSubcategoryName,ProductModelName,SellStartDate,SellEndDate
0,1,1,Adjustable Race,AR-5381,0,0,,1000,750,0.0,...,,,0,,,,,,1998-06-01,NaT
1,2,2,Bearing Ball,BA-8327,0,0,,1000,750,0.0,...,,,0,,,,,,1998-06-01,NaT


### ETL Process for MongoDB - Populate MongoDB with Source Data

#### Write an SQL query for the employee dimension

Note: did not include rowguid in Employee table since it is irrelevant. Also did not include CurrentFlag in Employee table since its value is 1 for all employees. 

In [22]:
sql_dim_employees = """
SELECT
    e.EmployeeID,
    e.NationalIDNumber,
    e.ContactID,
    e.LoginID,
    e.ManagerID,
    e.Title AS EmployeeTitle,
    e.BirthDate,
    e.MaritalStatus,
    e.Gender,
    e.HireDate,
    e.SalariedFlag,
    e.VacationHours,
    e.SickLeaveHours,
    e.ModifiedDate AS EmployeeModifiedDate,

    c.FirstName,
    c.MiddleName,
    c.LastName,
    c.EmailAddress,
    c.Phone,
    c.ModifiedDate AS ContactModifiedDate,

    ea.AddressID,
    ea.ModifiedDate AS EmployeeAddressModifiedDate,

    a.AddressLine1,
    a.AddressLine2,
    a.City,
    a.PostalCode,
    a.ModifiedDate AS AddressModifiedDate,

    edh.DepartmentID,
    edh.ShiftID,
    edh.StartDate AS DeptStartDate,
    edh.EndDate AS DeptEndDate,
    edh.ModifiedDate AS DeptHistModifiedDate,

    d.Name AS DeptName,
    d.GroupName,
    d.ModifiedDate AS DeptModifiedDate,
    
    s.Name AS ShiftName,
    s.StartTime AS ShiftStartTime,
    s.EndTime AS ShiftEndTime,
    s.ModifiedDate AS ShiftModifiedDate,

    eph.RateChangeDate,
    eph.Rate,
    eph.PayFrequency,
    eph.ModifiedDate AS PayHistModifiedDate

FROM employee e
LEFT JOIN contact c
    ON e.ContactID = c.ContactID
LEFT JOIN employeeaddress ea
    ON e.EmployeeID = ea.EmployeeID
LEFT JOIN address a
    ON ea.AddressID = a.AddressID
LEFT JOIN employeedepartmenthistory edh
    ON e.EmployeeID = edh.EmployeeID
LEFT JOIN department d
    ON edh.DepartmentID = d.DepartmentID
LEFT JOIN shift s
    ON edh.ShiftID = s.ShiftID
LEFT JOIN employeepayhistory eph
    ON e.EmployeeID = eph.EmployeeID
"""

#### Extract step: get employee data from MySQL

In [23]:
df_employee = get_sql_dataframe(sql_dim_employees, **src_mysql_args)
df_employee.head(2)

Unnamed: 0,EmployeeID,NationalIDNumber,ContactID,LoginID,ManagerID,EmployeeTitle,BirthDate,MaritalStatus,Gender,HireDate,...,GroupName,DeptModifiedDate,ShiftName,ShiftStartTime,ShiftEndTime,ShiftModifiedDate,RateChangeDate,Rate,PayFrequency,PayHistModifiedDate
0,1,14417807,1209,adventure-works\guy1,16.0,Production Technician - WC60,1972-05-15,M,M,1996-07-31,...,Manufacturing,1998-06-01,Day,1900-01-01 07:00:00,1900-01-01 15:00:00,1998-06-01,1996-07-31,12.45,1,2004-07-31
1,2,253022876,1030,adventure-works\kevin0,6.0,Marketing Assistant,1977-06-03,S,M,1997-02-26,...,Sales and Marketing,1998-06-01,Day,1900-01-01 07:00:00,1900-01-01 15:00:00,1998-06-01,1997-02-26,13.4615,2,2004-07-31


#### Validate dataframe columns for JSON conversion 
This was a troubleshooting step, since there was initially an error that some column(s) cannot be encoded into UTF-8 when converting the dataframe to JSON. Without being compatible for UTF-8 encoding, the dataframe cannot be converted into a JSON file. So, this code iterates through the columns to find which ones are problematic.

In [24]:
for col in df_employee.select_dtypes(include=['object']).columns:
    for value in df_employee[col]:
        if value is None:
            continue
        try:
            value.encode('utf-8')
        except Exception as e:
            print(f"Error in column '{col}' for value '{value}'")
            break # proceed to check next column 

Error in column 'SalariedFlag' for value 'b'\x00''


#### Transform step: convert byte values in SalariedFlag into Integers
The problematic column was found to be `SalariedFlag`, which had at least one value of type `byte`. This code iterates through the values in `SalariedFlag` and converts the byte types into integers so they are compatible with JSON.

In [25]:
def byte_to_int(val):
    if isinstance(val, bytes):
        return int.from_bytes(val, 'big')
    return val
df_employee['SalariedFlag'] = df_employee['SalariedFlag'].apply(byte_to_int)

#### Export dataframe resulting from SQL Query to JSON 

In [26]:
file = os.path.join(os.getcwd(), 'adventureworks_employee.json')
df_employee.to_json(file, orient='records')

#### Load step: Upload JSON data of employee dimension into MongoDB

In [27]:
client = get_mongo_client(**mongodb_args)
data_dir = os.getcwd()
set_mongo_collections(client, mongodb_args["db_name"], data_dir, {"employees":"adventureworks_employee.json"})      

### ETL Process for MongoDB - Create and Populate Employee Dimension Table

#### Extract step: get data from the source MongoDB collection into a DataFrame

In [28]:
client = get_mongo_client(**mongodb_args)

query = {} # Select all elements (columns), and all documents (rows).
collection = "employees"

df_employee = get_mongo_dataframe(client, mongodb_args["db_name"], collection, query)
df_employee.head(2)

Unnamed: 0,EmployeeID,NationalIDNumber,ContactID,LoginID,ManagerID,EmployeeTitle,BirthDate,MaritalStatus,Gender,HireDate,...,GroupName,DeptModifiedDate,ShiftName,ShiftStartTime,ShiftEndTime,ShiftModifiedDate,RateChangeDate,Rate,PayFrequency,PayHistModifiedDate
0,1,14417807,1209,adventure-works\guy1,16.0,Production Technician - WC60,74736000000,M,M,838771200000,...,Manufacturing,896659200000,Day,-2208963600000,-2208934800000,896659200000,838771200000,12.45,1,1091232000000
1,2,253022876,1030,adventure-works\kevin0,6.0,Marketing Assistant,234144000000,S,M,856915200000,...,Sales and Marketing,896659200000,Day,-2208963600000,-2208934800000,896659200000,856915200000,13.4615,2,1091232000000


#### Transform step: drop/rename/reorder columns in employees dimension 

In [29]:
# Drop redundant/irrelevant columns
df_employee.drop(columns=[
    'ContactID',
    'AddressID', 
    'DepartmentID', 
    'ShiftID', 
    'AddressModifiedDate',
    'DeptModifiedDate', 
    'ShiftModifiedDate'
], inplace=True)

df_employee = df_employee.rename(columns={
    'EmployeeAddressModifiedDate': 'AddressModifiedDate',
    'DeptHistModifiedDate': 'DeptModifiedDate',
})


ordered_cols = [
    "EmployeeID",
    "NationalIDNumber",
    "LoginID",
    "ManagerID",
    "EmployeeTitle",
    "BirthDate",
    "MaritalStatus",
    "Gender",
    "HireDate",
    "SalariedFlag",
    "VacationHours",
    "SickLeaveHours",
    "EmployeeModifiedDate",
    "FirstName",
    "MiddleName",
    "LastName",
    "EmailAddress",
    "Phone",
    "ContactModifiedDate",
    "AddressLine1",
    "AddressLine2",
    "City",
    "PostalCode",
    "AddressModifiedDate",
    "DeptName",
    "GroupName",
    "DeptStartDate",
    "DeptEndDate",
    "DeptModifiedDate",
    "ShiftName",
    "ShiftStartTime",
    "ShiftEndTime",
    "Rate",
    "RateChangeDate",
    "PayFrequency",
    "PayHistModifiedDate"
]

df_employee = df_employee[ordered_cols]

# 3. Insert a new column, with an ever-incrementing numeric value, to serve as the primary key.
df_employee.insert(0, "EmployeeKey", range(1, df_employee.shape[0] + 1))
df_employee.head(2)

Unnamed: 0,EmployeeKey,EmployeeID,NationalIDNumber,LoginID,ManagerID,EmployeeTitle,BirthDate,MaritalStatus,Gender,HireDate,...,DeptStartDate,DeptEndDate,DeptModifiedDate,ShiftName,ShiftStartTime,ShiftEndTime,Rate,RateChangeDate,PayFrequency,PayHistModifiedDate
0,1,1,14417807,adventure-works\guy1,16.0,Production Technician - WC60,74736000000,M,M,838771200000,...,838771200000,,838684800000,Day,-2208963600000,-2208934800000,12.45,838771200000,1,1091232000000
1,2,2,253022876,adventure-works\kevin0,6.0,Marketing Assistant,234144000000,S,M,856915200000,...,856915200000,,856828800000,Day,-2208963600000,-2208934800000,13.4615,856915200000,2,1091232000000


#### Load step: load the transformed dataframe into AdventureWorks data warehouse by creating a new table

In [30]:
dataframe = df_employee
table_name = 'dim_employees'
primary_key = 'EmployeeKey'
db_operation = "insert"

set_dataframe(dataframe, table_name, primary_key, db_operation, **dst_mysql_args)

#### Validate that employee dimension table was created

In [31]:
sql_employees = "SELECT * FROM adventureworks_dw.dim_employees;"
df_dim_employees = get_sql_dataframe(sql_employees, **dst_mysql_args)
df_dim_employees.head(2)

Unnamed: 0,EmployeeKey,EmployeeID,NationalIDNumber,LoginID,ManagerID,EmployeeTitle,BirthDate,MaritalStatus,Gender,HireDate,...,DeptStartDate,DeptEndDate,DeptModifiedDate,ShiftName,ShiftStartTime,ShiftEndTime,Rate,RateChangeDate,PayFrequency,PayHistModifiedDate
0,1,1,14417807,adventure-works\guy1,16.0,Production Technician - WC60,74736000000,M,M,838771200000,...,838771200000,,838684800000,Day,-2208963600000,-2208934800000,12.45,838771200000,1,1091232000000
1,2,2,253022876,adventure-works\kevin0,6.0,Marketing Assistant,234144000000,S,M,856915200000,...,856915200000,,856828800000,Day,-2208963600000,-2208934800000,13.4615,856915200000,2,1091232000000


### ETL Process for CSV file

#### Write an SQL query for the vendor dimension

In [12]:
sql_dim_vendors = """
SELECT
    v.VendorID,
    v.AccountNumber,
    v.Name AS VendorName,
    v.CreditRating,
    v.PreferredVendorStatus,
    v.ActiveFlag,
    v.PurchasingWebServiceURL,
    v.ModifiedDate AS VendorModifiedDate,

    va.AddressID,
    va.AddressTypeID,
    va.ModifiedDate AS VendorAddressModifiedDate,

    a.AddressLine1,
    a.AddressLine2,
    a.City,
    a.PostalCode,
    a.ModifiedDate AS AddressModifiedDate,

    at.Name AS AddressTypeName,

    vc.ContactID,
    vc.ContactTypeID,
    vc.ModifiedDate AS VendorContactModifiedDate,

    c.FirstName,
    c.MiddleName,
    c.LastName,
    c.EmailAddress,
    c.Phone,
    c.ModifiedDate AS ContactModifiedDate,

    ct.Name AS ContactTypeName

FROM vendor v
LEFT JOIN vendoraddress va 
       ON v.VendorID = va.VendorID
LEFT JOIN address a 
       ON va.AddressID = a.AddressID
LEFT JOIN addresstype at
       ON va.AddressTypeID = at.AddressTypeID
LEFT JOIN vendorcontact vc
       ON v.VendorID = vc.VendorID
LEFT JOIN contact c
       ON vc.ContactID = c.ContactID
LEFT JOIN contacttype ct
       ON vc.ContactTypeID = ct.ContactTypeID;
"""

#### Extract step: get vendor data from MySQL

In [13]:
df_vendor = get_sql_dataframe(sql_dim_vendors, **src_mysql_args)
df_vendor.head(2)

Unnamed: 0,VendorID,AccountNumber,VendorName,CreditRating,PreferredVendorStatus,ActiveFlag,PurchasingWebServiceURL,VendorModifiedDate,AddressID,AddressTypeID,...,ContactID,ContactTypeID,VendorContactModifiedDate,FirstName,MiddleName,LastName,EmailAddress,Phone,ContactModifiedDate,ContactTypeName
0,1,INTERNAT0001,International,1,b'\x01',b'\x01',,2002-02-25,357,3,...,610,2,2002-02-25,Julia,,Moseley,julia0@adventure-works.com,432-555-0100,2002-02-25,Assistant Sales Agent
1,2,ELECTRON0002,Electronic Bike Repair & Supplies,1,b'\x01',b'\x01',,2002-02-17,335,3,...,678,2,2002-02-17,Sean,,Purcell,sean5@adventure-works.com,130-555-0100,2002-02-17,Assistant Sales Agent


#### Transform step: drop, rename, and reorder columns in vendor dimension

In [14]:
# Drop redundant/irrelevant columns
df_vendor.drop(columns=[
    'PurchasingWebServiceURL',
    'AddressID',
    'AddressTypeID',
    'AddressModifiedDate',
    'ContactID',
    'ContactTypeID',
    'ContactModifiedDate'
], inplace=True)

# Rename columns 
df_vendor = df_vendor.rename(columns={
    'VendorContactModifiedDate': 'ContactModifiedDate',
    'VendorAddressModifiedDate': 'AddressModifiedDate',
    'ConactTypeName':'ContactType'
})

# Reorder columns
ordered_cols = [
    "VendorID",
    "AccountNumber",
    "VendorName",
    "CreditRating",
    "PreferredVendorStatus",
    "ActiveFlag",
    "VendorModifiedDate",
    "AddressLine1",
    "AddressLine2",
    "City",
    "PostalCode",
    "AddressModifiedDate",
    "FirstName",
    "MiddleName",
    "LastName",
    "EmailAddress",
    "Phone",
    "ContactTypeName",
    "ContactModifiedDate"
]

df_vendor = df_vendor[ordered_cols]

# 3. Insert a new column, with an ever-incrementing numeric value, to serve as the primary key.
df_vendor.insert(0, "VendorKey", range(1, df_vendor.shape[0] + 1))
df_vendor.head(2)

Unnamed: 0,VendorKey,VendorID,AccountNumber,VendorName,CreditRating,PreferredVendorStatus,ActiveFlag,VendorModifiedDate,AddressLine1,AddressLine2,City,PostalCode,AddressModifiedDate,FirstName,MiddleName,LastName,EmailAddress,Phone,ContactTypeName,ContactModifiedDate
0,1,1,INTERNAT0001,International,1,b'\x01',b'\x01',2002-02-25,683 Larch Ct.,,Salt Lake City,84101,2002-02-25,Julia,,Moseley,julia0@adventure-works.com,432-555-0100,Assistant Sales Agent,2002-02-25
1,2,2,ELECTRON0002,Electronic Bike Repair & Supplies,1,b'\x01',b'\x01',2002-02-17,8547 Catherine Way,,Tacoma,98403,2002-02-17,Sean,,Purcell,sean5@adventure-works.com,130-555-0100,Assistant Sales Agent,2002-02-17


#### Export data resulting from SQL query to CSV

In [15]:
df_vendor.to_csv('dim_vendors.csv', index=False)

#### Load step: Upload CSV data of vendor dimension into AdventureWorks data warehouse

In [16]:
df_dim_vendors = pd.read_csv('dim_vendors.csv')
dataframe = df_dim_vendors
table_name = 'dim_vendors'
primary_key = 'VendorKey'
db_operation = "insert"

set_dataframe(dataframe, table_name, primary_key, db_operation, **dst_mysql_args)

#### Validate that vendor dimension table was created

In [17]:
sql_vendors = "SELECT * FROM adventureworks_dw.dim_vendors;"
df_dim_verify_vendors = get_sql_dataframe(sql_vendors, **dst_mysql_args)
df_dim_verify_vendors.head(2)

Unnamed: 0,VendorKey,VendorID,AccountNumber,VendorName,CreditRating,PreferredVendorStatus,ActiveFlag,VendorModifiedDate,AddressLine1,AddressLine2,City,PostalCode,AddressModifiedDate,FirstName,MiddleName,LastName,EmailAddress,Phone,ContactTypeName,ContactModifiedDate
0,1,1,INTERNAT0001,International,1,b'\x01',b'\x01',2002-02-25,683 Larch Ct.,,Salt Lake City,84101,2002-02-25,Julia,,Moseley,julia0@adventure-works.com,432-555-0100,Assistant Sales Agent,2002-02-25
1,2,2,ELECTRON0002,Electronic Bike Repair & Supplies,1,b'\x01',b'\x01',2002-02-17,8547 Catherine Way,,Tacoma,98403,2002-02-17,Sean,,Purcell,sean5@adventure-works.com,130-555-0100,Assistant Sales Agent,2002-02-17


### Create, Populate, and Extract Fact Table (fact_purchaseorders)

#### Fill dataframes for the source tables needed to create fact_purchaseorders

In [39]:
# SELECT all columns from the adventureworks.purchaseorderdetail table to create the "df_po_detail" dataframe
sql_po_detail = "SELECT * FROM adventureworks.purchaseorderdetail;"
df_po_detail = get_sql_dataframe(sql_po_detail, **src_mysql_args)
# Display first two rows of the DataFrame to validate work
df_po_detail.head(2)

Unnamed: 0,PurchaseOrderID,PurchaseOrderDetailID,DueDate,OrderQty,ProductID,UnitPrice,LineTotal,ReceivedQty,RejectedQty,StockedQty,ModifiedDate
0,1,1,2001-05-31,4,1,50.26,201.04,3.0,0.0,3.0,2001-05-24
1,2,2,2001-05-31,3,359,45.12,135.36,3.0,0.0,3.0,2001-05-24
