In [115]:
# pip install --upgrade snowflake-connector-python
# pip install --upgrade snowflake-sqlalchemy



In [23]:
import os
import snowflake.connector
import glob

In [129]:
# Connect to Snowflake.
# Credentials are read from the environment so that no secret is stored in the
# notebook or its version history. Set SNOWFLAKE_USER, SNOWFLAKE_PASSWORD and
# SNOWFLAKE_ACCOUNT before starting the kernel; a missing variable raises
# KeyError right here instead of surfacing later as an authentication failure.
# NOTE: the password that used to be hardcoded in this cell should be rotated.
conn = snowflake.connector.connect(
    user=os.environ['SNOWFLAKE_USER'],
    password=os.environ['SNOWFLAKE_PASSWORD'],
    account=os.environ['SNOWFLAKE_ACCOUNT'],
    warehouse='final_project_warehouse',
    database='final_project_db',
    schema='final_project_schema'
)

# Cursor used by the cells below to execute SQL statements.
cs = conn.cursor()

In [27]:
# Ensure the virtual warehouse exists. Virtual warehouses contain the compute
# resources required to run queries and DML operations in Snowflake.
create_warehouse_sql = "CREATE WAREHOUSE IF NOT EXISTS final_project_warehouse"
cs.execute(create_warehouse_sql)

<snowflake.connector.cursor.SnowflakeCursor at 0x7f9336fbdd10>

In [9]:
# Ensure the project database exists (no-op when it is already there).
create_database_sql = "CREATE DATABASE IF NOT EXISTS final_project_db"
cs.execute(create_database_sql)

<snowflake.connector.cursor.SnowflakeCursor at 0x7f93376a0e90>

In [10]:
# Ensure the project schema exists (no-op when it is already there).
create_schema_sql = "CREATE SCHEMA IF NOT EXISTS final_project_schema"
cs.execute(create_schema_sql)

<snowflake.connector.cursor.SnowflakeCursor at 0x7f93376a0e90>

In [11]:
# Name of the Snowflake stage that holds the purchase-order files; later cells
# reference it through this variable.
stage_name = "purchases_stage"

# (Re)create the stage under that name.
create_stage_sql = "CREATE OR REPLACE STAGE " + stage_name
cs.execute(create_stage_sql)

<snowflake.connector.cursor.SnowflakeCursor at 0x7f93376a0e90>

In [21]:
# Folder that is scanned for *.csv files by the upload cell below.
# NOTE(review): this is an absolute path under one user's home directory;
# adjust it when running the notebook on another machine.
csv_folder_path = '/home/jovyan/MGTA_464/data/Data_unzipped/Data/Monthly_PO_Data'

In [39]:
# Collect every CSV file in the source folder. sorted() makes the upload order
# (and the progress log) deterministic instead of filesystem-dependent.
csv_paths = sorted(glob.glob(os.path.join(csv_folder_path, "*.csv")))
if not csv_paths:
    # Stop before CREATE OR REPLACE TABLE below replaces purchases_data with an
    # empty table because nothing was found to load.
    raise FileNotFoundError(f"No CSV files found in {csv_folder_path}")

# Upload each file to the stage
for file_path in csv_paths:
    file_name = os.path.basename(file_path)

    # Upload the file to the Snowflake stage
    put_query = f"PUT file://{file_path} @{stage_name}/{file_name}"
    cs.execute(put_query)
    print(f"Uploaded {file_name} to Snowflake stage")

# Create a target table in Snowflake with proper data types
cs.execute("""
    CREATE OR REPLACE TABLE purchases_data (
        PurchaseOrderID INTEGER,
        SupplierID INTEGER,
        OrderDate DATE,
        DeliveryMethodID INTEGER,
        ContactPersonID INTEGER,
        ExpectedDeliveryDate DATE,
        SupplierReference STRING,
        IsOrderFinalized BOOLEAN,
        Comments STRING,
        InternalComments STRING,
        LastEditedBy INTEGER,
        LastEditedWhen TIMESTAMP,
        PurchaseOrderLineID INTEGER,
        StockItemID INTEGER,
        OrderedOuters INTEGER,
        Description STRING,
        ReceivedOuters INTEGER,
        PackageTypeID INTEGER,
        ExpectedUnitPricePerOuter FLOAT,
        LastReceiptDate DATE,
        IsOrderLineFinalized BOOLEAN,
        Right_LastEditedBy INTEGER,
        Right_LastEditedWhen TIMESTAMP
    );
""")
print("Created table purchases_data")

# Use COPY INTO to move data from the stage to the Snowflake table, with correct formatting and error handling
copy_into_query = f"""
COPY INTO purchases_data
FROM @{stage_name}
FILE_FORMAT = (
    TYPE = 'CSV',
    FIELD_OPTIONALLY_ENCLOSED_BY = '"',
    SKIP_HEADER = 1,
    DATE_FORMAT = 'MM/DD/YYYY',
    TIMESTAMP_FORMAT = 'MM/DD/YYYY HH24:MI'
)
ON_ERROR = 'CONTINUE';
"""
cs.execute(copy_into_query)

# ON_ERROR = 'CONTINUE' skips rows that fail to load instead of aborting, so a
# partially loaded file would otherwise go unnoticed. Read the per-file result
# set of the COPY and report every file for which errors were recorded.
copy_columns = [column[0].lower() for column in (cs.description or [])]
copy_results = [dict(zip(copy_columns, result_row)) for result_row in cs.fetchall()]
for copy_result in copy_results:
    if copy_result.get("errors_seen"):
        print(
            f"WARNING: {copy_result.get('file')}: "
            f"{copy_result.get('errors_seen')} row(s) rejected; "
            f"first error: {copy_result.get('first_error')}"
        )
print("Data copied from stage to purchases_data table")

# Optionally, you can remove the files from the stage after loading them to the table
cs.execute(f"REMOVE @{stage_name}")
print(f"Removed files from stage {stage_name}")

Uploaded 2020-2.csv to Snowflake stage
Uploaded 2021-3.csv to Snowflake stage
Uploaded 2020-5.csv to Snowflake stage
Uploaded 2021-11.csv to Snowflake stage
Uploaded 2022-1.csv to Snowflake stage
Uploaded 2019-8.csv to Snowflake stage
Uploaded 2021-7.csv to Snowflake stage
Uploaded 2021-10.csv to Snowflake stage
Uploaded 2019-6.csv to Snowflake stage
Uploaded 2021-8.csv to Snowflake stage
Uploaded 2020-10.csv to Snowflake stage
Uploaded 2020-8.csv to Snowflake stage
Uploaded 2021-1.csv to Snowflake stage
Uploaded 2019-4.csv to Snowflake stage
Uploaded 2019-12.csv to Snowflake stage
Uploaded 2022-4.csv to Snowflake stage
Uploaded 2020-11.csv to Snowflake stage
Uploaded 2019-10.csv to Snowflake stage
Uploaded 2020-12.csv to Snowflake stage
Uploaded 2020-3.csv to Snowflake stage
Uploaded 2020-1.csv to Snowflake stage
Uploaded 2019-2.csv to Snowflake stage
Uploaded 2019-11.csv to Snowflake stage
Uploaded 2021-2.csv to Snowflake stage
Uploaded 2019-3.csv to Snowflake stage
Uploaded 2019-9.c

In [None]:
# Create a cursor object
cs = conn.cursor()

# Add the POAmount column.
# IF NOT EXISTS keeps this cell re-runnable: without it, running the cell a
# second time fails because the column is already present.
cs.execute("""
    ALTER TABLE purchases_data
    ADD COLUMN IF NOT EXISTS POAmount FLOAT;
""")
print("Added POAmount column")

In [41]:
# Fill POAmount on every row of purchases_data with the product of
# ReceivedOuters and ExpectedUnitPricePerOuter.
update_po_amount_sql = """
    UPDATE purchases_data
    SET POAmount = ReceivedOuters * ExpectedUnitPricePerOuter;
"""
cs.execute(update_po_amount_sql)
print("Updated POAmount values")

Updated POAmount values


In [91]:
# Load the supplier-transactions XML file into Snowflake and shred it into a
# typed table. The SQL for each step is defined first; the steps are then run
# strictly in the order listed, printing a progress message after each one.
#
# NOTE(review): the preview cell that follows uses the same FLATTEN path and
# shows no output in this notebook, and supplier_transactions is later loaded
# from a CSV export instead. Confirm that xml_data:"root"."row" really matches
# the structure of the parsed XML before relying on the shredding step.

# Step 2: Create a Stage for XML File
create_xml_stage_sql = "CREATE OR REPLACE STAGE supplier_stage"

# Step 3: Upload XML File into the Stage
put_xml_sql = "PUT file:///home/jovyan/MGTA_464/data/Data_unzipped/Data/SupplierTransactionsXML.xml @supplier_stage"

# Step 4: Create Raw Table to Hold XML Data (a single VARIANT column)
create_raw_table_sql = """
    CREATE OR REPLACE TABLE supplier_transactions_raw (xml_data VARIANT)
"""

# Step 5: Copy XML into Raw Table
copy_xml_sql = """
    COPY INTO supplier_transactions_raw
    FROM @supplier_stage FILE_FORMAT = (TYPE = 'XML')
"""

# Step 6: Create the Final Table for Shredded Data
create_final_table_sql = """
    CREATE OR REPLACE TABLE supplier_transactions (
        SupplierTransactionID STRING, SupplierID STRING, TransactionTypeID STRING,
        PurchaseOrderID STRING, PaymentMethodID STRING, SupplierInvoiceNumber STRING,
        TransactionDate DATE, AmountExcludingTax FLOAT, TaxAmount FLOAT,
        TransactionAmount FLOAT, OutstandingBalance FLOAT, FinalizationDate DATE,
        IsFinalized BOOLEAN, LastEditedBy STRING, LastEditedWhen TIMESTAMP
    )
"""

# Step 7: Shred XML Data Using LATERAL FLATTEN in Snowflake
shred_xml_sql = """
    INSERT INTO supplier_transactions
    SELECT 
        value:SupplierTransactionID::STRING AS SupplierTransactionID,
        value:SupplierID::STRING AS SupplierID,
        value:TransactionTypeID::STRING AS TransactionTypeID,
        value:PurchaseOrderID::STRING AS PurchaseOrderID,
        value:PaymentMethodID::STRING AS PaymentMethodID,
        value:SupplierInvoiceNumber::STRING AS SupplierInvoiceNumber,
        TO_DATE(value:TransactionDate::STRING, 'YYYY-MM-DD') AS TransactionDate,
        value:AmountExcludingTax::FLOAT AS AmountExcludingTax,
        value:TaxAmount::FLOAT AS TaxAmount,
        value:TransactionAmount::FLOAT AS TransactionAmount,
        value:OutstandingBalance::FLOAT AS OutstandingBalance,
        TO_DATE(value:FinalizationDate::STRING, 'YYYY-MM-DD') AS FinalizationDate,
        value:IsFinalized::BOOLEAN AS IsFinalized,
        value:LastEditedBy::STRING AS LastEditedBy,
        TO_TIMESTAMP(value:LastEditedWhen::STRING) AS LastEditedWhen
    FROM supplier_transactions_raw,
    LATERAL FLATTEN(input => supplier_transactions_raw.xml_data:"root"."row")
"""

# (statement, progress message) pairs, in execution order.
xml_load_steps = [
    (create_xml_stage_sql, "Stage created"),
    (put_xml_sql, "File uploaded to Snowflake stage"),
    (create_raw_table_sql, "Raw table created"),
    (copy_xml_sql, "XML data loaded into raw table"),
    (create_final_table_sql, "Final table created"),
    (shred_xml_sql, "Data shredded and inserted into the final table"),
]
for step_sql, step_message in xml_load_steps:
    cs.execute(step_sql)
    print(step_message)

Stage created
File uploaded to Snowflake stage
Raw table created
XML data loaded into raw table
Final table created
Data shredded and inserted into the final table


In [95]:
# Preview query: the first three shredded fields for at most ten entries
# produced by flattening the raw XML column.
preview_sql = """
    SELECT 
        value:SupplierTransactionID::STRING AS SupplierTransactionID,
        value:SupplierID::STRING AS SupplierID,
        value:TransactionTypeID::STRING AS TransactionTypeID
    FROM supplier_transactions_raw,
    LATERAL FLATTEN(input => supplier_transactions_raw.xml_data:"root"."row")
    LIMIT 10;
"""
cs.execute(preview_sql)

# Print one result tuple per line (prints nothing when the result is empty).
rows = cs.fetchall()
for preview_row in rows:
    print(preview_row)


In [None]:
# # query the table
# cs.execute('SELECT * FROM purchases_data LIMIT 10')
# print(cs.fetchmany(2))

In [None]:
# cs.execute('SELECT * FROM purchases_data LIMIT 100')
# result = cs.fetchall()
# for row in result:
#     print(row)


In [None]:
# # to use a different warehouse
# cs.execute("USE WAREHOUSE different_warehouse_mg")

In [11]:
# # create table and insert data
# cs.execute("CREATE OR REPLACE TABLE test_table(col1 integer, col2 string)")
# cs.execute("INSERT INTO test_table(col1, col2) VALUES (123, 'test string1'), (456, 'test string2')")


<snowflake.connector.cursor.SnowflakeCursor at 0x7f11b7eca410>

In [99]:
import xml.etree.ElementTree as ET
import csv

# Child elements exported from every <row>, in output-column order. The same
# list drives both the CSV header and the per-row values so they cannot drift
# apart.
SUPPLIER_TRANSACTION_FIELDS = [
    'SupplierTransactionID', 'SupplierID', 'TransactionTypeID', 'PurchaseOrderID',
    'PaymentMethodID', 'SupplierInvoiceNumber', 'TransactionDate',
    'AmountExcludingTax', 'TaxAmount', 'TransactionAmount',
    'OutstandingBalance', 'FinalizationDate', 'IsFinalized',
    'LastEditedBy', 'LastEditedWhen'
]

# Parse the XML file
tree = ET.parse('/home/jovyan/MGTA_464/data/Data_unzipped/Data/SupplierTransactionsXML.xml')
root = tree.getroot()

# Create CSV file. newline='' is what the csv module expects for files it
# writes; the encoding is pinned so the output does not depend on the locale.
with open('SupplierTransactions.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(SUPPLIER_TRANSACTION_FIELDS)

    for row in root.findall('row'):
        # findtext() returns None when a child element is absent, which the
        # csv writer emits as an empty field. The previous row.find(...).text
        # raised AttributeError on the first row missing any element.
        writer.writerow([row.findtext(field) for field in SUPPLIER_TRANSACTION_FIELDS])


In [100]:
# Upload the generated CSV to supplier_stage, then copy it into
# supplier_transactions, skipping the header row.
put_csv_sql = "PUT file://SupplierTransactions.csv @supplier_stage"  # For CSV
copy_csv_sql = """
    COPY INTO supplier_transactions
    FROM @supplier_stage/SupplierTransactions.csv
    FILE_FORMAT = (TYPE = 'CSV', SKIP_HEADER = 1);
"""
for load_sql in (put_csv_sql, copy_csv_sql):
    cs.execute(load_sql)
print("Data loaded from CSV file")



Data loaded from CSV file


In [101]:
# Pull a ten-row sample from supplier_transactions; execute() hands back the
# cursor, so the fetch can be chained onto it.
rows = cs.execute("SELECT * FROM supplier_transactions LIMIT 10").fetchall()

# Show each sampled row on its own line
for sample_row in rows:
    print(sample_row)


('134', '2', '5', '1', '4', '7290', datetime.date(2019, 1, 2), 313.5, 47.03, 360.53, 0.0, datetime.date(2019, 1, 7), True, '4', datetime.datetime(2019, 1, 7, 9, 0))
('169', '4', '5', '2', '4', '3898', datetime.date(2019, 1, 2), 21732.0, 3259.8, 24991.8, 0.0, datetime.date(2019, 1, 7), True, '4', datetime.datetime(2019, 1, 7, 9, 0))
('186', '5', '5', '3', '4', '616', datetime.date(2019, 1, 2), 2740.5, 411.11, 3151.61, 0.0, datetime.date(2019, 1, 7), True, '4', datetime.datetime(2019, 1, 7, 9, 0))
('215', '7', '5', '4', '4', '3869', datetime.date(2019, 1, 2), 42481.2, 6372.19, 48853.39, 0.0, datetime.date(2019, 1, 7), True, '4', datetime.datetime(2019, 1, 7, 9, 0))
('224', '10', '5', '5', '4', '4697', datetime.date(2019, 1, 2), 35067.5, 5260.14, 40327.64, 0.0, datetime.date(2019, 1, 7), True, '4', datetime.datetime(2019, 1, 7, 9, 0))
('230', '12', '5', '6', '4', '1375', datetime.date(2019, 1, 2), 5528.5, 829.28, 6357.78, 0.0, datetime.date(2019, 1, 7), True, '4', datetime.datetime(2019, 

In [123]:
import psycopg2
import csv

def export_postgres_to_csv(db_conn_str, query, output_file):
    """Run ``query`` on Postgres and write its result to ``output_file`` as CSV.

    Args:
        db_conn_str: Connection string passed to ``psycopg2.connect``.
        query: SELECT statement to export. It is interpolated verbatim into a
            ``COPY (...) TO STDOUT`` command, so it must come from a trusted
            source (never from user input).
        output_file: Path of the CSV file to create or overwrite; the file
            starts with a header row.

    Raises:
        Whatever ``psycopg2`` or ``open`` raise. The cursor and the connection
        are closed in every case, so a failed export does not leak a database
        session.
    """
    conn = psycopg2.connect(db_conn_str)
    try:
        cursor = conn.cursor()
        try:
            with open(output_file, 'w') as f:
                cursor.copy_expert(f"COPY ({query}) TO STDOUT WITH CSV HEADER", f)
        finally:
            cursor.close()
    finally:
        conn.close()
    # Only reached when the export succeeded.
    print(f"Data exported to {output_file}")

# Connection string to your Postgres database.
# The password is read from the PGPASSWORD environment variable instead of
# being hardcoded in the notebook; a missing variable raises KeyError here.
# Backslashes and single quotes are escaped because the value is embedded in a
# single-quoted field of the connection string.
postgres_password = os.environ['PGPASSWORD'].replace("\\", "\\\\").replace("'", "\\'")
postgres_conn_str = (
    "dbname='WestCoastImporters' user='jovyan' host='127.0.0.1' port='8765' "
    f"password='{postgres_password}'"
)

# SQL query to extract the data from the table
query = "SELECT * FROM supplier_case"

# Export data to CSV
export_postgres_to_csv(postgres_conn_str, query, "supplier_case.csv")


Data exported to supplier_case.csv


In [124]:
# Stage and local file for the supplier_case load; the COPY cell further down
# reads both variables.
# NOTE(review): this rebinds stage_name, which the purchase-order cells also
# use (there it is "purchases_stage") — re-running those cells after this one
# would target supplier_stage instead; confirm that is acceptable.
stage_name, csv_file = "supplier_stage", "supplier_case.csv"

# Upload the CSV file to the Snowflake stage
put_supplier_case_sql = "PUT file://" + csv_file + " @" + stage_name
cs.execute(put_supplier_case_sql)
print("Uploaded " + csv_file + " to Snowflake stage")


Uploaded supplier_case.csv to Snowflake stage


In [125]:
# DDL for the supplier_case target table. Every column is declared STRING
# except paymentdays, which is INT. The table must exist before data is loaded.
create_supplier_case_sql = """
    CREATE OR REPLACE TABLE supplier_case (
        supplierid STRING,
        suppliername STRING,
        suppliercategoryid STRING,
        primarycontactpersonid STRING,
        alternatecontactpersonid STRING,
        deliverymethodid STRING,
        postalcityid STRING,
        supplierreference STRING,
        bankaccountname STRING,
        bankaccountbranch STRING,
        bankaccountcode STRING,
        bankaccountnumber STRING,
        bankinternationalcode STRING,
        paymentdays INT,
        internalcomments STRING,
        phonenumber STRING,
        faxnumber STRING,
        websiteurl STRING,
        deliveryaddressline1 STRING,
        deliveryaddressline2 STRING,
        deliverypostalcode STRING,
        deliverylocation STRING,
        postaladdressline1 STRING,
        postaladdressline2 STRING,
        postalpostalcode STRING,
        lasteditedby STRING,
        validfrom STRING,
        validto STRING
    )
"""
cs.execute(create_supplier_case_sql)

print("Created supplier_case table in Snowflake")

Created supplier_case table in Snowflake


In [127]:
# Named CSV file format: fields may be wrapped in double quotes, the header row
# is skipped, and empty strings are loaded as NULL.
create_format_sql = """
    CREATE OR REPLACE FILE FORMAT my_csv_format
    TYPE = 'CSV'
    FIELD_OPTIONALLY_ENCLOSED_BY = '"'
    SKIP_HEADER = 1
    NULL_IF = ('');
"""
cs.execute(create_format_sql)

# Only the final path component of csv_file is used to address the staged file.
staged_file_name = csv_file.rsplit('/', 1)[-1]

# Load the staged file into supplier_case using the custom file format
copy_supplier_case_sql = f"""
    COPY INTO supplier_case
    FROM @{stage_name}/{staged_file_name}
    FILE_FORMAT = my_csv_format;
"""
cs.execute(copy_supplier_case_sql)
print("Data loaded into the supplier_case table")

Data loaded into the supplier_case table


In [None]:
# Build purchase_orders_and_invoices by joining purchase data to supplier
# transactions on PurchaseOrderID and computing, per joined row, the invoiced
# amount (AmountExcludingTax) minus the quoted amount (POAmount).
# The dangling comma that followed the only select-list item (directly before
# FROM) has been removed so the statement does not depend on trailing-comma
# support in the SQL parser.
# NOTE(review): purchases_data carries a PurchaseOrderLineID column, so it
# presumably holds one row per order line; if supplier_transactions holds one
# row per purchase order, this compares an invoice total against a single
# line's POAmount. Confirm whether POAmount should be summed per
# PurchaseOrderID before the join.
cs.execute("""
    CREATE OR REPLACE TABLE purchase_orders_and_invoices AS
    SELECT
        (ST.AmountExcludingTax - PD.POAmount) AS invoiced_vs_quoted
    FROM PURCHASES_DATA PD
    INNER JOIN SUPPLIER_TRANSACTIONS ST
    ON PD.PurchaseOrderID = ST.PurchaseOrderID
""")
conn.commit()
print("Table 'purchase_orders_and_invoices' created.")
