## Question 1

sql done in snowflake to set up question 1:

-- Create a virtual warehouse named 'my_first_warehouse'
CREATE OR REPLACE WAREHOUSE my_first_warehouse
WITH
   WAREHOUSE_SIZE = 'XSMALL',  -- Adjust size as needed (XSMALL, SMALL, etc.)
   AUTO_SUSPEND = 300,  -- Suspend after 5 minutes of inactivity
   AUTO_RESUME = TRUE;  -- Automatically resume when a query is run

-- Use the warehouse in your session
USE WAREHOUSE my_first_warehouse;

In [26]:
import snowflake.connector
import glob
import os
import pandas as pd

# Step 1: Connect to Snowflake
conn = snowflake.connector.connect(
    user='cds006',
    password='@Triton123',
    account='qla31786.east-us-2.azure',
    warehouse='my_first_warehouse',
    database='PURCHASEORDERDATA',
    schema='PUBLIC'
)

# Create a cursor object
cs = conn.cursor()

# Step 2: Create a Stage for Data Upload
cs.execute("CREATE STAGE IF NOT EXISTS purchase_stage")
print("Stage 'purchase_stage' created or exists.")

# Step 3: Automate the Upload of CSV Files to the Stage
# Define the path where your CSV files are located
csv_files_path = r"/home/jovyan/464/SQLETLSnowflake/CaseData1/Monthly PO Data/*.csv"  # Ensure only CSV files are targeted

# Check if files are being correctly identified
csv_files = glob.glob(csv_files_path)
if not csv_files:
    print("No CSV files found in the specified directory. Please check the path:", csv_files_path)
else:
    print(f"Found {len(csv_files)} CSV files to upload.")

# Step 4: Pre-process CSV files to clean the timestamp format
for file_path in csv_files:
    if os.path.isfile(file_path):
        try:
            # Load the CSV file into a DataFrame to clean the LASTEDITEDWHEN column
            df = pd.read_csv(file_path)
            # Convert the LASTEDITEDWHEN column to a proper timestamp format
            if 'LASTEDITEDWHEN' in df.columns:
                df['LASTEDITEDWHEN'] = pd.to_datetime(df['LASTEDITEDWHEN'], errors='coerce')
            
            # Save the cleaned data back to the file
            df.to_csv(file_path, index=False)
            print(f"Cleaned timestamps in {file_path}")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

# Iterate over all CSV files and attempt to upload them to Snowflake
for file_path in csv_files:
    if os.path.isfile(file_path):
        try:
            print(f"Attempting to upload file: {file_path}")
            # Execute PUT command to upload the file to the Snowflake stage
            cs.execute(f"PUT 'file://{file_path}' @purchase_stage auto_compress=true")
            print(f"Uploaded {file_path} to the stage successfully.")
        except Exception as e:
            print(f"Error uploading {file_path}: {e}")
    else:
        print(f"Skipping {file_path}, not a valid file.")

# Step 5: Verify files are in stage
cs.execute("LIST @purchase_stage")
stage_files = cs.fetchall()
if stage_files:
    print(f"Files in stage: {stage_files}")
else:
    print("No files found in stage. Ensure the PUT command executed correctly and files were accessible.")

# Step 6: Create the Table to Load Data
cs.execute("""
    CREATE OR REPLACE TABLE MonthlyPurchaseOrderData (
        PurchaseOrderID INTEGER,
        SupplierID INTEGER,
        OrderDate DATE,
        DeliveryMethodID INTEGER,
        ContactPersonID INTEGER,
        ExpectedDeliveryDate DATE,
        SupplierReference STRING,
        IsOrderFinalized BOOLEAN,
        Comments STRING,
        InternalComments STRING,
        LastEditedBy INTEGER,
        LastEditedWhen TIMESTAMP_NTZ,
        PurchaseOrderLineID INTEGER,
        StockItemID INTEGER,
        OrderedOuters INTEGER,
        Description STRING,
        ReceivedOuters INTEGER,
        PackageTypeID INTEGER,
        ExpectedUnitPricePerOuter FLOAT,
        LastReceiptDate DATE,
        IsOrderLineFinalized BOOLEAN,
        Right_LastEditedBy INTEGER,
        Right_LastEditedWhen TIMESTAMP_NTZ
    )
""")
print("Table 'MonthlyPurchaseOrderData' created successfully.")

# Step 7: Load Data from Stage to the Table with ON_ERROR option
cs.execute("""
    COPY INTO MonthlyPurchaseOrderData
    FROM @purchase_stage
    FILE_FORMAT = (TYPE = 'CSV', FIELD_OPTIONALLY_ENCLOSED_BY = '"', SKIP_HEADER = 1)
    ON_ERROR = 'CONTINUE';  -- This will skip rows with errors
""")
print("Data loaded into 'MonthlyPurchaseOrderData' successfully.")

# Step 8: Close the Cursor and Connection
cs.close()
conn.close()

Stage 'purchase_stage' created or exists.
Found 41 CSV files to upload.
Cleaned timestamps in /home/jovyan/464/SQLETLSnowflake/CaseData1/Monthly PO Data/2013-4.csv
Cleaned timestamps in /home/jovyan/464/SQLETLSnowflake/CaseData1/Monthly PO Data/2013-1.csv
Cleaned timestamps in /home/jovyan/464/SQLETLSnowflake/CaseData1/Monthly PO Data/2015-12.csv
Cleaned timestamps in /home/jovyan/464/SQLETLSnowflake/CaseData1/Monthly PO Data/2015-10.csv
Cleaned timestamps in /home/jovyan/464/SQLETLSnowflake/CaseData1/Monthly PO Data/2013-3.csv
Cleaned timestamps in /home/jovyan/464/SQLETLSnowflake/CaseData1/Monthly PO Data/2014-3.csv
Cleaned timestamps in /home/jovyan/464/SQLETLSnowflake/CaseData1/Monthly PO Data/2014-8.csv
Cleaned timestamps in /home/jovyan/464/SQLETLSnowflake/CaseData1/Monthly PO Data/2014-7.csv
Cleaned timestamps in /home/jovyan/464/SQLETLSnowflake/CaseData1/Monthly PO Data/2013-12.csv
Cleaned timestamps in /home/jovyan/464/SQLETLSnowflake/CaseData1/Monthly PO Data/2015-1.csv
Clean

## Question 2 

sql done in snowflake for question 2:

-- Add the POAmount column to the table
ALTER TABLE MonthlyPurchaseOrderData
ADD COLUMN POAmount FLOAT;

-- Update the POAmount column with the calculated values
UPDATE MonthlyPurchaseOrderData
SET POAmount = ReceivedOuters * ExpectedUnitPricePerOuter;

-- Select a few rows to check the POAmount calculations
SELECT PurchaseOrderID, ReceivedOuters, ExpectedUnitPricePerOuter, POAmount
FROM MonthlyPurchaseOrderData
LIMIT 5;

-- Create a calculated field that shows purchase order totals for each order
SELECT 
    PurchaseOrderID, 
    SUM(POAmount) AS TotalPOAmount
FROM 
    MonthlyPurchaseOrderData
GROUP BY 
    PurchaseOrderID
ORDER BY 
    PurchaseOrderID
LIMIT 10;

## Question 3

validation to make sure code worked in snowflake sql after q3 code is run
SELECT xml_column FROM RawXMLData LIMIT 5;

SELECT x.value 
FROM RawXMLData, LATERAL FLATTEN(input => xml_column) x 
LIMIT 10;

SELECT x.value
FROM RawXMLData, 
LATERAL FLATTEN(input => xml_column) x
LIMIT 10;

SELECT * FROM SupplierInvoices LIMIT 10;

In [68]:
import snowflake.connector
import os

# Step 1: Connect to Snowflake
conn = snowflake.connector.connect(
    user='cds006',
    password='@Triton123',
    account='qla31786.east-us-2.azure',
    warehouse='my_first_warehouse',
    database='PURCHASEORDERDATA',
    schema='PUBLIC'
)

# Create a cursor object
cs = conn.cursor()

# Step 2: Create a stage for uploading XML data
cs.execute("CREATE OR REPLACE STAGE xml_stage")
print("Stage 'xml_stage' created or exists.")

# Step 3: Create an XML file format in Snowflake
cs.execute("""
    CREATE OR REPLACE FILE FORMAT xml_file_format 
    TYPE = 'XML' 
    STRIP_OUTER_ELEMENT = TRUE
""")
print("XML file format 'xml_file_format' created.")

# Step 4: Upload the XML file to the Snowflake stage
xml_file_path = "/home/jovyan/464/SQLETLSnowflake/CaseData1/Supplier Transactions XML.xml"
put_command = f"PUT 'file://{xml_file_path}' @xml_stage auto_compress=true"
cs.execute(put_command)
print("XML file uploaded to stage.")

# Step 5: Create or replace the SupplierInvoices table
cs.execute("""
    CREATE OR REPLACE TABLE SupplierInvoices (
        SupplierTransactionID INTEGER,
        SupplierID INTEGER,
        TransactionTypeID INTEGER,
        PurchaseOrderID INTEGER,
        PaymentMethodID INTEGER,
        SupplierInvoiceNumber STRING,
        TransactionDate DATE,
        AmountExcludingTax FLOAT,
        TaxAmount FLOAT,
        TransactionAmount FLOAT,
        OutstandingBalance FLOAT,
        FinalizationDate DATE,
        IsFinalized BOOLEAN,
        LastEditedBy INTEGER,
        LastEditedWhen TIMESTAMP_NTZ
    )
""")
print("Table 'SupplierInvoices' created or replaced successfully.")

# Step 6: Load and shred the XML data into the SupplierInvoices table
try:
    cs.execute("""
        COPY INTO SupplierInvoices 
        FROM @xml_stage 
        FILE_FORMAT = (TYPE = 'XML') 
        ON_ERROR = 'CONTINUE'
        PATTERN = '.*Supplier Transactions XML.xml'
    """)
    print("XML data copied to 'SupplierInvoices' successfully.")
except Exception as e:
    print(f"Error loading XML data: {e}")

# Step 7: Verify Data in SupplierInvoices
try:
    cs.execute("SELECT * FROM SupplierInvoices LIMIT 10;")
    shredded_data = cs.fetchall()
    for row in shredded_data:
        print(row)
except Exception as e:
    print(f"Error retrieving data from 'SupplierInvoices': {e}")

# Step 8: Close the cursor and connection
cs.close()
conn.close()

Stage 'xml_stage' created or exists.
XML file format 'xml_file_format' created.
XML file uploaded to stage.
Table 'SupplierInvoices' created or replaced successfully.
Error loading XML data: 002019 (0A000): SQL compilation error:
XML file format can produce one and only one column of type variant, object, or array. Load data into separate columns using copy with transformation.
