# Load Excel Files

* Author: Dexter Stephens
* Last Updated: 11/28/2024

This notebook will load data into the `TRANSACTION`, `COMPANY` and `PRESS_AND_EARNINGS` tables from Excel files.

In [None]:
# Import python packages
import sys
import logging
import pandas as pd

logger = logging.getLogger("demo_logger")

# Grab the Snowpark session of the running notebook; later cells reuse `session`.
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# Start from the session's current database/schema. Either entry can be overridden
# by an execution argument written as "database=<name>" or "schema=<name>".
args = dict(
    database=session.get_current_database(),
    schema=session.get_current_schema(),
)

for raw_arg in sys.argv:
    # partition() cuts at the first "=" only, so a value may itself contain "=".
    name, separator, override = raw_arg.partition("=")
    # Arguments without "=" or with an unrecognised key are ignored.
    if separator and name in args:
        args[name] = override

# Point the session at the (possibly overridden) database and schema.
session.use_schema(f"{args['database']}.{args['schema']}")

logger.info("load_excel_files start")

In [None]:
-- One row per file listed in the raw stage's directory table:
--   STAGE_FILE_PATH : stage prefix + RELATIVE_PATH, i.e. the path used to open the file
--   TARGET_TABLE    : first '/'-separated segment of RELATIVE_PATH
-- NOTE(review): there is no WHERE clause, so every staged file is returned,
-- not only Excel workbooks -- confirm the stage holds nothing else.
SELECT
    CONCAT('@INTEGRATIONS.FINANCIAL_DATA_RAW_STAGE/', RELATIVE_PATH) AS STAGE_FILE_PATH,
    SPLIT_PART(RELATIVE_PATH, '/', 1) AS TARGET_TABLE
FROM DIRECTORY(@INTEGRATIONS.FINANCIAL_DATA_RAW_STAGE)

## Create a function to load Excel worksheet to table

Create a reusable function to load an Excel worksheet to a table in Snowflake.

In [None]:
import os
from snowflake.snowpark.files import SnowflakeFile
from openpyxl import load_workbook
from snowflake.snowpark.types import VariantType
from snowflake.snowpark.functions import col, to_variant

def load_excel_worksheet_to_table(session, stage_file_path, target_table):
    """Load the active worksheet of a staged Excel workbook into a table.

    The first worksheet row supplies the column names and every remaining row
    becomes a record. The table is written with save mode "overwrite".

    Args:
        session: Snowpark session used to build the DataFrame and write the table.
        stage_file_path: Location of the workbook, handed to SnowflakeFile.open.
        target_table: Name of the table to write.

    Returns:
        True, always. Failures surface as exceptions from the underlying calls.
    """
    with SnowflakeFile.open(stage_file_path, 'rb') as excel_file:
        worksheet = load_workbook(excel_file).active

        # `values` yields one tuple of cell values per row; pull the header row
        # off first so only the data rows remain in the iterator.
        rows = worksheet.values
        header = next(rows)
        worksheet_frame = pd.DataFrame(rows, columns=header)

        # Convert to a Snowpark DataFrame and write it to the target table.
        snowpark_frame = session.create_dataframe(worksheet_frame)
        snowpark_frame.write.mode("overwrite").save_as_table(target_table)

    return True

## Process all Excel worksheets

Loop through each Excel file returned by the directory query above and call our `load_excel_worksheet_to_table()` function to load its active worksheet into the target table.

In [None]:
# Load every file returned by the sql_get_spreadsheets cell above
files_to_load = cells.sql_get_spreadsheets.to_pandas()
for staged_file in files_to_load.itertuples(index=False):
    # Each row carries the stage path to read and the table to write.
    logger.info(f"Processing Excel file {staged_file.STAGE_FILE_PATH}")
    load_excel_worksheet_to_table(
        session,
        staged_file.STAGE_FILE_PATH,
        staged_file.TARGET_TABLE,
    )

logger.info("load_excel_files end")

### Debugging

In [None]:
--DESCRIBE TABLE TRANSACTIONS;
--SELECT * FROM TRANSACTIONS;
--SHOW TABLES;