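# Install the packages this notebook needs into the kernel's environment (openpyxl: presumably the engine used to read the .xlsx workbook; confirm)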
%pip install openpyxl

In [7]:
# Parameters
# NOTE(review): this looks like a papermill-style parameters cell; confirm
# before restructuring, since such cells are expected to hold plain assignments.

# Root folder passed to create_foldername() when the CSV is written and to
# find_latest_folder() when the most recent run is looked up.
ingestion_output_folder = "../data/raw"
# NOTE(review): despite the "_file" suffix the value looks like a folder path,
# and none of the visible cells read it; confirm whether it is still needed.
ingestion_output_file = "../data/bronze"

In [8]:
# Import libraries
import json
import sys
sys.path.append("..")
from utils.filesystem import *

In [9]:
# Config file path (relative to the notebook's working directory)
config_file = '../config.json'

# Load the config file.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so that
# non-ASCII values are decoded identically on every platform instead of
# depending on the locale's default encoding.
with open(config_file, encoding='utf-8') as f:
    config = json.load(f)

# Load the basic config parameters.
# These three values are passed to load_excel_sheet_to_dataframe() further down;
# a missing key raises KeyError here rather than later in the notebook.
local_path = config['local_path']
file_name = config['file_name']
sheet_name = config['sheet_name']

# Load the config parameters (ingestion transformation)
# name_mapping: passed to DataFrame.rename(columns=...) to rename columns.
# columns_to_keep: column names retained after renaming; all others are dropped.
name_mapping = config['name_mapping']
columns_to_keep = config['columns_to_keep']



In [10]:
# Ignore UserWarning messages
# NOTE(review): this filter is process-wide, so UserWarnings stay silenced for
# every cell executed after this one, not only for the Excel load below.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Load a specific worksheet from the Excel file
try:
    # NOTE(review): the meaning of the trailing 4 is defined by the helper in
    # utils.filesystem (not visible here); confirm and consider naming it.
    dataframe = load_excel_sheet_to_dataframe(local_path, file_name, sheet_name, 4)
except Exception as e:
    # Report the problem, then re-raise: every later cell needs `dataframe`,
    # so continuing would only fail further down (or silently reuse a stale
    # dataframe left in the kernel by a previous run).
    print(f"An error occurred: {e}")
    raise
else:
    # Count the number of rows and columns
    rows, columns = dataframe.shape
    print(f"The dataframe has {rows} rows and {columns} columns")

The dataframe has 2503 rows and 17 columns


In [11]:
# Apply the configured source-name -> target-name mapping to the column labels
dataframe = dataframe.rename(name_mapping, axis="columns")

# Every column whose (renamed) label is not listed in columns_to_keep gets dropped
is_kept = dataframe.columns.isin(columns_to_keep)
columns_to_drop = dataframe.columns[~is_kept]

# Remove the unwanted columns
dataframe = dataframe.drop(columns_to_drop, axis="columns")

# Show which columns remain
print(f"Columns: {dataframe.columns}")


Columns: Index(['date', 'observations', 'amount', 'balance', 'category', 'subcategory',
       'detail', 'invoice_number', 'invoice_file_reference'],
      dtype='object')
Saved output.csv
Dataframe saved to ../data/raw\EXTRACTO BANCO 2024\2024/05/15/21/56/19/output.csv


In [12]:
# Test removing a column
# Only a missing column (KeyError from DataFrame.drop, e.g. when this cell is
# re-run) is treated as a reportable, non-fatal condition; any other error,
# such as `dataframe` not being defined, propagates with its full traceback.
try:
    dataframe = dataframe.drop(columns=['invoice_number'])
    print("Column 'invoice_number' removed")
except KeyError as e:
    print(f"An error occurred: {e}")

Column 'invoice_number' removed


In [None]:
# Persist the dataframe to a CSV file
# Strip only the final extension, so a workbook name that contains extra dots
# (e.g. "statement.2024.xlsx") keeps its full stem instead of being cut at the
# first dot.
file_stem = file_name.rsplit('.', 1)[0]

# NOTE(review): create_foldername comes from utils.filesystem; judging by the
# recorded output it builds a timestamped folder path under the root. Confirm.
file_path = create_foldername(ingestion_output_folder, file_stem)
try:
    save_df_to_csv(dataframe, file_path, "output")
    print(f"Dataframe saved to {file_path}/output.csv")
except Exception as e:
    # Report and re-raise: if the save is silently skipped, the next cell
    # would copy the output of an older run into the "latest" folder.
    print(f"An error occurred: {e}")
    raise

In [13]:
import os
import shutil

# Find the latest folder
latest = find_latest_folder(ingestion_output_folder)
print(f"The latest folder is: {latest}")

# Define the source file and destination directory
# os.path.join avoids hand-built paths with mixed separators.
source_file = os.path.join(latest, "output.csv")
destination_dir = '../data/latest'

# Make sure the destination exists as a directory. Without this, shutil.copy
# would treat a missing '../data/latest' as a file name and write the CSV to a
# file called "latest" instead of copying it into the folder.
os.makedirs(destination_dir, exist_ok=True)

# Use shutil.copy() to copy (not move) the latest file; the source stays in
# place and the returned destination path is shown as the cell's output.
shutil.copy(source_file, destination_dir)

The latest folder is: ../data/raw\EXTRACTO BANCO 2024\2024\05\15\21\56\19


'../data/latest\\output.csv'