In [9]:
import pandas as pd
import os

In [10]:
# Verify that every file in the data folder has a same-named counterpart
# in the out folder, and report any that do not.
data_folder = './data'
out_folder = './out'
items = os.listdir(data_folder)

# Sub-directories and Jupyter checkpoint entries are not data files, so they
# are filtered out first; what remains is kept only if it is absent from out.
missing_items = [
    name
    for name in items
    if not os.path.isdir(os.path.join(data_folder, name))
    and '.ipynb_checkpoints' not in name
    and not os.path.exists(os.path.join(out_folder, name))
]

if not missing_items:
    print("No missing items found.")
else:
    print(f"Found {len(missing_items)} missing items:")
    for name in missing_items:
        print(f"- {name}")

No missing items found.


In [11]:
# Try to load each data file and its same-named counterpart from the out
# folder, then print the first two rows of both for a side-by-side look.

# File suffix -> pandas reader applied to both files of a pair.
READERS_BY_SUFFIX = {
    '.parquet': pd.read_parquet,
    '.json': pd.read_json,
}

for item in items:
    data_file = os.path.join(data_folder, item)
    out_file = os.path.join(out_folder, item)

    # Apply the same skip rule as the existence check: sub-directories and
    # Jupyter checkpoint entries are not data files and should not be loaded
    # or reported as "unsupported".
    if os.path.isdir(data_file) or '.ipynb_checkpoints' in item:
        continue

    print(f"\nComparing {item}:")

    # Determine file type from the name's suffix and pick the matching reader.
    suffix = next((s for s in READERS_BY_SUFFIX if item.endswith(s)), None)
    if suffix is None:
        print(f"Unsupported file type for {item}")
        continue
    reader = READERS_BY_SUFFIX[suffix]

    try:
        # data_df / out_df are left as cell-level names so they remain
        # available for inspection after the loop.
        data_df = reader(data_file)
        out_df = reader(out_file)
        print("\nData file first 2 rows:")
        print(data_df.head(2))
        print("\nOutput file first 2 rows:")
        print(out_df.head(2))
    except Exception as e:
        # Best-effort preview: report the failure for this pair and move on
        # to the next file instead of aborting the whole loop.
        print(f"Error loading {suffix[1:]} files: {e}")



Comparing journal_chunk_6.parquet:

Data file first 2 rows:
  company_code posting_date fiscal_year fiscal_period account_number  \
0          IND   2024-08-09        2024           P08          63002   
1          USA   2024-02-27        2024           P02          65003   

  company_currency company_amount global_currency global_amount  \
0              INR       83130.61             USD        996.77   
1              USD         910.51             USD        910.51   

  department_number  ... customer_number product_number channel_number  \
0             DP006  ...         CS08458        P060076        CH00004   
1             DP007  ...         CS08114        P060076        CH00004   

  division_number       transaction_id transaction_type  \
0         DV00004  7317919603814998042            TT002   
1         DV00003  7317919603814998043            TT005   

  transaction_document_number transaction_document_item  \
0                  240809-655                       020   
1