Extract the imported zip file

In [62]:
import zipfile
import os

# Folder that holds the imported archive and also receives its contents
extract_dir = 'cmt-files/imported-data'
zip_path = os.path.join(extract_dir, 'data-data.zip')

if os.path.exists(zip_path):
    # Unpack every member of the archive into the target folder
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_dir)
        print(f"Extracted files to '{extract_dir}'")
else:
    # Report the missing archive rather than raising
    print(f"Error: The file '{zip_path}' does not exist.")

Error: The file 'cmt-files/imported-data\data-data.zip' does not exist.


Convert data.xml into a DataFrame

In [2]:
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path

# Path of the XML file to parse; it is relative, so it resolves against the
# notebook's current working directory.
data_file = Path("cmt-files/imported-data/data.xml")

# Parse the XML file and extract fields with their parent record IDs
def parse_xml_with_all_attributes(file_path):
    """Flatten the <field> elements of an XML file into a long-format DataFrame.

    Every <field> that is a direct child of a <record> becomes one row holding
    all of the field's XML attributes plus a ``record_id`` column taken from
    the parent record's ``id`` attribute. A field with a non-empty
    ``lookupentity`` attribute additionally produces a companion row named
    ``"<name>_name"`` whose ``value`` is the field's ``lookupentityname``.

    Parameters
    ----------
    file_path : str, path-like or file object
        Source passed straight to ``xml.etree.ElementTree.parse``.

    Returns
    -------
    tuple[pandas.DataFrame, str | None]
        The long-format frame and the ``name`` attribute of the first
        <entity> descendant of the root (``None`` when there is none).
        When the file contains no fields the frame is empty but still has
        the ``record_id``, ``name`` and ``value`` columns.
    """
    root = ET.parse(file_path).getroot()

    # Look the entity up once instead of searching the tree twice.
    entity = root.find(".//entity")
    table_name = entity.get("name") if entity is not None else None

    records = []
    for record in root.findall(".//record"):
        record_id = record.get("id")
        for field in record.findall("field"):
            field_name = field.get("name")

            # Companion row for lookups. A field without a name cannot form
            # "<name>_name", so it is skipped here instead of raising TypeError.
            if field.get("lookupentity") and field_name is not None:
                records.append({
                    "record_id": record_id,
                    "name": field_name + "_name",
                    "value": field.get("lookupentityname")
                })

            # The field itself: all attributes plus the parent record ID.
            field_data = dict(field.attrib)
            field_data["record_id"] = record_id
            records.append(field_data)

    if not records:
        # Keep the key columns so callers that select them do not hit KeyError.
        return pd.DataFrame(columns=["record_id", "name", "value"]), table_name

    return pd.DataFrame(records), table_name

# Build the long-format frame and pick up the table the file describes
df, table_name = parse_xml_with_all_attributes(data_file)

# Reshape to one row per record and one column per field name, then turn the
# record_id index back into an ordinary column
df_pivoted = df.pivot(index="record_id", columns="name", values="value").reset_index()

# Persist the wide frame as Parquet, named after the table when it is known
if not table_name:
    print("Table name is not available. DataFrame not saved.")
else:
    parquet_file = f"cmt-files/imported-data/{table_name}.parquet"
    df_pivoted.to_parquet(parquet_file, engine="pyarrow")
    print(f"DataFrame saved to {parquet_file}")

# Show the table name followed by the pivoted frame
print(f"Table Name: {table_name}")
print(df_pivoted)

DataFrame saved to cmt-files/imported-data/account.parquet
Table Name: account
name                             record_id accountclassificationcode  \
0     00416cb1-4219-ed11-b83e-000d3acf194f                         1   
1     00a3c8ae-4219-ed11-b83e-000d3acedc8d                         1   
2     019771ab-4219-ed11-b83e-000d3acf194f                         1   
3     01a3c8ae-4219-ed11-b83e-000d3acedc8d                         1   
4     029771ab-4219-ed11-b83e-000d3acf194f                         1   
..                                     ...                       ...   
375   f9a1c8ae-4219-ed11-b83e-000d3acedc8d                         1   
376   fa7ccba8-4219-ed11-b83e-000d3acedc8d                         1   
377   fdb6b908-ded6-ee11-904c-6045bd540c5a                         1   
378   fe406cb1-4219-ed11-b83e-000d3acf194f                         1   
379   ff406cb1-4219-ed11-b83e-000d3acf194f                         1   

name                             accountid accountnumber

In [10]:
import pandas as pd
from pathlib import Path

# Locations of the freshly imported snapshot, the previously formatted
# snapshot, and the output file that receives only the newer records.
# NOTE: `table_name` is not defined in this cell; it must already exist in the
# kernel (it is assigned by the XML-parsing cell).
imported_data_path = Path(f"cmt-files/imported-data/{table_name}.parquet")
formatted_data_path = Path(f"cmt-files/formatted-data/{table_name}.parquet")
difference_data_path = Path(f"cmt-files/difference-data/{table_name}.parquet")
print(imported_data_path)
print(formatted_data_path)

# Load the DataFrames from the Parquet files
if imported_data_path.exists() and formatted_data_path.exists():
    imported_df = pd.read_parquet(imported_data_path)
    formatted_df = pd.read_parquet(formatted_data_path)

    # Both frames need a 'modifiedon' column to be comparable
    if 'modifiedon' in imported_df.columns and 'modifiedon' in formatted_df.columns:
        # Convert 'modifiedon' columns to datetime for comparison
        imported_df['modifiedon'] = pd.to_datetime(imported_df['modifiedon'])
        formatted_df['modifiedon'] = pd.to_datetime(formatted_df['modifiedon'])

        # Records in imported_df that are newer than the latest in formatted_df
        latest_modifiedon = formatted_df['modifiedon'].max()
        if pd.isna(latest_modifiedon):
            # The formatted snapshot has no usable timestamp (empty or all
            # missing). Comparing against NaT is always False and would
            # silently select nothing, so treat every imported record as new.
            new_records_df = imported_df
        else:
            new_records_df = imported_df[imported_df['modifiedon'] > latest_modifiedon]

        # to_parquet does not create missing folders, so make sure it exists
        difference_data_path.parent.mkdir(parents=True, exist_ok=True)
        new_records_df.to_parquet(difference_data_path, engine="pyarrow")
        print(f"New records saved to {difference_data_path}")
    else:
        print("Error: 'modifiedon' column is missing in one of the DataFrames.")
else:
    print("Error: One or both Parquet files do not exist.")

cmt-files\imported-data\account.parquet
cmt-files\formatted-data\account.parquet
New records saved to cmt-files\difference-data\account.parquet
