In [1]:
# import required libraries
import pandas as pd
import numpy as np
import os

In [2]:
def trim_dataframe(df):
    """Return the rows of ``df`` that precede the first row mentioning 'Subtotal'.

    Every cell is converted to ``str`` and searched for the substring
    'Subtotal'. The first row (in positional order) in which any cell matches
    marks the cut: that row and everything after it are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to trim. Its index may be of any type and need not start at 0.

    Returns
    -------
    pandas.DataFrame
        ``df.iloc[:k]`` where ``k`` is the position of the first matching row,
        or ``df`` itself (unchanged) when no cell contains 'Subtotal'.
    """
    # Row-wise flag: True where any column's string form contains 'Subtotal'.
    # (DataFrame.apply defaults to axis=0, so the lambda receives columns.)
    mask = df.apply(lambda col: col.astype(str).str.contains('Subtotal')).any(axis=1)

    # No marker row: nothing to trim.
    if not mask.any():
        return df

    # Use the *positional* offset of the first match. idxmax() would return an
    # index label, which only coincides with a position for a 0-based
    # RangeIndex and must not be fed to iloc.
    first_match_pos = int(np.argmax(mask.to_numpy()))

    # Slice end is exclusive, so the 'Subtotal' row itself is dropped.
    return df.iloc[:first_match_pos]

In [3]:
# Directory containing the HTML files (relative to the working directory).
html_dir = '../data/html_data'

# Accumulates every table parsed from every HTML file, in file-name order.
html_dfs = []

# os.listdir() yields entries in arbitrary order; sorting the names makes the
# order of html_dfs reproducible from run to run.
for file in sorted(os.listdir(html_dir)):
    if file.endswith('.html'):
        # Construct the full path to the HTML file
        file_path = os.path.join(html_dir, file)

        # pd.read_html returns a list of DataFrames, one per table it finds
        # (it raises ValueError when a file contains no tables).
        tables = pd.read_html(file_path)

        # Flatten: add each table individually rather than nesting the list.
        html_dfs.extend(tables)

print('Number of table extracted:', len(html_dfs))

Number of table extracted: 74


In [4]:
# For each extracted table: skip its first row, let trim_dataframe cut it off
# at the 'Subtotal' marker, then keep only the first two columns.
processed_dfs = [
    trim_dataframe(table.iloc[1:, :]).iloc[:, :2]
    for table in html_dfs
]

In [5]:
# Select the tables whose top-left cell is text containing '[' ; all other
# tables are ignored.
cont = []
for df in processed_dfs:
    # A frame with no rows or no columns has no top-left cell to inspect.
    if df.shape[0] == 0 or df.shape[1] == 0:
        continue

    first_cell = df.iloc[0, 0]

    # Parsed cells may be NaN or numeric; `'[' in <float>` raises TypeError,
    # so only string cells are tested for the bracket.
    if isinstance(first_cell, str) and '[' in first_cell:
        cont.append(df)

In [6]:
# Stack the selected two-column tables row-wise (axis=0 is concat's default)
# and give the columns their working names.
merged_columns = ['Description', 'Number_Samples']
merged_data = pd.concat(cont)
merged_data.columns = merged_columns

In [7]:
# Order ID: the first space-delimited token of the description, with any
# leading/trailing '[' and then ']' characters removed.
leading_token = merged_data['Description'].apply(lambda text: text.split(' ')[0])
merged_data['Order_ID'] = leading_token.str.strip('[').str.strip(']')

# Order date: the part of the order ID before its first 'F', read as a
# two-digit-year yymmdd stamp and rendered as an ISO 'YYYY-MM-DD' string.
date_stamp = merged_data['Order_ID'].apply(lambda order_id: order_id.split('F')[0])
parsed_date = pd.to_datetime(date_stamp, format='%y%m%d')
merged_data['Order_Date'] = parsed_date.dt.strftime('%Y-%m-%d')

In [8]:
# Keep only the invoice columns, order the rows by Order_Date, and write the
# result to an Excel workbook without the index column.
invoice_columns = ['Order_ID', 'Order_Date', 'Number_Samples']
invoice = merged_data.loc[:, invoice_columns]
sorted_invoice = invoice.sort_values('Order_Date')
sorted_invoice.to_excel('../data/macrogen_invoice.xlsx', index=False)