In [1]:
# import required libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Helper that drops the first row and the last six rows of an extracted table and keeps its first two columns
def process_html(file, skip_head=1, skip_tail=6, n_cols=2):
    """Trim an extracted table down to its leading columns of body rows.

    Parameters
    ----------
    file : pandas.DataFrame
        One table as produced by ``pd.read_html``.
    skip_head : int, default 1
        Number of rows to drop from the top of the table.
    skip_tail : int, default 6
        Number of rows to drop from the bottom of the table.
    n_cols : int, default 2
        Number of leading columns to keep.

    Returns
    -------
    pandas.DataFrame
        Rows ``skip_head`` up to (but excluding) the last ``skip_tail`` rows,
        restricted to the first ``n_cols`` columns. The result is empty when
        the table has too few rows.
    """
    # Use an explicit end position instead of a negative slice bound:
    # ``-0`` would select nothing when skip_tail is 0, and clamping at 0 keeps
    # the "too short -> empty" behaviour of the original negative bound.
    stop = max(len(file) - skip_tail, 0)
    return file.iloc[skip_head:stop, :n_cols]

In [3]:
# Directory containing the HTML files to parse.
html_dir = 'data/html_data'

# Collects every table found in every HTML file, in file-name order.
html_dfs = []

# os.listdir() returns entries in arbitrary order, so sort the names to keep
# the order of the extracted tables reproducible between runs and machines.
for file in sorted(os.listdir(html_dir)):
    if file.endswith('.html'):
        # Build the full path to the HTML file.
        file_path = os.path.join(html_dir, file)

        # pd.read_html returns a list with one DataFrame per table in the file.
        tables = pd.read_html(file_path)

        # Add all of this file's tables to the running list.
        html_dfs.extend(tables)

print('Number of table extracted:', len(html_dfs))

Number of table extracted: 74


In [4]:
# Trim every extracted table with process_html and stack the results into a
# single DataFrame.
processed_df = [process_html(df) for df in html_dfs]

merged_data = pd.concat(processed_df, axis=0)
merged_data.columns = ['Description', 'Number_Samples']

# Drop rows whose Number_Samples cell holds the text 'Subtotal'.
# .copy() detaches the filtered result from the concatenated frame so that
# adding columns to merged_data later does not raise SettingWithCopyWarning.
merged_data = merged_data[merged_data['Number_Samples'] != 'Subtotal'].copy()
display(merged_data)

Unnamed: 0,Description,Number_Samples
1,[200331FN-006] PCR product Sequencing,18.00
2,[200401FN-001] PCR product Sequencing,2.00
3,[200401FN-002] PCR product Sequencing,4.00
4,[200402FN-001] PCR product Sequencing,32.00
5,[200402FN-002] PCR product Sequencing,2.00
...,...,...
11,[230508FN-002] PCR product Sequencing,30.00
12,[230509FN-001] PCR product Sequencing,40.00
13,[230510FN-001] PCR product Sequencing,8.00
14,[230510FN-007] PCR product Sequencing,18.00


In [5]:
# Derive the order ID and the order date from the Description column.
# The vectorised .str accessors pass missing descriptions through as NaN
# instead of raising AttributeError inside a Python-level lambda, and assign()
# builds a new frame rather than writing columns into a filtered one.
merged_data = merged_data.assign(
    # First space-delimited token, e.g. '[200331FN-006]', with brackets removed.
    Order_ID=lambda frame: (
        frame['Description']
        .str.split(' ', n=1).str[0]
        .str.strip('[')
        .str.strip(']')
    ),
    # The text before the first 'F' of the ID is parsed as a yymmdd date and
    # stored as 'YYYY-MM-DD' text.
    Order_Date=lambda frame: pd.to_datetime(
        frame['Order_ID'].str.split('F', n=1).str[0], format='%y%m%d'
    ).dt.strftime('%Y-%m-%d'),
)

In [6]:
# Keep only the invoice columns, order the rows by order date, and write the
# result to an Excel workbook.
invoice_columns = ['Order_ID', 'Order_Date', 'Number_Samples']
invoice = merged_data.loc[:, invoice_columns]
sorted_invoice = invoice.sort_values('Order_Date')
sorted_invoice.to_excel('data/marcogen_invoice.xlsx')