In [1]:
# import essential libraries
import pandas as pd
import numpy as np

In [2]:
# read each year's CSV file into its own DataFrame
csv2023 = pd.read_csv(filepath_or_buffer='data/biobasic2023.csv')
csv2024 = pd.read_csv(filepath_or_buffer='data/biobasic2024.csv')

In [3]:
# define a function to process the ingested csv file
def process_csv(data: pd.DataFrame) -> pd.DataFrame:
    """Clean one ingested CSV frame down to the columns used for analysis.

    Steps, in order:
      1. Skip the first data row and keep the first eight columns
         (presumably the first row is a secondary header -- confirm against
         the raw export).
      2. Rename those eight columns and keep only Order_ID, Order_Date,
         User and Number_Samples.
      3. Drop every row that has a missing value in any kept column.
      4. Trim each User value to the text before the first ' ('.
      5. Parse Order_Date strictly as YYYY-MM-DD, discard rows that do not
         parse, and return the date as a 'YYYY-MM-DD' string.

    Parameters
    ----------
    data : pd.DataFrame
        Frame as returned by ``pd.read_csv``; must have at least eight
        columns. It is not modified.

    Returns
    -------
    pd.DataFrame
        New frame with columns Order_ID, Order_Date, User, Number_Samples.
        The surviving rows keep their original index labels.
        Number_Samples is passed through unchanged (no numeric cast).
    """
    all_columns = ['Order_ID', 'Order_Date', 'Status', 'Lab', 'Package',
                   'User', 'Type', 'Number_Samples']
    kept_columns = ['Order_ID', 'Order_Date', 'User', 'Number_Samples']

    # Work on an explicit copy so renaming never touches the caller's frame.
    orders = data.iloc[1:, 0:8].copy()
    orders.columns = all_columns

    orders = (
        orders[kept_columns]
        .dropna()
        .assign(
            # str() guards against a non-string value (e.g. a numeric user
            # id), which would otherwise raise AttributeError on .split.
            User=lambda df: df['User'].map(
                lambda name: str(name).split(' (')[0]),
            # Values that do not match the format become NaT.
            Order_Date=lambda df: pd.to_datetime(
                df['Order_Date'], format='%Y-%m-%d', errors='coerce'),
        )
    )

    # Filter with a boolean mask rather than dropping by index label:
    # label-based drop removes *every* row carrying that label, so a frame
    # with a non-unique index would lose valid rows too.
    valid_orders = orders[orders['Order_Date'].notna()].copy()

    valid_orders['Order_Date'] = valid_orders['Order_Date'].dt.strftime('%Y-%m-%d')

    return valid_orders

In [4]:
# process the datasets
df2023 = process_csv(csv2023)
df2024 = process_csv(csv2024)

# display the summary of each processed dataset
# (DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after each summary)
df2023.info()
df2024.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 1 to 270
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Order_ID        234 non-null    object
 1   Order_Date      234 non-null    object
 2   User            234 non-null    object
 3   Number_Samples  234 non-null    object
dtypes: object(4)
memory usage: 9.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 137 entries, 1 to 157
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Order_ID        137 non-null    object
 1   Order_Date      137 non-null    object
 2   User            137 non-null    object
 3   Number_Samples  137 non-null    object
dtypes: object(4)
memory usage: 5.4+ KB
None


In [5]:
# stack the 2023 and 2024 frames row-wise into a single dataframe
invoices = pd.concat((df2023, df2024))

# order the combined rows by their Order_Date value
sorted_invoice = invoices.sort_values('Order_Date')

# write the sorted dataframe out as an Excel workbook
sorted_invoice.to_excel('data/biobasic_invoice.xlsx')