In [109]:
import sys
sys.path.append("../")

import os
import pandas as pd
from utils import get_digit_and_unit

In [110]:
data_path = "/workspace/Data/NumericalNet/20240401_numericalnet_data_small.xlsx"

with pd.ExcelFile(data_path) as xls:
    df_equal = pd.read_excel(xls, 'equal')
    df_greater = pd.read_excel(xls, 'greater')
    df_less = pd.read_excel(xls, 'less')
    
data = pd.concat([df_equal, df_greater, df_less], ignore_index=True)
data

Unnamed: 0,value1,operation,value2,type
0,678.8mm,=,67.88cm,length
1,302.78cm,=,3027.8mm,length
2,109.19mm,=,10.919cm,length
3,479.32cm,=,4793.2mm,length
4,83.32mm,=,8.332cm,length
...,...,...,...,...
18160,1559.5ml,<,1.58544l,
18161,9733.4ml,<,12.37443l,
18162,4343.3ml,<,6.07581l,
18163,9.21l,<,13396.19ml,


In [111]:
unit_conversions = {
    'kg': 1000,
    'g': 1,
    'mg': 0.001,
    'km': 1000,
    'm': 1,
    'cm': 0.01,
    'mm': 0.001,
    "l": 1,
    "ml": 0.001
}

data_v2 = []

for idx, row in data.iterrows():
    value1, unit1 = get_digit_and_unit(row['value1'])
    value2, unit2 = get_digit_and_unit(row['value2'])
    operation = row['operation']
    
    converted_value1 = round(value1 * unit_conversions[unit1], 4)
    converted_value2 = round(value2 * unit_conversions[unit2], 4)
    
    if operation == "=":
        data_v2.append(f"{value1}{unit1}+{0}{unit1}={value2}{unit2}")
        data_v2.append(f"{value1}{unit1}-{0}{unit1}={value2}{unit2}")
        data_v2.append(f"{value2}{unit2}+{0}{unit2}={value1}{unit1}")
        data_v2.append(f"{value2}{unit2}-{0}{unit2}={value1}{unit1}")
        
    elif operation == "<":
        diff = converted_value2 - converted_value1
        diff_unit1 = round(diff / unit_conversions[unit1], 4)
        diff_unit2 = round(diff / unit_conversions[unit2], 4)
        
        data_v2.append(f"{value1}{unit1}+{diff_unit1}{unit1}={value2}{unit2}")
        data_v2.append(f"{value1}{unit1}+{diff_unit2}{unit2}={value2}{unit2}")
        data_v2.append(f"{value2}{unit2}-{diff_unit2}{unit2}={value1}{unit1}")
        data_v2.append(f"{value2}{unit2}-{diff_unit1}{unit1}={value1}{unit1}")
        
    elif operation == ">":
        diff = converted_value1 - converted_value2
        diff_unit1 = round(diff / unit_conversions[unit1], 4)
        diff_unit2 = round(diff / unit_conversions[unit2], 4)
        
        data_v2.append(f"{value1}{unit1}-{diff_unit1}{unit1}={value2}{unit2}")
        data_v2.append(f"{value1}{unit1}-{diff_unit2}{unit2}={value2}{unit2}")
        data_v2.append(f"{value2}{unit2}+{diff_unit2}{unit2}={value1}{unit1}")
        data_v2.append(f"{value2}{unit2}+{diff_unit1}{unit1}={value1}{unit1}")

data_v2 = pd.DataFrame(data_v2, columns=["data"])
data_v2

Unnamed: 0,data
0,678.8mm+0mm=67.88cm
1,678.8mm-0mm=67.88cm
2,67.88cm+0cm=678.8mm
3,67.88cm-0cm=678.8mm
4,302.78cm+0cm=3027.8mm
...,...
72655,13396.19ml-4.1862l=9.21l
72656,2.18l+0.4736l=2653.64ml
72657,2.18l+473.6ml=2653.64ml
72658,2653.64ml-473.6ml=2.18l


In [112]:
# Split data into value1, unit1, operation1, value2, unit2, operation2, value3, unit3

new_data_v2 = []

for idx, row in data_v2.iterrows():
    left_side, right_side = row['data'].split("=")
    operation2 = "="
    if "+" in left_side:
        operation1 = "+"
    elif "-" in left_side:
        operation1 = "-"
    value1_unit1, value2_unit2 = left_side.split(operation1)

    new_data_v2.append([value1_unit1, operation1, value2_unit2, operation2, right_side])

new_data_v2 = pd.DataFrame(new_data_v2, columns=["value1", "operation1", "value2", "operation2", "value3"])
new_data_v2

Unnamed: 0,value1,operation1,value2,operation2,value3
0,678.8mm,+,0mm,=,67.88cm
1,678.8mm,-,0mm,=,67.88cm
2,67.88cm,+,0cm,=,678.8mm
3,67.88cm,-,0cm,=,678.8mm
4,302.78cm,+,0cm,=,3027.8mm
...,...,...,...,...,...
72655,13396.19ml,-,4.1862l,=,9.21l
72656,2.18l,+,0.4736l,=,2653.64ml
72657,2.18l,+,473.6ml,=,2653.64ml
72658,2653.64ml,-,473.6ml,=,2.18l


In [113]:
output_path = "/workspace/Data/NumericalNet/20240401_numericalnet_data_small_processed.xlsx"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
new_data_v2.to_excel(output_path, index=False, engine='openpyxl')