In [1]:
import sys
sys.path.append("../")

import os
import random
import pandas as pd
from tqdm import tqdm
from utils import get_digit_and_unit

# Excel workbook that is loaded into `data` and used as the source rows for the
# generated '<' / '>' comparison pairs.
data_path = "/workspace/Data/NumericalNet/large_numericalnet_data_equal_v2.xlsx"

In [2]:
# Read the source workbook into a DataFrame; the bare name on the last line
# makes the notebook render it.
data = pd.read_excel(io=data_path, engine="openpyxl")
data

Unnamed: 0,value1,operation,value2,type
0,857.89m,=,0.85789km,length
1,546.04mm,=,54.604cm,length
2,35.19cm,=,351.9mm,length
3,235.16m,=,0.23516km,length
4,520.0m,=,0.52km,length
...,...,...,...,...
449995,9609.04ml,=,9.60904l,volume
449996,4427.64ml,=,4.42764l,volume
449997,3622.1ml,=,3.6221l,volume
449998,1587.37ml,=,1.58737l,volume


In [3]:
# Multiplicative factor that converts a magnitude in the given unit to the base
# unit of its family (g for mass, m for length, l for volume).
unit_conversions = {
    'kg': 1000,
    'g': 1,
    'mg': 0.001,
    'km': 1000,
    'm': 1,
    'cm': 0.01,
    'mm': 0.001,
    "l": 1,
    "ml": 0.001
}

# Seed the generator so that Restart & Run All reproduces the same dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Every generated magnitude is rounded to this many decimal places.
DECIMALS = 5
# Smallest increment that still survives rounding to DECIMALS places.
MIN_STEP = 10 ** -DECIMALS


def _strictly_greater(candidate, reference):
    """Return a magnitude that is strictly greater than `reference`.

    `candidate` is returned unchanged when it already exceeds `reference`.
    Otherwise the random offset was lost to rounding (e.g. 0.0001 m is 1e-07 km,
    which rounds to 0 at five decimals), so `reference` is bumped by one
    rounding step instead. Without this, a row could be labelled '<' or '>'
    while both sides are numerically equal.
    """
    if candidate > reference:
        return candidate
    return round(reference + MIN_STEP, DECIMALS)


operations = ['<', '>']
comparison_data = []

# NOTE(review): this relies on value1 and value2 of every source row denoting
# the same quantity (the rows shown all use '='); confirm for the full file.
# get_digit_and_unit is presumably splitting e.g. "857.89m" into (857.89, "m");
# verify against utils.
for value1, value2 in tqdm(zip(data['value1'], data['value2']), total=len(data)):
    digit1, unit1 = get_digit_and_unit(value1)
    digit2, unit2 = get_digit_and_unit(value2)

    # Convert both sides to the base unit so the offset can be added there.
    digit1_converted = digit1 * unit_conversions[unit1]
    digit2_converted = digit2 * unit_conversions[unit2]

    for operation in operations:
        # Offset expressed in the base unit, up to half of the original value.
        random_value = round(random.uniform(0.0001, digit1_converted * 0.5), DECIMALS)
        if operation == '<':
            # Enlarge the right-hand side, keeping its original unit.
            new_digit2_converted = digit2_converted + random_value
            new_digit2 = round(new_digit2_converted / unit_conversions[unit2], DECIMALS)
            new_digit2 = _strictly_greater(new_digit2, digit2)
            comparison_data.append([f"{digit1}{unit1}", operation, f"{new_digit2}{unit2}"])
        else:
            # Enlarge the left-hand side, keeping its original unit.
            new_digit1_converted = digit1_converted + random_value
            new_digit1 = round(new_digit1_converted / unit_conversions[unit1], DECIMALS)
            new_digit1 = _strictly_greater(new_digit1, digit1)
            comparison_data.append([f"{new_digit1}{unit1}", operation, f"{digit2}{unit2}"])

comparison_data = pd.DataFrame(comparison_data, columns=["value1", "operation", "value2"])
comparison_data

  0%|          | 0/450000 [00:00<?, ?it/s]

100%|██████████| 450000/450000 [00:29<00:00, 15353.34it/s]


Unnamed: 0,value1,operation,value2
0,857.89m,<,1.25962km
1,973.3047m,>,0.85789km
2,546.04mm,<,75.941cm
3,803.56mm,>,54.604cm
4,35.19cm,<,359.52mm
...,...,...,...
899995,4940.41ml,>,3.6221l
899996,1587.37ml,<,1.72542l
899997,2160.14ml,>,1.58737l
899998,2372.59ml,<,2.79147l


In [15]:
# # Disabled check: collect and display the rows whose value strings contain "e" (e.g. a float printed in scientific notation such as 1e-05)

# delete_idx = []

# for idx, row in comparison_data.iterrows():
#     if 'e' in row['value1'] or 'e' in row['value2']:
#         delete_idx.append(idx)
        
# # Show me the rows that contain "e"
# comparison_data.loc[delete_idx]

Unnamed: 0,value1,operation,value2


In [4]:
# Remove exact duplicate rows and renumber the index from 0.

comparison_data = (
    comparison_data
    .drop_duplicates()
    .reset_index(drop=True)
)
comparison_data

Unnamed: 0,value1,operation,value2
0,857.89m,<,1.25962km
1,973.3047m,>,0.85789km
2,546.04mm,<,75.941cm
3,803.56mm,>,54.604cm
4,35.19cm,<,359.52mm
...,...,...,...
899995,4940.41ml,>,3.6221l
899996,1587.37ml,<,1.72542l
899997,2160.14ml,>,1.58737l
899998,2372.59ml,<,2.79147l


In [5]:
# Split the generated pairs by relation, each subset with a fresh 0-based index.
is_greater = comparison_data["operation"] == ">"
is_less = comparison_data["operation"] == "<"

greater_data = comparison_data.loc[is_greater].reset_index(drop=True)
less_data = comparison_data.loc[is_less].reset_index(drop=True)

In [6]:
# Save the data into one Excel workbook with a sheet per relation:
# 'equal', 'greater' and 'less'.

output_path = "/workspace/Data/NumericalNet/20240403_numericalnet_data_large_v2.xlsx"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Sheets are written in insertion order.
sheets = {
    'equal': data,
    'greater': greater_data,
    'less': less_data,
}

# The engine has to be chosen on the ExcelWriter itself: DataFrame.to_excel
# ignores its own `engine` argument when it is handed an existing writer.
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)