In [1]:
import sys
sys.path.append("../")

import os
import random
import pandas as pd
from tqdm import tqdm
from utils import get_digit_and_unit

# Input workbook holding the "equal" pairs that are loaded into `data` below.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data_path = "/workspace/Data/NumericalNet/small_numericalnet_data_equal_v2.xlsx"

In [2]:
# Read the source pairs from the workbook (openpyxl handles the .xlsx format)
data = pd.read_excel(io=data_path, engine="openpyxl")
data

Unnamed: 0,value1,operation,value2,type
0,103.53m,=,0.10353km,length
1,405.51cm,=,4.0551m,length
2,743.24mm,=,74.324cm,length
3,911.22m,=,0.91122km,length
4,863.07mm,=,86.307cm,length
...,...,...,...,...
44995,2469.0ml,=,2.469l,volume
44996,8894.31ml,=,8.89431l,volume
44997,8530.37ml,=,8.53037l,volume
44998,4063.05ml,=,4.06305l,volume


In [14]:
# Factor that converts a quantity expressed in each unit into the base unit of
# its family (g for the mass units, m for the length units, l for the volume units).
unit_conversions = {
    'kg': 1000,
    'g': 1,
    'mg': 0.001,
    'km': 1000,
    'm': 1,
    'cm': 0.01,
    'mm': 0.001,
    "l": 1,
    "ml": 0.001
}

# Number of decimals kept in every generated value.
ROUND_DIGITS = 5
# Smallest offset (in base units) that may be added to a value.
MIN_OFFSET = 0.0001
# A private, seeded generator: re-running the notebook on a fresh kernel rebuilds
# the same dataset, and the global `random` state is left untouched.
RANDOM_SEED = 42
rng = random.Random(RANDOM_SEED)


def _enlarge(digit, unit, max_offset, max_tries=100):
    """Return `digit` (expressed in `unit`) increased by a random positive offset.

    The offset is drawn in base units between MIN_OFFSET and `max_offset`, and the
    result is rounded to ROUND_DIGITS decimals in `unit`. For a large unit (e.g.
    km) a small base-unit offset can vanish in that rounding, which would give
    back `digit` itself and therefore a mislabelled '<' / '>' sample. Such draws
    are rejected and redrawn.

    Args:
        digit: numeric magnitude of the value, expressed in `unit`.
        unit: key of `unit_conversions`.
        max_offset: upper bound of the random offset, in base units.
        max_tries: number of redraws before falling back to the minimal step.

    Returns:
        A number rounded to ROUND_DIGITS decimals that is strictly greater
        than `digit`.

    Raises:
        KeyError: if `unit` is not a key of `unit_conversions`.
    """
    factor = unit_conversions[unit]
    base_value = digit * factor
    for _ in range(max_tries):
        offset = round(rng.uniform(MIN_OFFSET, max_offset), ROUND_DIGITS)
        enlarged = round((base_value + offset) / factor, ROUND_DIGITS)
        if enlarged > digit:
            return enlarged
    # Every draw was rounded away (max_offset is tiny relative to `unit`):
    # fall back to the smallest step that survives the rounding.
    return round(digit + 10 ** -ROUND_DIGITS, ROUND_DIGITS)


comparison_rows = []

for _, row in tqdm(data.iterrows(), total=len(data)):
    # get_digit_and_unit is used here as returning (numeric magnitude, unit string)
    digit1, unit1 = get_digit_and_unit(row['value1'])
    digit2, unit2 = get_digit_and_unit(row['value2'])

    # Offsets are drawn in base units, up to half of value1's magnitude
    max_offset = digit1 * unit_conversions[unit1] * 0.5

    # '<': keep value1 and enlarge value2. This relies on the source row being
    # an equal pair (value1 = value2), so any value above value2 is above value1.
    new_digit2 = _enlarge(digit2, unit2, max_offset)
    comparison_rows.append([f"{digit1}{unit1}", '<', f"{new_digit2}{unit2}"])

    # '>': enlarge value1 and keep value2
    new_digit1 = _enlarge(digit1, unit1, max_offset)
    comparison_rows.append([f"{new_digit1}{unit1}", '>', f"{digit2}{unit2}"])

comparison_data = pd.DataFrame(comparison_rows, columns=["value1", "operation", "value2"])
comparison_data

100%|██████████| 45000/45000 [00:02<00:00, 15339.13it/s]


Unnamed: 0,value1,operation,value2
0,103.53m,<,0.11307km
1,111.40869m,>,0.10353km
2,405.51cm,<,6.03097m
3,576.949cm,>,4.0551m
4,743.24mm,<,79.851cm
...,...,...,...
89995,11546.32ml,>,8.53037l
89996,4063.05ml,<,4.77248l
89997,5563.24ml,>,4.06305l
89998,464.3ml,<,0.63199l


In [15]:
# if there is "e" in the data, remove it
# (a float whose magnitude is below 1e-4 is rendered in scientific notation when it
# is formatted into a string, e.g. "1e-05km"; none of the unit suffixes contains "e")

has_exponent = (
    comparison_data['value1'].str.contains('e', regex=False)
    | comparison_data['value2'].str.contains('e', regex=False)
)
delete_idx = comparison_data.index[has_exponent].tolist()

# Keep the offending rows for inspection, then actually drop them so they
# cannot reach the saved workbook
rows_with_exponent = comparison_data.loc[delete_idx]
comparison_data = comparison_data.drop(index=delete_idx)

# Show me the rows that contained "e" (they are no longer in comparison_data)
rows_with_exponent

Unnamed: 0,value1,operation,value2


In [16]:
# Remove exact duplicate rows, then renumber the index from 0

comparison_data = (
    comparison_data
    .drop_duplicates()
    .reset_index(drop=True)
)
comparison_data

Unnamed: 0,value1,operation,value2
0,103.53m,<,0.11307km
1,111.40869m,>,0.10353km
2,405.51cm,<,6.03097m
3,576.949cm,>,4.0551m
4,743.24mm,<,79.851cm
...,...,...,...
89995,11546.32ml,>,8.53037l
89996,4063.05ml,<,4.77248l
89997,5563.24ml,>,4.06305l
89998,464.3ml,<,0.63199l


In [17]:
# Split the comparison pairs by operator; each subset gets a fresh 0-based index
is_greater = comparison_data["operation"].eq(">")
is_less = comparison_data["operation"].eq("<")

greater_data = comparison_data.loc[is_greater].reset_index(drop=True)
less_data = comparison_data.loc[is_less].reset_index(drop=True)

In [18]:
# Save the data into one excel file with different sheets name: equal, greater, less

output_path = "/workspace/Data/NumericalNet/20240401_numericalnet_data_small_v2.xlsx"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# One sheet per relation, written in this order
sheets = {
    'equal': data,
    'greater': greater_data,
    'less': less_data,
}

# The engine has to be selected on the ExcelWriter itself: `to_excel` ignores
# its own `engine` argument when it is handed an already-open writer.
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)