In [99]:
import sys
sys.path.append("../")

import os
import numpy as np
import pandas as pd
from collections import OrderedDict
from utils import get_digit_and_unit
from tqdm import tqdm
import random
import math

## Generate dataset
- Length: km, m, cm, mm
- Weight: kg, g, mg
- Volume: ml, l

In [102]:
def _decimal_count(number):
    """Return the number of characters after the "." in ``str(number)``.

    Returns None when the string form has no "." at all (for example
    ``1e-05`` or an integer), so the caller can reject such values.
    """
    _, dot, fraction = str(number).partition(".")
    return len(fraction) if dot else None


def generate_numeric_dataset(size, value_to_multiply_by_unit, target_units, max_precision=4):
    """Generate unique unit-conversion arithmetic rows for one family of units.

    Each accepted trial draws one base quantity and two units, then emits up
    to twelve rows of the form ``value1 unit1 (+|-) diff_value diff_unit =
    value2 unit2``: four rows with a zero difference (pure conversion) and
    eight rows with a non-zero difference expressed in either unit.

    Args:
        size: Number of rows in the returned frame.
        value_to_multiply_by_unit: Mapping from unit name to the factor that
            converts a base quantity into that unit (e.g. ``{"m": 1, "cm": 100}``).
        target_units: Sequence of unit names to sample pairs from; needs at
            least two entries (``random.sample`` raises ``ValueError`` otherwise).
        max_precision: Maximum number of decimal digits allowed in any value,
            and upper limit on ``int(log10(value))`` for the two operands.

    Returns:
        A shuffled ``pandas.DataFrame`` with exactly ``size`` unique rows and
        the columns value1, unit1, operation, diff_value, diff_unit, value2,
        unit2. Numeric columns are built in one step, so their dtypes are
        inferred from the data rather than from concatenation with an empty
        frame.

    Notes:
        - The difference is quantised so that it can be written with at most
          ``max_precision`` decimals in *both* units. This is exact for
          power-of-ten factors; other factors remain approximate.
        - The loop does not terminate if no sample can ever be accepted
          (for instance, non-positive conversion factors).
    """
    columns = ["value1", "unit1", "operation", "diff_value", "diff_unit", "value2", "unit2"]

    # Dict keys give O(1) de-duplication and keep first-seen order, replacing
    # the per-iteration concat + drop_duplicates on a growing DataFrame.
    unique_rows = {}
    trials = 0

    with tqdm(total=size, unit='rows', desc='Generating dataset') as pbar:
        while len(unique_rows) < size:
            value = round(random.uniform(0, 9999), max_precision)
            unit1, unit2 = random.sample(target_units, 2)

            # Possible when target_units itself contains repeated names.
            if unit1 == unit2:
                continue

            factor1 = value_to_multiply_by_unit[unit1]
            factor2 = value_to_multiply_by_unit[unit2]
            value1 = value * factor1
            value2 = value * factor2

            # log10 is undefined for non-positive numbers; reject explicitly
            # instead of relying on a bare except.
            if value1 <= 0 or value2 <= 0:
                continue

            if int(math.log10(value1)) > max_precision or int(math.log10(value2)) > max_precision:
                continue

            # Reject values whose text form has too many decimals (this also
            # filters out floating-point noise from the multiplication).
            decimals1 = _decimal_count(value1)
            decimals2 = _decimal_count(value2)
            if decimals1 is None or decimals2 is None:
                continue
            if decimals1 > max_precision or decimals2 > max_precision:
                continue

            # A factor of 10**-k shifts k digits behind the decimal point, so
            # the base difference may only use (max_precision - k) decimals to
            # stay exactly representable in both units. The small epsilon
            # guards against log10 landing just below an integer.
            smallest_exponent = min(
                0,
                math.floor(math.log10(factor1) + 1e-9),
                math.floor(math.log10(factor2) + 1e-9),
            )
            diff = round(random.uniform(0, value * 0.2), max_precision + smallest_exponent)
            diff1 = round(diff * factor1, max_precision)
            diff2 = round(diff * factor2, max_precision)

            # Round the results so sums/differences carry no float noise.
            sum1 = round(value1 + diff1, max_precision)
            sub1 = round(value1 - diff1, max_precision)
            sum2 = round(value2 + diff2, max_precision)
            sub2 = round(value2 - diff2, max_precision)

            candidates = [
                (value1, unit1, "+", 0.0, unit1, value2, unit2),
                (value1, unit1, "-", 0.0, unit1, value2, unit2),
                (value1, unit1, "+", 0.0, unit2, value2, unit2),
                (value1, unit1, "-", 0.0, unit2, value2, unit2),
                (value1, unit1, "+", diff1, unit1, sum2, unit2),
                (value1, unit1, "-", diff1, unit1, sub2, unit2),
                (value1, unit1, "+", diff2, unit2, sum2, unit2),
                (value1, unit1, "-", diff2, unit2, sub2, unit2),
                (value1, unit1, "+", diff1, unit1, sum1, unit1),
                (value1, unit1, "-", diff1, unit1, sub1, unit1),
                (value1, unit1, "+", diff2, unit2, sum1, unit1),
                (value1, unit1, "-", diff2, unit2, sub1, unit1),
            ]
            for candidate in candidates:
                unique_rows.setdefault(candidate, None)

            trials += 1
            pbar.update(len(unique_rows) - pbar.n)

    # Shuffle and keep exactly `size` rows (the last trial may overshoot).
    df = pd.DataFrame(list(unique_rows), columns=columns)
    df = df.sample(n=size).reset_index(drop=True)
    print(f"Trials: {trials}")
    return df

In [103]:
# Length dataset: factors convert a base quantity (factor 1 = metres)
# into each target unit.

size = 10000
value_to_multiply_by_unit = dict(km=0.001, m=1, cm=100, mm=1000)
target_units = ["km", "cm", "mm", "m"]

length_df = generate_numeric_dataset(
    size=size,
    value_to_multiply_by_unit=value_to_multiply_by_unit,
    target_units=target_units,
)
length_df

Generating dataset: 10008rows [00:03, 2855.91rows/s]                        

Trials: 834





Unnamed: 0,value1,unit1,operation,diff_value,diff_unit,value2,unit2
0,47120.4200,cm,+,1893.0000,cm,49013.4200,cm
1,63045.1900,cm,-,0.0000,cm,630.4519,m
2,773.0612,m,-,3155.3000,cm,741.5082,m
3,31.5104,m,+,3.2078,m,34718.2000,mm
4,54.0994,m,+,3.2409,m,57340.3000,mm
...,...,...,...,...,...,...,...
9995,5718.5500,cm,+,5223.8000,mm,62409.3000,mm
9996,58981.4100,cm,+,10866.2500,cm,69847.6600,cm
9997,47.7970,m,-,0.0000,m,4779.7000,cm
9998,55115.5400,cm,-,71.8593,m,47929.6100,cm


In [104]:
# Volume dataset: factors convert a base quantity (factor 1 = litres)
# into each target unit.

size = 10000
value_to_multiply_by_unit = dict(l=1, ml=1000)
target_units = ["l", "ml"]

volume_df = generate_numeric_dataset(
    size=size,
    value_to_multiply_by_unit=value_to_multiply_by_unit,
    target_units=target_units,
)
volume_df

Generating dataset: 10004rows [00:03, 2582.32rows/s]                        

Trials: 834





Unnamed: 0,value1,unit1,operation,diff_value,diff_unit,value2,unit2
0,77.6004,l,+,2.9875,l,80.5879,l
1,23.3568,l,+,0.0000,ml,23356.8000,ml
2,75149.7000,ml,-,6.5162,l,68.6335,l
3,57668.0000,ml,-,0.0000,ml,57.6680,l
4,98336.7000,ml,+,9468.9000,ml,107805.6000,ml
...,...,...,...,...,...,...,...
9995,48958.6000,ml,-,0.0000,l,48.9586,l
9996,10934.0000,ml,+,0.0000,ml,10.9340,l
9997,16983.3000,ml,+,2.0511,l,19034.4000,ml
9998,48257.5000,ml,+,6.3858,l,54643.3000,ml


In [105]:
# Weight dataset: factors convert a base quantity (factor 1 = grams)
# into each target unit.

size = 10000
value_to_multiply_by_unit = dict(kg=0.001, g=1, mg=1000)
target_units = ["kg", "g", "mg"]

weight_df = generate_numeric_dataset(
    size=size,
    value_to_multiply_by_unit=value_to_multiply_by_unit,
    target_units=target_units,
)
weight_df

Generating dataset: 10008rows [00:04, 2018.08rows/s]                        

Trials: 834





Unnamed: 0,value1,unit1,operation,diff_value,diff_unit,value2,unit2
0,7.5824,kg,+,0.0000,g,7582.4000,g
1,47.0815,g,+,4.2525,g,51334.0000,mg
2,27299.2000,mg,-,3.9190,g,23380.2000,mg
3,13531.5000,mg,+,2.2231,g,15.7546,g
4,47.5430,g,-,9.1893,g,38.3537,g
...,...,...,...,...,...,...,...
9995,45.3489,g,+,0.0000,mg,45348.9000,mg
9996,40.2109,g,+,4.8372,g,45.0481,g
9997,49120.1000,mg,-,0.0000,mg,49.1201,g
9998,30.6567,g,+,1669.0000,mg,32.3257,g


In [106]:
def generate_int_numeric_dataset(size, target_units, max_value=100):
    """Generate unique integer addition/subtraction rows within a single unit.

    Each accepted trial draws an integer ``value`` below ``max_value`` and a
    non-zero integer ``diff`` of at most half of ``value``, then emits the
    two rows ``value + diff`` and ``value - diff``; all three quantities
    share the same unit.

    Args:
        size: Number of rows in the returned frame.
        target_units: Sequence of unit names to choose from.
        max_value: Upper limit for the sampled ``value``.

    Returns:
        A shuffled ``pandas.DataFrame`` with exactly ``size`` unique rows and
        the columns value1, unit1, operation, diff_value, diff_unit, value2,
        unit2. The frame is built in one step, so dtypes are inferred from
        the data rather than from concatenation with an empty frame.

    Raises:
        ValueError: If ``size`` is larger than the number of distinct rows
            that ``max_value`` and ``target_units`` could ever produce; the
            generation loop would otherwise never finish.
    """
    columns = ["value1", "unit1", "operation", "diff_value", "diff_unit", "value2", "unit2"]

    # Generous O(1) upper bound on distinct rows: 2 operations x units x
    # (number of possible values) x (maximum number of diffs per value).
    magnitude = abs(int(max_value))
    capacity = 2 * len(set(target_units)) * (magnitude + 1) * (magnitude // 2 + 1)
    if size > capacity:
        raise ValueError(
            f"Cannot generate {size} unique rows: at most {capacity} are possible "
            f"with max_value={max_value} and {len(set(target_units))} unit(s)."
        )

    # Dict keys give O(1) de-duplication and keep first-seen order, replacing
    # the per-iteration concat + drop_duplicates on a growing DataFrame.
    unique_rows = {}
    trials = 0

    with tqdm(total=size, unit='rows', desc='Generating dataset') as pbar:
        while len(unique_rows) < size:
            value = int(random.uniform(0, max_value))
            unit = random.choice(target_units)
            diff = int(random.uniform(0, value // 2))

            # A zero difference would make "+" and "-" rows trivial.
            if diff == 0:
                continue

            unique_rows.setdefault((value, unit, "+", diff, unit, value + diff, unit), None)
            unique_rows.setdefault((value, unit, "-", diff, unit, value - diff, unit), None)

            trials += 1
            pbar.update(len(unique_rows) - pbar.n)

    # Shuffle and keep exactly `size` rows (the last trial may overshoot).
    df = pd.DataFrame(list(unique_rows), columns=columns)
    df = df.sample(n=size).reset_index(drop=True)
    print(f"Trials: {trials}")
    return df

In [107]:
# Age dataset: integer rows in the single unit "age", values below 150.

size = 5000
max_value = 150
target_units = ["age"]

age_df = generate_int_numeric_dataset(
    size=size,
    target_units=target_units,
    max_value=max_value,
)
age_df

Generating dataset: 100%|██████████| 5000/5000 [00:13<00:00, 359.42rows/s]

Trials: 3867





Unnamed: 0,value1,unit1,operation,diff_value,diff_unit,value2,unit2
0,47,age,+,16,age,63,age
1,138,age,-,51,age,87,age
2,134,age,+,35,age,169,age
3,120,age,-,19,age,101,age
4,135,age,-,66,age,69,age
...,...,...,...,...,...,...,...
4995,103,age,-,45,age,58,age
4996,57,age,-,9,age,48,age
4997,119,age,-,34,age,85,age
4998,144,age,-,43,age,101,age


In [108]:
# Year dataset: integer rows in the single unit "year", values below 2050.

size = 5000
max_value = 2050
target_units = ["year"]

year_df = generate_int_numeric_dataset(
    size=size,
    target_units=target_units,
    max_value=max_value,
)
year_df

Generating dataset: 100%|██████████| 5000/5000 [00:08<00:00, 564.71rows/s]

Trials: 2508





Unnamed: 0,value1,unit1,operation,diff_value,diff_unit,value2,unit2
0,402,year,-,192,year,210,year
1,227,year,-,78,year,149,year
2,1466,year,+,1,year,1467,year
3,1105,year,-,159,year,946,year
4,1326,year,-,257,year,1069,year
...,...,...,...,...,...,...,...
4995,1854,year,+,501,year,2355,year
4996,1061,year,+,484,year,1545,year
4997,1711,year,+,717,year,2428,year
4998,565,year,+,61,year,626,year


In [109]:
# Stack the five per-category frames into one frame with a fresh 0..n-1 index.
total_data = pd.concat(
    [length_df, volume_df, weight_df, age_df, year_df],
    ignore_index=True,
)
total_data

Unnamed: 0,value1,unit1,operation,diff_value,diff_unit,value2,unit2
0,47120.42,cm,+,1893.0,cm,49013.42,cm
1,63045.19,cm,-,0.0,cm,630.4519,m
2,773.0612,m,-,3155.3,cm,741.5082,m
3,31.5104,m,+,3.2078,m,34718.2,mm
4,54.0994,m,+,3.2409,m,57340.3,mm
...,...,...,...,...,...,...,...
39995,1854,year,+,501,year,2355,year
39996,1061,year,+,484,year,1545,year
39997,1711,year,+,717,year,2428,year
39998,565,year,+,61,year,626,year


In [113]:
# data_format_v1
# Comparison labels: =, <, >
# Units kept: m, cm, km, mm, l, ml, kg, g, mg ("year" and "age" rows are skipped)

# A zero difference means both sides are equal; otherwise adding makes the
# right-hand side larger ("<") and subtracting makes it smaller (">").
comparison_for_operation = {"+": "<", "-": ">"}

data_v1 = []

for value1, unit1, operation, diff_value, value2, unit2 in zip(
    total_data["value1"],
    total_data["unit1"],
    total_data["operation"],
    total_data["diff_value"],
    total_data["value2"],
    total_data["unit2"],
):
    if unit1 in ["year", "age"] or unit2 in ["year", "age"]:
        continue

    comparison = "=" if diff_value == 0 else comparison_for_operation.get(operation)
    # Rows with a non-zero difference and an unknown operation are dropped.
    if comparison is not None:
        data_v1.append([value1, unit1, comparison, value2, unit2])

data_v1 = pd.DataFrame(data_v1, columns=["value1", "unit1", "operation", "value2", "unit2"])
data_v1

Unnamed: 0,value1,unit1,operation,value2,unit2
0,47120.4200,cm,<,49013.4200,cm
1,63045.1900,cm,=,630.4519,m
2,773.0612,m,>,741.5082,m
3,31.5104,m,<,34718.2000,mm
4,54.0994,m,<,57340.3000,mm
...,...,...,...,...,...
29995,45.3489,g,=,45348.9000,mg
29996,40.2109,g,<,45.0481,g
29997,49120.1000,mg,=,49.1201,g
29998,30.6567,g,<,32.3257,g


In [114]:
# data_format_v1_5
# Comparison labels: =, <, >
# Units kept: m, cm, km, mm, l, ml, kg, g, mg, age, year (no unit is filtered out)

# A zero difference means both sides are equal; otherwise adding makes the
# right-hand side larger ("<") and subtracting makes it smaller (">").
comparison_for_operation = {"+": "<", "-": ">"}

data_v1_5 = []

for value1, unit1, operation, diff_value, value2, unit2 in zip(
    total_data["value1"],
    total_data["unit1"],
    total_data["operation"],
    total_data["diff_value"],
    total_data["value2"],
    total_data["unit2"],
):
    comparison = "=" if diff_value == 0 else comparison_for_operation.get(operation)
    # Rows with a non-zero difference and an unknown operation are dropped.
    if comparison is not None:
        data_v1_5.append([value1, unit1, comparison, value2, unit2])

data_v1_5 = pd.DataFrame(data_v1_5, columns=["value1", "unit1", "operation", "value2", "unit2"])
data_v1_5

Unnamed: 0,value1,unit1,operation,value2,unit2
0,47120.4200,cm,<,49013.4200,cm
1,63045.1900,cm,=,630.4519,m
2,773.0612,m,>,741.5082,m
3,31.5104,m,<,34718.2000,mm
4,54.0994,m,<,57340.3000,mm
...,...,...,...,...,...
39995,1854.0000,year,<,2355.0000,year
39996,1061.0000,year,<,1545.0000,year
39997,1711.0000,year,<,2428.0000,year
39998,565.0000,year,<,626.0000,year


In [117]:
# data_v2
# Operations: +, -
# Units kept: m, cm, km, mm, l, ml, kg, g, mg ("year" and "age" rows are skipped)

data_v2 = [
    [value1, unit1, operation, diff_value, diff_unit, value2, unit2]
    for value1, unit1, operation, diff_value, diff_unit, value2, unit2 in total_data[
        ["value1", "unit1", "operation", "diff_value", "diff_unit", "value2", "unit2"]
    ].itertuples(index=False, name=None)
    # Keep a row only when neither side is expressed in "year" or "age".
    if unit1 not in ["year", "age"] and unit2 not in ["year", "age"]
]

data_v2 = pd.DataFrame(data_v2, columns=["value1", "unit1", "operation", "diff_value", "diff_unit", "value2", "unit2"])
data_v2

Unnamed: 0,value1,unit1,operation,diff_value,diff_unit,value2,unit2
0,47120.4200,cm,+,1893.0000,cm,49013.4200,cm
1,63045.1900,cm,-,0.0000,cm,630.4519,m
2,773.0612,m,-,3155.3000,cm,741.5082,m
3,31.5104,m,+,3.2078,m,34718.2000,mm
4,54.0994,m,+,3.2409,m,57340.3000,mm
...,...,...,...,...,...,...,...
29995,45.3489,g,+,0.0000,mg,45348.9000,mg
29996,40.2109,g,+,4.8372,g,45.0481,g
29997,49120.1000,mg,-,0.0000,mg,49.1201,g
29998,30.6567,g,+,1669.0000,mg,32.3257,g


In [118]:
# data_v2_5
# Operations: +, -
# Units kept: m, cm, km, mm, l, ml, kg, g, mg, age, year (no unit is filtered out)

# Every row of total_data is copied as-is into a plain list before the frame
# is rebuilt, so column dtypes are inferred afresh from the values.
data_v2_5 = [
    list(record)
    for record in total_data[
        ["value1", "unit1", "operation", "diff_value", "diff_unit", "value2", "unit2"]
    ].itertuples(index=False, name=None)
]

data_v2_5 = pd.DataFrame(data_v2_5, columns=["value1", "unit1", "operation", "diff_value", "diff_unit", "value2", "unit2"])
data_v2_5

Unnamed: 0,value1,unit1,operation,diff_value,diff_unit,value2,unit2
0,47120.4200,cm,+,1893.0000,cm,49013.4200,cm
1,63045.1900,cm,-,0.0000,cm,630.4519,m
2,773.0612,m,-,3155.3000,cm,741.5082,m
3,31.5104,m,+,3.2078,m,34718.2000,mm
4,54.0994,m,+,3.2409,m,57340.3000,mm
...,...,...,...,...,...,...,...
39995,1854.0000,year,+,501.0000,year,2355.0000,year
39996,1061.0000,year,+,484.0000,year,1545.0000,year
39997,1711.0000,year,+,717.0000,year,2428.0000,year
39998,565.0000,year,+,61.0000,year,626.0000,year


In [110]:
# output_path = "/workspace/Data/NumericalNet/20240406_numerical_dataset_v2_40000.xlsx"

# total_data.to_excel(output_path, index=False, engine='openpyxl')