In [26]:
import sys
sys.path.append("../")

import os
import numpy as np
import pandas as pd
from collections import OrderedDict
from utils import get_digit_and_unit
from tqdm import tqdm
import random

## Generate dataset
    - km, m, cm, mm
    - kg, g, mg
    - ml, l

In [2]:
data = pd.DataFrame([], columns=["value1", "operation", "value2"])
size = 1000000
gen_size = size*5

In [3]:
# Generate km data (float)
km_data = np.round(np.logspace(-6, np.log10(5), num=gen_size), 6)

# Generate corresponding m, cm, and mm data
m_data = np.round(km_data * 1000, 6)
cm_data = np.round(m_data * 100, 6)
mm_data = np.round(cm_data * 10, 6)

# Create a DataFrame
df = pd.DataFrame()

# Add km, m, cm, and mm columns to the DataFrame
df['km'] = np.random.choice(km_data, size=size)
df['m'] = np.round(df['km'] * 1000, 6)
df['cm'] = np.round(df['m'] * 100, 6)
df['mm'] = np.round(df['cm'] * 10, 6)

# Create a list to store the pair combinations
pair_combinations = [
    ('km', 'm'),
    ('km', 'cm'),
    ('km', 'mm'),
    ('m', 'cm'),
    ('m', 'mm'),
    ('cm', 'mm')
]

# Iterate over each pair combination
for col1, col2 in pair_combinations:
    temp_data = []
    for idx, row in df.iterrows():
        value1 = f"{row[col1]}{col1}"
        value2 = f"{row[col2]}{col2}"
        temp_data.append({"value1": value1, "operation": "=", "value2": value2})
    temp_df = pd.DataFrame(temp_data)
    data = pd.concat([data, temp_df], ignore_index=True)

data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,value1,operation,value2
0,0.07279km,=,72.79m
1,0.009323km,=,9.323m
2,3.5e-05km,=,0.035m
3,0.000784km,=,0.784m
4,0.542105km,=,542.105m
...,...,...,...
5999995,66088.0cm,=,660880.0mm
5999996,438.0cm,=,4380.0mm
5999997,1.0cm,=,10.0mm
5999998,5.0cm,=,50.0mm


In [4]:
# Generate kg data (float)
kg_data = np.round(np.logspace(np.log10(0.001), np.log10(500), num=gen_size), 6)

# Generate corresponding g and mg data
g_data = np.round(kg_data * 1000, 6)
mg_data = np.round(g_data * 1000, 6)

# Create a DataFrame
df = pd.DataFrame()

# Add kg, g, and mg columns to the DataFrame
df['kg'] = np.random.choice(kg_data, size=size)
df['g'] = np.round(df['kg'] * 1000, 6)
df['mg'] = np.round(df['g'] * 1000, 6)

# Create a list to store the pair combinations
pair_combinations = [
    ('kg', 'g'),
    ('kg', 'mg'),
    ('g', 'mg')
]

# Iterate over each pair combination
for col1, col2 in pair_combinations:
    temp_data = []
    for idx, row in df.iterrows():
        value1 = f"{row[col1]}{col1}"
        value2 = f"{row[col2]}{col2}"
        temp_data.append({"value1": value1, "operation": "=", "value2": value2})
    temp_df = pd.DataFrame(temp_data)
    data = pd.concat([data, temp_df], ignore_index=True)

data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,value1,operation,value2
0,0.07279km,=,72.79m
1,0.009323km,=,9.323m
2,3.5e-05km,=,0.035m
3,0.000784km,=,0.784m
4,0.542105km,=,542.105m
...,...,...,...
8999995,1661.067g,=,1661067.0mg
8999996,128.886g,=,128886.0mg
8999997,147.988g,=,147988.0mg
8999998,8769.425g,=,8769425.0mg


In [5]:
# Generate l data (float)
l_data = np.round(np.logspace(np.log10(0.001), np.log10(1000), num=gen_size), 6)

# Generate corresponding ml data
ml_data = np.round(l_data * 1000, 6)

# Create a DataFrame
df = pd.DataFrame()

# Add l and ml columns to the DataFrame
df['l'] = np.random.choice(l_data, size=size)
df['ml'] = np.round(df['l'] * 1000, 6)

# Create a list to store the pair combinations
pair_combinations = [
    ('l', 'ml')
]

# Iterate over each pair combination
for col1, col2 in pair_combinations:
    temp_data = []
    for idx, row in df.iterrows():
        value1 = f"{row[col1]}{col1}"
        value2 = f"{row[col2]}{col2}"
        temp_data.append({"value1": value1, "operation": "=", "value2": value2})
    temp_df = pd.DataFrame(temp_data)
    data = pd.concat([data, temp_df], ignore_index=True)

data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,value1,operation,value2
0,0.07279km,=,72.79m
1,0.009323km,=,9.323m
2,3.5e-05km,=,0.035m
3,0.000784km,=,0.784m
4,0.542105km,=,542.105m
...,...,...,...
9999995,0.071938l,=,71.938ml
9999996,5.526517l,=,5526.517ml
9999997,915.328937l,=,915328.937ml
9999998,0.007621l,=,7.621ml


In [43]:
# # Generate km² data (float)
# km2_data = np.round(np.logspace(np.log10(0.000001), np.log10(1), num=gen_size), 6)

# # Generate corresponding m² data
# m2_data = np.round(km2_data * 1000000, 6)

# # Create a DataFrame
# df = pd.DataFrame()

# # Add km² and m² columns to the DataFrame
# df['km2'] = np.random.choice(km2_data, size=size)
# df['m2'] = np.round(df['km2'] * 1000000, 6)

# # Create a list to store the pair combinations
# pair_combinations = [
#     ('km2', 'm2')
# ]

# # Iterate over each pair combination
# for col1, col2 in pair_combinations:
#     temp_data = []
#     for idx, row in df.iterrows():
#         value1 = f"{row[col1]}{col1}"
#         value2 = f"{row[col2]}{col2}"
#         temp_data.append({"value1": value1, "operation": "=", "value2": value2})
#     temp_df = pd.DataFrame(temp_data)
#     data = pd.concat([data, temp_df], ignore_index=True)

# data.reset_index(drop=True, inplace=True)
# data

## Preprocess generated dataset

In [6]:
# if there is "e" in the data, remove it

delete_idx = []

for idx, row in data.iterrows():
    if 'e' in row['value1'] or 'e' in row['value2']:
        delete_idx.append(idx)
        
# Show me the rows that contain "e"
data.loc[delete_idx]

Unnamed: 0,value1,operation,value2
2,3.5e-05km,=,0.035m
6,9.3e-05km,=,0.093m
9,2e-06km,=,0.002m
16,4e-06km,=,0.004m
19,2e-06km,=,0.002m
...,...,...,...
2999975,6.9e-05km,=,69.0mm
2999981,6e-06km,=,6.0mm
2999984,2.5e-05km,=,25.0mm
2999997,1e-05km,=,10.0mm


In [10]:
# drop the rows with "e" in the data

data.drop(delete_idx, inplace=True)
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,value1,operation,value2
0,0.07279km,=,72.79m
1,0.009323km,=,9.323m
2,0.000784km,=,0.784m
3,0.542105km,=,542.105m
4,0.058953km,=,58.953m
...,...,...,...
9104981,0.071938l,=,71.938ml
9104982,5.526517l,=,5526.517ml
9104983,915.328937l,=,915328.937ml
9104984,0.007621l,=,7.621ml


In [11]:
# Categorize the digit
def categorize_digit(digit):
    if 0 <= digit < 1e-10:
        return "[0, 1e-10]"
    elif 1e-10 <= digit < 1e-9:
        return "[1e-10, 1e-9]"
    elif 1e-9 <= digit < 1e-8:
        return "[1e-9, 1e-8]"
    elif 1e-8 <= digit < 1e-7:
        return "[1e-8, 1e-7]"
    elif 1e-7 <= digit < 1e-6:
        return "[1e-7, 1e-6]"
    elif 1e-6 <= digit < 1e-5:
        return "[1e-6, 1e-5]"
    elif 1e-5 <= digit < 1e-4:
        return "[1e-5, 1e-4]"
    elif 1e-4 <= digit < 1e-3:
        return "[1e-4, 1e-3]"
    elif 1e-3 <= digit < 1e-2:
        return "[1e-3, 1e-2]"
    elif 1e-2 <= digit < 1e-1:
        return "[1e-2, 1e-1]"
    elif 1e-1 <= digit < 1:
        return "[1e-1, 1]"
    elif 1 <= digit < 10:
        return "[1, 10]"
    elif 10 <= digit < 100:
        return "[10, 100]"
    elif 100 <= digit < 1000:
        return "[100, 1000]"
    elif 1000 <= digit < 10000:
        return "[1000, 10000]"
    elif 10000 <= digit < 100000:
        return "[10000, 100000]"
    elif 100000 <= digit < 1000000:
        return "[100000, 1000000]"
    elif 1000000 <= digit < 1e7:
        return "[1000000, 1e7]"
    elif 1e7 <= digit < 1e8:
        return "[1e7, 1e8]"
    elif 1e8 <= digit < 1e9:
        return "[1e8, 1e9]"
    elif 1e9 <= digit < 1e10:
        return "[1e9, 1e10]"
    else:
        return "[1e10, ..]"

digit_range_count = OrderedDict([
    ("[0, 1e-10]", 0),
    ("[1e-10, 1e-9]", 0),
    ("[1e-9, 1e-8]", 0),
    ("[1e-8, 1e-7]", 0),
    ("[1e-7, 1e-6]", 0),
    ("[1e-6, 1e-5]", 0),
    ("[1e-5, 1e-4]", 0),
    ("[1e-4, 1e-3]", 0),
    ("[1e-3, 1e-2]", 0),
    ("[1e-2, 1e-1]", 0),
    ("[1e-1, 1]", 0),
    ("[1, 10]", 0),
    ("[10, 100]", 0),
    ("[100, 1000]", 0),
    ("[1000, 10000]", 0),
    ("[10000, 100000]", 0),
    ("[100000, 1000000]", 0),
    ("[1000000, 1e7]", 0),
    ("[1e7, 1e8]", 0),
    ("[1e8, 1e9]", 0),
    ("[1e9, 1e10]", 0),
    ("[1e10, ..]", 0)
])

for idx, row in data.iterrows():
    value1, unit1 = get_digit_and_unit(row['value1'])
    value2, unit2 = get_digit_and_unit(row['value2'])
    
    for digit in [value1, value2]:
        category = categorize_digit(digit)
        digit_range_count[category] += 1
        
# Print the count for each digit range
for range_str, count in digit_range_count.items():
    print(f"{range_str}: {count}")

[0, 1e-10]: 0
[1e-10, 1e-9]: 0
[1e-9, 1e-8]: 0
[1e-8, 1e-7]: 0
[1e-7, 1e-6]: 0
[1e-6, 1e-5]: 0
[1e-5, 1e-4]: 0
[1e-4, 1e-3]: 447429
[1e-3, 1e-2]: 1256839
[1e-2, 1e-1]: 1269333
[1e-1, 1]: 1706244
[1, 10]: 2393020
[10, 100]: 2234725
[100, 1000]: 2273543
[1000, 10000]: 2077845
[10000, 100000]: 1764498
[100000, 1000000]: 1525457
[1000000, 1e7]: 665121
[1e7, 1e8]: 350768
[1e8, 1e9]: 245150
[1e9, 1e10]: 0
[1e10, ..]: 0


In [14]:
invalid_idx = []
max_digit_length = 5
for idx, row in data.iterrows():
    value1, unit1 = get_digit_and_unit(row['value1'])
    value2, unit2 = get_digit_and_unit(row['value2'])
    
    if len(str(value1).split(".")[0]) > max_digit_length or len(str(value2).split(".")[0]) > max_digit_length:
        invalid_idx.append(idx)
        
    if len(str(value1).split(".")[1]) > max_digit_length or len(str(value2).split(".")[1]) > max_digit_length:
        invalid_idx.append(idx)

data.loc[invalid_idx]

Unnamed: 0,value1,operation,value2
1,0.009323km,=,9.323m
2,0.000784km,=,0.784m
3,0.542105km,=,542.105m
4,0.058953km,=,58.953m
5,0.000552km,=,0.552m
...,...,...,...
7843943,5.526517l,=,5526.517ml
7843944,915.328937l,=,915328.937ml
7843944,915.328937l,=,915328.937ml
7843945,0.007621l,=,7.621ml


In [15]:
data.drop(invalid_idx, inplace=True)
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,value1,operation,value2
0,0.07279km,=,72.79m
1,0.29577km,=,295.77m
2,0.0026km,=,2.6m
3,0.00561km,=,5.61m
4,0.00362km,=,3.62m
...,...,...,...
3120449,44.27414l,=,44274.14ml
3120450,2.89187l,=,2891.87ml
3120451,0.29033l,=,290.33ml
3120452,24.2592l,=,24259.2ml


In [16]:
# delete duplicated rows

data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,value1,operation,value2
0,0.07279km,=,72.79m
1,0.29577km,=,295.77m
2,0.0026km,=,2.6m
3,0.00561km,=,5.61m
4,0.00362km,=,3.62m
...,...,...,...
602805,0.46237l,=,462.37ml
602806,0.83469l,=,834.69ml
602807,44.27414l,=,44274.14ml
602808,2.89187l,=,2891.87ml


In [18]:
# # output the data to a csv file

# output_path = "/workspace/Data/NumericalNet/numericalnet_data_equal.xlsx"

# with pd.ExcelWriter(output_path) as writer:
#     data.to_excel(writer, index=False)

## Generate Comparison Dataset

In [24]:
unit_conversions = {
    'kg': 1000,
    'g': 1,
    'mg': 0.001,
    'km': 1000,
    'm': 1,
    'cm': 0.01,
    'mm': 0.001,
    "l": 1,
    "ml": 0.001
}
comparison_data = []

for idx, row in tqdm(data.iterrows(), total=len(data)):
    value1, value2 = row['value1'], row['value2']
    digit1, unit1 = get_digit_and_unit(value1)
    digit2, unit2 = get_digit_and_unit(value2)
    
    # Convert to the same unit
    digit1_converted = digit1 * unit_conversions[unit1]
    digit2_converted = digit2 * unit_conversions[unit2]
    
    operations = ['<', '>']
    for operation in operations:
        random_value = round(random.uniform(0.0001, digit1_converted*0.5), 5)
        if operation == '<':
            new_digit2_converted = digit2_converted + random_value
            new_digit2 = round(new_digit2_converted / unit_conversions[unit2], 5)
            comparison_data.append([f"{digit1}{unit1}", operation, f"{new_digit2}{unit2}"])
        else:
            new_digit1_converted = digit1_converted + random_value
            new_digit1 = round(new_digit1_converted / unit_conversions[unit1], 5)
            comparison_data.append([f"{new_digit1}{unit1}", operation, f"{digit2}{unit2}"])

comparison_data = pd.DataFrame(comparison_data, columns=["value1", "operation", "value2"])
comparison_data

  0%|          | 0/602810 [00:00<?, ?it/s]

100%|██████████| 602810/602810 [00:38<00:00, 15768.73it/s]


Unnamed: 0,value1,operation,value2
0,0.07279km,<,75.04019m
1,0.0831km,>,72.79m
2,0.29577km,<,345.15637m
3,0.4244km,>,295.77m
4,0.0026km,<,2.93016m
...,...,...,...
1205615,62.70551l,>,44274.14ml
1205616,2.89187l,<,4131.82ml
1205617,3.10917l,>,2891.87ml
1205618,0.29033l,<,410.76ml


In [25]:
greater_data = comparison_data[comparison_data["operation"] == ">"].reset_index(drop=True)
less_data = comparison_data[comparison_data["operation"] == "<"].reset_index(drop=True)

In [28]:
# Delete duplicated in greater_data

greater_data.drop_duplicates(inplace=True)
greater_data.reset_index(drop=True, inplace=True)
greater_data

Unnamed: 0,value1,operation,value2
0,0.0831km,>,72.79m
1,0.4244km,>,295.77m
2,0.00355km,>,2.6m
3,0.00719km,>,5.61m
4,0.00377km,>,3.62m
...,...,...,...
602805,0.55667l,>,462.37ml
602806,0.88184l,>,834.69ml
602807,62.70551l,>,44274.14ml
602808,3.10917l,>,2891.87ml


In [29]:
# Delete duplicated in less_data

less_data.drop_duplicates(inplace=True)
less_data.reset_index(drop=True, inplace=True)
less_data

Unnamed: 0,value1,operation,value2
0,0.07279km,<,75.04019m
1,0.29577km,<,345.15637m
2,0.0026km,<,2.93016m
3,0.00561km,<,6.98215m
4,0.00362km,<,5.0009m
...,...,...,...
602805,0.46237l,<,637.48ml
602806,0.83469l,<,1159.93ml
602807,44.27414l,<,58299.49ml
602808,2.89187l,<,4131.82ml


In [30]:
# Save the data into excel file with different sheets name: equal, greater, less

output_path = "/workspace/Data/NumericalNet/20240330_numericalnet_data.xlsx"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with pd.ExcelWriter(output_path) as writer:
    data.to_excel(writer, sheet_name='equal', index=False, engine='openpyxl')
    greater_data.to_excel(writer, sheet_name='greater', index=False, engine='openpyxl')
    less_data.to_excel(writer, sheet_name='less', index=False, engine='openpyxl')