In [20]:
import sys
sys.path.append("../")

import os
import numpy as np
import pandas as pd
from collections import OrderedDict
from utils import get_digit_and_unit
from tqdm import tqdm
import random

## Generate dataset
    - km, m, cm, mm
    - kg, g, mg
    - ml, l

In [35]:
data = pd.DataFrame([], columns=["value1", "operation", "value2"])
max_precision = 2

unit_conversions = {
    'kg': 1000,
    'g': 1,
    'mg': 0.001,
    'km': 1000,
    'm': 1,
    'cm': 0.01,
    'mm': 0.001,
    "l": 1,
    "ml": 0.001
}

In [36]:
size = 100000
gen_size = size*5

# Generate m data (float)
km_data = np.round(np.random.uniform(0.01, 999, gen_size), max_precision)
m_data = np.round(np.random.uniform(0.01, 999, gen_size), max_precision)
cm_data = np.round(np.random.uniform(0.01, 999, gen_size), max_precision)
mm_data = np.round(np.random.uniform(0.01, 999, gen_size), max_precision)

generated_data = {
    "km": km_data,
    "m": m_data,
    "cm": cm_data,
    "mm": mm_data

}

# Create a DataFrame
df = pd.DataFrame()

# Create a list to store the pair combinations
pair_combinations = [
    ('km', 'm'),
    ('m', 'km'),
    ('m', 'cm'),
    ('cm', 'm'),
    ('cm', 'mm'),
    ('mm', 'cm'),
]

value_to_multiple_by_unit = {
    "km2m": 1000,
    "m2km": 0.001,
    "m2cm": 100,
    "cm2m": 0.01,
    "cm2mm": 10,
    "mm2cm": 0.1,
}

precision_mapping = {
    "km2m": -3,
    "m2km": 3,
    "m2cm": -2,
    "cm2m": 2,
    "cm2mm": -1,
    "mm2cm": 1,
}


for unit1, unit2 in pair_combinations:
    temp_data = []        
    for idx in range(size):        
        digit1 = np.round(generated_data[unit1][idx], max_precision)
        digit2 = np.round(generated_data[unit2][idx], max_precision)
        
        converted_digit1 = np.round(digit1 * value_to_multiple_by_unit[f"{unit1}2{unit2}"], max_precision + precision_mapping[f"{unit1}2{unit2}"])
        converted_digit2 = np.round(digit2 * value_to_multiple_by_unit[f"{unit2}2{unit1}"], max_precision + precision_mapping[f"{unit2}2{unit1}"])

        temp_data.append({"value1": f"{digit1}{unit1}", "operation": "=", "value2": f"{converted_digit1}{unit2}", "type": "length"})
        temp_data.append({"value1": f"{digit2}{unit2}", "operation": "=", "value2": f"{converted_digit2}{unit1}", "type": "length"})
    temp_df = pd.DataFrame(temp_data)
    data = pd.concat([data, temp_df], ignore_index=True)

data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,value1,operation,value2,type
0,397.34km,=,397340.0m,length
1,915.03m,=,0.91503km,length
2,710.07km,=,710070.0m,length
3,412.53m,=,0.41253km,length
4,457.9km,=,457900.0m,length
...,...,...,...,...
1199995,593.13cm,=,5931.3mm,length
1199996,365.15mm,=,36.515cm,length
1199997,699.36cm,=,6993.6mm,length
1199998,528.8mm,=,52.88cm,length


In [37]:
# # Get the half of the data

# data = data.sample(frac=0.5)
# data.reset_index(drop=True, inplace=True)
# data

In [38]:
size = 100000
gen_size = size*5

g_data = np.round(np.random.uniform(0.01, 999, gen_size), max_precision)
kg_data = np.round(np.random.uniform(0.001, 999, gen_size), max_precision)
mg_data = np.round(np.random.uniform(0.01, 999, gen_size), max_precision)

generated_data = {
    'g': g_data,
    'kg': kg_data,
    'mg': mg_data
}

df = pd.DataFrame()

# Create a list to store the pair combinations
pair_combinations = [
    ('kg', 'g'),
    ('g', 'kg'),
    ('g', 'mg'),
    ('mg', 'g'),
]

value_to_multiple_by_unit = {
    "kg2g": 1000,
    "g2kg": 0.001,
    "g2mg": 1000,
    "mg2g": 0.001,
}

precision_mapping = {
    "kg2g": -3,
    "g2kg": 3,
    "g2mg": -3,
    "mg2g": 3,
}

for unit1, unit2 in pair_combinations:
    temp_data = []        
    for idx in range(size):        
        digit1 = np.round(generated_data[unit1][idx], max_precision)
        digit2 = np.round(generated_data[unit2][idx], max_precision)
        
        converted_digit1 = np.round(digit1 * value_to_multiple_by_unit[f"{unit1}2{unit2}"], max_precision + precision_mapping[f"{unit1}2{unit2}"])
        converted_digit2 = np.round(digit2 * value_to_multiple_by_unit[f"{unit2}2{unit1}"], max_precision + precision_mapping[f"{unit2}2{unit1}"])

        temp_data.append({"value1": f"{digit1}{unit1}", "operation": "=", "value2": f"{converted_digit1}{unit2}", "type": "weight"})
        temp_data.append({"value1": f"{digit2}{unit2}", "operation": "=", "value2": f"{converted_digit2}{unit1}", "type": "weight"})
    temp_df = pd.DataFrame(temp_data)
    data = pd.concat([data, temp_df], ignore_index=True)

data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,value1,operation,value2,type
0,397.34km,=,397340.0m,length
1,915.03m,=,0.91503km,length
2,710.07km,=,710070.0m,length
3,412.53m,=,0.41253km,length
4,457.9km,=,457900.0m,length
...,...,...,...,...
1999995,243.09g,=,243090.0mg,weight
1999996,311.64mg,=,0.31164g,weight
1999997,834.94g,=,834940.0mg,weight
1999998,477.22mg,=,0.47722g,weight


In [39]:
size = 100000
gen_size = size*5

l_data = np.round(np.random.uniform(0.001, 999, gen_size), max_precision)
ml_data = np.round(np.random.uniform(0.01, 9999, gen_size), max_precision)

generated_data = {
    "l": l_data,
    "ml": ml_data
}

df = pd.DataFrame()

# Create a list to store the pair combinations
pair_combinations = [
    ('l', 'ml'),
    ('ml', 'l'),
]

value_to_multiple_by_unit = {
    "l2ml": 1000,
    "ml2l": 0.001,
}

precision_mapping = {
    "l2ml": -3,
    "ml2l": 3,
}

for unit1, unit2 in pair_combinations:
    temp_data = []        
    for idx in range(size):        
        digit1 = np.round(generated_data[unit1][idx], max_precision)
        digit2 = np.round(generated_data[unit2][idx], max_precision)
        
        converted_digit1 = np.round(digit1 * value_to_multiple_by_unit[f"{unit1}2{unit2}"], max_precision + precision_mapping[f"{unit1}2{unit2}"])
        converted_digit2 = np.round(digit2 * value_to_multiple_by_unit[f"{unit2}2{unit1}"], max_precision + precision_mapping[f"{unit2}2{unit1}"])

        temp_data.append({"value1": f"{digit1}{unit1}", "operation": "=", "value2": f"{converted_digit1}{unit2}", "type": "volume"})
        temp_data.append({"value1": f"{digit2}{unit2}", "operation": "=", "value2": f"{converted_digit2}{unit1}", "type": "volume"})
    temp_df = pd.DataFrame(temp_data)
    data = pd.concat([data, temp_df], ignore_index=True)

data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,value1,operation,value2,type
0,397.34km,=,397340.0m,length
1,915.03m,=,0.91503km,length
2,710.07km,=,710070.0m,length
3,412.53m,=,0.41253km,length
4,457.9km,=,457900.0m,length
...,...,...,...,...
2399995,122.22l,=,122220.0ml,volume
2399996,6875.57ml,=,6.87557l,volume
2399997,359.0l,=,359000.0ml,volume
2399998,4407.55ml,=,4.40755l,volume


In [40]:
# # Generate km² data (float)
# km2_data = np.round(np.logspace(np.log10(0.000001), np.log10(1), num=gen_size), 6)

# # Generate corresponding m² data
# m2_data = np.round(km2_data * 1000000, 6)

# # Create a DataFrame
# df = pd.DataFrame()

# # Add km² and m² columns to the DataFrame
# df['km2'] = np.random.choice(km2_data, size=size)
# df['m2'] = np.round(df['km2'] * 1000000, 6)

# # Create a list to store the pair combinations
# pair_combinations = [
#     ('km2', 'm2')
# ]

# # Iterate over each pair combination
# for col1, col2 in pair_combinations:
#     temp_data = []
#     for idx, row in df.iterrows():
#         value1 = f"{row[col1]}{col1}"
#         value2 = f"{row[col2]}{col2}"
#         temp_data.append({"value1": value1, "operation": "=", "value2": value2})
#     temp_df = pd.DataFrame(temp_data)
#     data = pd.concat([data, temp_df], ignore_index=True)

# data.reset_index(drop=True, inplace=True)
# data

## Preprocess generated dataset

In [8]:
# # if there is "e" in the data, remove it

# delete_idx = []

# for idx, row in data.iterrows():
#     if 'e' in row['value1'] or 'e' in row['value2']:
#         delete_idx.append(idx)
        
# # Show me the rows that contain "e"
# data.loc[delete_idx]

Unnamed: 0,value1,operation,value2,type
6813,0.03m,=,3e-05km,length
26812,0.03m,=,3e-05km,length
123119,0.02g,=,2e-05kg,weight
127875,0.01g,=,1e-05kg,weight
143118,0.02g,=,2e-05kg,weight
147874,0.01g,=,1e-05kg,weight
178477,0.03mg,=,3e-05g,weight
198476,0.03mg,=,3e-05g,weight


In [9]:
# # drop the rows with "e" in the data

# data.drop(delete_idx, inplace=True)
# data.reset_index(drop=True, inplace=True)
# data

Unnamed: 0,value1,operation,value2,type
0,929.26km,=,929260.0m,length
1,840.19m,=,0.84019km,length
2,836.97km,=,836970.0m,length
3,567.26m,=,0.56726km,length
4,427.71km,=,427710.0m,length
...,...,...,...,...
239987,42.9l,=,42900.0ml,volume
239988,5108.06ml,=,5.10806l,volume
239989,93.86l,=,93860.0ml,volume
239990,2060.23ml,=,2.06023l,volume


In [41]:
# Categorize the digit
def categorize_digit(digit):
    if 0 <= digit < 1e-10:
        return "[0, 1e-10]"
    elif 1e-10 <= digit < 1e-9:
        return "[1e-10, 1e-9]"
    elif 1e-9 <= digit < 1e-8:
        return "[1e-9, 1e-8]"
    elif 1e-8 <= digit < 1e-7:
        return "[1e-8, 1e-7]"
    elif 1e-7 <= digit < 1e-6:
        return "[1e-7, 1e-6]"
    elif 1e-6 <= digit < 1e-5:
        return "[1e-6, 1e-5]"
    elif 1e-5 <= digit < 1e-4:
        return "[1e-5, 1e-4]"
    elif 1e-4 <= digit < 1e-3:
        return "[1e-4, 1e-3]"
    elif 1e-3 <= digit < 1e-2:
        return "[1e-3, 1e-2]"
    elif 1e-2 <= digit < 1e-1:
        return "[1e-2, 1e-1]"
    elif 1e-1 <= digit < 1:
        return "[1e-1, 1]"
    elif 1 <= digit < 10:
        return "[1, 10]"
    elif 10 <= digit < 100:
        return "[10, 100]"
    elif 100 <= digit < 1000:
        return "[100, 1000]"
    elif 1000 <= digit < 10000:
        return "[1000, 10000]"
    elif 10000 <= digit < 100000:
        return "[10000, 100000]"
    elif 100000 <= digit < 1000000:
        return "[100000, 1000000]"
    elif 1000000 <= digit < 1e7:
        return "[1000000, 1e7]"
    elif 1e7 <= digit < 1e8:
        return "[1e7, 1e8]"
    elif 1e8 <= digit < 1e9:
        return "[1e8, 1e9]"
    elif 1e9 <= digit < 1e10:
        return "[1e9, 1e10]"
    else:
        return "[1e10, ..]"

digit_range_count = OrderedDict([
    ("[0, 1e-10]", 0),
    ("[1e-10, 1e-9]", 0),
    ("[1e-9, 1e-8]", 0),
    ("[1e-8, 1e-7]", 0),
    ("[1e-7, 1e-6]", 0),
    ("[1e-6, 1e-5]", 0),
    ("[1e-5, 1e-4]", 0),
    ("[1e-4, 1e-3]", 0),
    ("[1e-3, 1e-2]", 0),
    ("[1e-2, 1e-1]", 0),
    ("[1e-1, 1]", 0),
    ("[1, 10]", 0),
    ("[10, 100]", 0),
    ("[100, 1000]", 0),
    ("[1000, 10000]", 0),
    ("[10000, 100000]", 0),
    ("[100000, 1000000]", 0),
    ("[1000000, 1e7]", 0),
    ("[1e7, 1e8]", 0),
    ("[1e8, 1e9]", 0),
    ("[1e9, 1e10]", 0),
    ("[1e10, ..]", 0)
])

for idx, row in data.iterrows():
    value1, unit1 = get_digit_and_unit(row['value1'])
    value2, unit2 = get_digit_and_unit(row['value2'])
    
    for digit in [value1, value2]:
        category = categorize_digit(digit)
        digit_range_count[category] += 1
        
# Print the count for each digit range
for range_str, count in digit_range_count.items():
    print(f"{range_str}: {count}")

[0, 1e-10]: 0
[1e-10, 1e-9]: 0
[1e-9, 1e-8]: 0
[1e-8, 1e-7]: 0
[1e-7, 1e-6]: 0
[1e-6, 1e-5]: 0
[1e-5, 1e-4]: 0
[1e-4, 1e-3]: 634
[1e-3, 1e-2]: 5948
[1e-2, 1e-1]: 58088
[1e-1, 1]: 579484
[1, 10]: 398998
[10, 100]: 380846
[100, 1000]: 2018656
[1000, 10000]: 385500
[10000, 100000]: 251640
[100000, 1000000]: 720206
[1000000, 1e7]: 0
[1e7, 1e8]: 0
[1e8, 1e9]: 0
[1e9, 1e10]: 0
[1e10, ..]: 0


In [42]:
invalid_idx = []
max_digit_length = 5
for idx, row in data.iterrows():
    value1, unit1 = get_digit_and_unit(row['value1'])
    value2, unit2 = get_digit_and_unit(row['value2'])
    
    if len(str(value1).split(".")[0]) > max_digit_length or len(str(value2).split(".")[0]) > max_digit_length:
        invalid_idx.append(idx)
        
    if len(str(value1).split(".")[1]) > max_digit_length or len(str(value2).split(".")[1]) > max_digit_length:
        invalid_idx.append(idx)

data.loc[invalid_idx]

Unnamed: 0,value1,operation,value2,type
0,397.34km,=,397340.0m,length
2,710.07km,=,710070.0m,length
4,457.9km,=,457900.0m,length
6,946.81km,=,946810.0m,length
8,186.71km,=,186710.0m,length
...,...,...,...,...
2399991,876.84l,=,876840.0ml,volume
2399993,125.79l,=,125790.0ml,volume
2399995,122.22l,=,122220.0ml,volume
2399997,359.0l,=,359000.0ml,volume


In [43]:
data.drop(invalid_idx, inplace=True)
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,value1,operation,value2,type
0,915.03m,=,0.91503km,length
1,412.53m,=,0.41253km,length
2,171.17m,=,0.17117km,length
3,135.56m,=,0.13556km,length
4,665.96m,=,0.66596km,length
...,...,...,...,...
1679789,6941.94ml,=,6.94194l,volume
1679790,3069.92ml,=,3.06992l,volume
1679791,8623.67ml,=,8.62367l,volume
1679792,6875.57ml,=,6.87557l,volume


In [28]:
# Delete the rows with the different value+unit

unit_conversions = {
    'kg': 1000,
    'g': 1,
    'mg': 0.001,
    'km': 1000,
    'm': 1,
    'cm': 0.01,
    'mm': 0.001,
    "l": 1,
    "ml": 0.001
}

invalid_idx = []
for idx, row in data.iterrows():
    value1, unit1 = get_digit_and_unit(row['value1'])
    value2, unit2 = get_digit_and_unit(row['value2'])
    
    converted_value1 = round(value1 * unit_conversions[unit1], 5)
    converted_value2 = round(value2 * unit_conversions[unit2], 5)
    
    if converted_value1 != converted_value2:
        invalid_idx.append(idx)
        
data.loc[invalid_idx]

Unnamed: 0,value1,operation,value2,type
3780,0.08m,=,8e-05km,length
14934,0.02m,=,2e-05km,length
18419,0.07m,=,7e-05km,length
21779,0.02m,=,2e-05km,length
24320,0.02m,=,2e-05km,length
69882,0.05m,=,5e-05km,length
83381,0.08m,=,8e-05km,length
88286,0.02m,=,2e-05km,length
113795,0.08m,=,8e-05km,length
124949,0.02m,=,2e-05km,length


In [14]:
# data.drop(invalid_idx, inplace=True)
# data.reset_index(drop=True, inplace=True)
# data

Unnamed: 0,value1,operation,value2,type
0,840.19m,=,0.84019km,length
1,567.26m,=,0.56726km,length
2,579.33m,=,0.57933km,length
3,17.29km,=,17290.0m,length
4,581.73m,=,0.58173km,length
...,...,...,...,...
168071,8603.48ml,=,8.60348l,volume
168072,42.9l,=,42900.0ml,volume
168073,5108.06ml,=,5.10806l,volume
168074,93.86l,=,93860.0ml,volume


In [44]:
# delete duplicated rows

data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,value1,operation,value2,type
0,915.03m,=,0.91503km,length
1,412.53m,=,0.41253km,length
2,171.17m,=,0.17117km,length
3,135.56m,=,0.13556km,length
4,665.96m,=,0.66596km,length
...,...,...,...,...
562845,6941.94ml,=,6.94194l,volume
562846,3069.92ml,=,3.06992l,volume
562847,8623.67ml,=,8.62367l,volume
562848,6875.57ml,=,6.87557l,volume


In [45]:
# ratio of the data by type

data['type'].value_counts(normalize=False)

length    322250
weight    139079
volume    101521
Name: type, dtype: int64

In [46]:
# Delete half of the data where type is length

data_length = data[data['type'] == 'length']
data_weight = data[data['type'] == 'weight']
data_volume = data[data['type'] == 'volume']

# 20000 sample from the length data
data_length = data_length.sample(n=210000)
data_weight = data_weight.sample(n=139000)
data_volume = data_volume.sample(n=101000)
# data_length = data_length.sample(frac=0.5)
# data_length.reset_index(drop=True, inplace=True)

data = pd.concat([data_length, data_weight, data_volume], ignore_index=True)
data.reset_index(drop=True, inplace=True)
print(len(data))
data['type'].value_counts(normalize=True)

450000


length    0.466667
weight    0.308889
volume    0.224444
Name: type, dtype: float64

In [47]:
# output the data to a csv file

output_path = "/workspace/Data/NumericalNet/large_numericalnet_data_equal_v2.xlsx"

with pd.ExcelWriter(output_path) as writer:
    data.to_excel(writer, index=False)