In [1]:
import sys
sys.path.append("../")

import pandas as pd
from utils import get_digit_and_unit
from collections import OrderedDict

# Location of the workbook holding the comparison pairs.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data_path = "/workspace/Data/NumericalNet/20240330_numericalnet_data.xlsx"

# Read the three sheets through one shared ExcelFile handle so the workbook
# is opened only once; the generator is fully consumed inside the `with` block.
sheet_names = ('equal', 'greater', 'less')
with pd.ExcelFile(data_path) as reader:
    equal_data, greater_data, less_data = (
        pd.read_excel(reader, sheet_name=sheet) for sheet in sheet_names
    )

# Stack the sheets in the order equal -> greater -> less and renumber rows 0..n-1.
frames = [equal_data, greater_data, less_data]
data = pd.concat(frames, ignore_index=True)
data

Unnamed: 0,value1,operation,value2
0,0.07279km,=,72.79m
1,0.29577km,=,295.77m
2,0.0026km,=,2.6m
3,0.00561km,=,5.61m
4,0.00362km,=,3.62m
...,...,...,...
1808425,0.46237l,<,637.48ml
1808426,0.83469l,<,1159.93ml
1808427,44.27414l,<,58299.49ml
1808428,2.89187l,<,4131.82ml


## Preprocess

In [6]:
# Drop exact duplicate rows and renumber the index 0..n-1.
# `data` is rebound to the chained result instead of being mutated through two
# `inplace=True` calls, so the whole step is a single expression.

data = data.drop_duplicates().reset_index(drop=True)
data

Unnamed: 0,value1,operation,value2
0,0.07279km,=,72.79m
1,0.29577km,=,295.77m
2,0.0026km,=,2.6m
3,0.00561km,=,5.61m
4,0.00362km,=,3.62m
...,...,...,...
1808425,0.46237l,<,637.48ml
1808426,0.83469l,<,1159.93ml
1808427,44.27414l,<,58299.49ml
1808428,2.89187l,<,4131.82ml


In [7]:
# Check for invalid cases
#
# Case 1: an "=" row whose two operands carry the same unit.
# Case 2: a row whose operands belong to different unit groups
#         (e.g. a weight compared with a length).
# Rows with a unit that is missing from `desired_unit_group` cannot be assigned to a
# group; they are collected in `unknown_unit_idx` and reported rather than being
# silently skipped.

invalid_case1_idx = []
invalid_case2_idx = []
unknown_unit_idx = []

# Maps every supported unit to the physical quantity it measures.
desired_unit_group = {
    "kg": "weight",
    "g": "weight",
    "mg": "weight",
    "km": "length",
    "m": "length",
    "cm": "length",
    "mm": "length",
    "l": "volume",
    "ml": "volume",
}

# Walk the three columns in lockstep rather than via `data.iterrows()`:
# iterrows builds a full Series for every row, which is needlessly slow on a
# frame of this size, while zipping the columns yields the same values.
for idx, raw_value1, operation, raw_value2 in zip(
    data.index, data['value1'], data['operation'], data['value2']
):
    # get_digit_and_unit is a project helper; presumably it returns
    # (numeric part, unit string) — only the unit is needed here.
    value1, unit1 = get_digit_and_unit(raw_value1)
    value2, unit2 = get_digit_and_unit(raw_value2)

    # Check for invalid case 1
    if operation == "=" and unit1 == unit2:
        invalid_case1_idx.append(idx)

    # Check for invalid case 2 (only decidable when both units are known)
    group1 = desired_unit_group.get(unit1)
    group2 = desired_unit_group.get(unit2)
    if group1 is None or group2 is None:
        unknown_unit_idx.append(idx)
    elif group1 != group2:
        invalid_case2_idx.append(idx)

# Show invalid case 1 results
print("Invalid Case 1:")
print(data.loc[invalid_case1_idx])

# Show invalid case 2 results
print("\nInvalid Case 2:")
print(data.loc[invalid_case2_idx])

# Only surface unknown-unit rows when there are any, so the normal output is unchanged.
if unknown_unit_idx:
    print("\nRows with a unit outside desired_unit_group:")
    print(data.loc[unknown_unit_idx])

Invalid Case 1:
Empty DataFrame
Columns: [value1, operation, value2]
Index: []

Invalid Case 2:
Empty DataFrame
Columns: [value1, operation, value2]
Index: []


In [8]:
# Remove every row flagged as invalid (case 1 or case 2) and renumber the index.
# NOTE: the index lists refer to the labels `data` had when the check cell ran;
# re-running this cell after the index has been reset would target different rows,
# so re-run the check cell first.
data = (
    data
    .drop(index=invalid_case1_idx + invalid_case2_idx)
    .reset_index(drop=True)
)
data

Unnamed: 0,value1,operation,value2
0,0.07279km,=,72.79m
1,0.29577km,=,295.77m
2,0.0026km,=,2.6m
3,0.00561km,=,5.61m
4,0.00362km,=,3.62m
...,...,...,...
1808425,0.46237l,<,637.48ml
1808426,0.83469l,<,1159.93ml
1808427,44.27414l,<,58299.49ml
1808428,2.89187l,<,4131.82ml


## EDA

In [16]:
# Count how often each unit occurs across both operands of every row,
# then total the counts per unit family: (kg, g, mg) (km, m, cm, mm) (l, ml)

# Unit families, in the order they are reported.
unit_groups = (
    ("kg", "g", "mg"),
    ("km", "m", "cm", "mm"),
    ("l", "ml"),
)

# Pre-seed every known unit with zero so the report order is fixed and units
# that never occur are still printed.
unit_count = OrderedDict((unit, 0) for group in unit_groups for unit in group)

# Zip the two operand columns instead of using `data.iterrows()`, which builds a
# Series per row and is needlessly slow on a frame of this size.
for raw_value1, raw_value2 in zip(data['value1'], data['value2']):
    # get_digit_and_unit is a project helper; presumably it returns
    # (numeric part, unit string) — only the unit is used here.
    value1, unit1 = get_digit_and_unit(raw_value1)
    value2, unit2 = get_digit_and_unit(raw_value2)

    # `.get(..., 0)` lets an unexpected unit be counted (it is appended after the
    # known ones) instead of aborting the whole loop with a KeyError.
    unit_count[unit1] = unit_count.get(unit1, 0) + 1
    unit_count[unit2] = unit_count.get(unit2, 0) + 1

# Print the count for each unit
for unit, count in unit_count.items():
    print(f"{unit}: {count}")

# Print the total count for each unit family
for group in unit_groups:
    print(f"{', '.join(group)}: {sum(unit_count[unit] for unit in group)}")

kg: 187278
g: 397248
mg: 256632
km: 173280
m: 924210
cm: 895710
mm: 468570
l: 156966
ml: 156966
kg, g, mg: 841158
km, m, cm, mm: 2461770
l, ml: 313932
