In [55]:
import os
import sys
from collections import OrderedDict
from decimal import Decimal

import pandas as pd

# The project-local `utils` module lives one directory up; extend the search
# path before importing from it.
sys.path.append("../")

from utils import get_digit_and_unit

# Directory and file name of the raw workbook; the two are joined with
# os.path.join when the workbook is read.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
base_path = "/workspace/Data/NumericalNet"
data_fn = "20240327_numericalnet_data_from_claude_raw.xlsx"

In [49]:
# Read the workbook without treating any row as a header, then name its single
# column "data". Whatever sits in the first spreadsheet row is kept as row 0.
raw_path = os.path.join(base_path, data_fn)
raw_data = pd.read_excel(raw_path, header=None, engine="openpyxl")
raw_data.columns = ["data"]
raw_data

Unnamed: 0,data
0,data
1,"[5.3kg = 5300g, 9.8m = 980cm, 0.058km = 58m, 2..."
2,"[4.5kg = 4500g, 8.2m = 820cm, 0.055km = 55m, 1..."
3,"[5.7kg = 5700g, 9.5m = 950cm, 0.061km = 61m, 2..."
4,"[4.8kg = 4800g, 7.6m = 760cm, 0.056km = 56m, 1..."
...,...
507,"[1.2345kg = 1234.5g, 0.0123m = 1.23cm, 0.0456k..."
508,"[12.3456kg = 12345.6g, 0.00123m = 0.123cm, 0.0..."
509,"[321.098kg = 321098g, 65.4321m = 6543.21cm, 0...."
510,"[6543210g, 12.3456m = 1234.56cm, 0.00123km = 1..."


In [50]:
# Flatten every cell of the "data" column into individual statements.
# A cell looks like "[5.3kg = 5300g, 9.8m = 980cm, ...]": the first and last
# characters are dropped and the remainder is split on ", ".
temp_data = []
for cell in raw_data["data"]:  # label-based access; positional row[0] is deprecated
    if not isinstance(cell, str):
        continue  # blank / non-text cells carry no statements
    temp_data.extend(cell[1:-1].split(", "))

# Operator to emit for the swapped statement. The key order is also the order
# in which the operators are searched for inside a statement.
mirrored_operation = {"=": "=", ">": "<", "<": ">"}

# Every statement "a <op> b" yields two rows: itself and its mirror "b <op'> a".
# Statements without any of the three operators are skipped.
records = []
for item in temp_data:
    for operation, mirrored in mirrored_operation.items():
        if operation not in item:
            continue
        parts = item.split(operation)
        left, right = parts[0].strip(), parts[1].strip()
        records.append([left, operation, right])
        records.append([right, mirrored, left])
        break  # only the first matching operator is used

data = pd.DataFrame(records, columns=["value1", "operation", "value2"])
data

Unnamed: 0,value1,operation,value2
0,5.3kg,=,5300g
1,5300g,=,5.3kg
2,9.8m,=,980cm
3,980cm,=,9.8m
4,0.058km,=,58m
...,...,...,...
10325,0.0003210987l,=,0.3210987ml
10326,0.2109876ml,=,0.0002109876l
10327,0.0002109876l,=,0.2109876ml
10328,0.1098765ml,=,0.0001098765l


## Preprocess

### Invalid Case 1
    unit은 같은데 값이 다른 경우

In [51]:
# Flag "=" rows whose two sides carry the same unit (e.g. "0.052m = 52m").
# Note: only the units are compared, not the numeric values, so every
# same-unit equality is flagged.
def _is_same_unit_equality(row):
    """Return True when `row` is an "=" statement with identical units on both sides."""
    _, unit1 = get_digit_and_unit(row["value1"])
    _, unit2 = get_digit_and_unit(row["value2"])
    return row["operation"] == "=" and unit1 == unit2


invalid_idx = [idx for idx, row in data.iterrows() if _is_same_unit_equality(row)]

data.loc[invalid_idx]

Unnamed: 0,value1,operation,value2
2404,0.052m,=,52m
2405,52m,=,0.052m
2414,0.037m,=,37m
2415,37m,=,0.037m
2424,0.077m,=,77m
2425,77m,=,0.077m
2434,0.042m,=,42m
2435,42m,=,0.042m
2444,0.033m,=,33m
2445,33m,=,0.033m


In [52]:
# Remove the flagged rows and renumber the remaining ones from 0.
data = data.drop(index=invalid_idx).reset_index(drop=True)
data

Unnamed: 0,value1,operation,value2
0,5.3kg,=,5300g
1,5300g,=,5.3kg
2,9.8m,=,980cm
3,980cm,=,9.8m
4,0.058km,=,58m
...,...,...,...
10303,0.0003210987l,=,0.3210987ml
10304,0.2109876ml,=,0.0002109876l
10305,0.0002109876l,=,0.2109876ml
10306,0.1098765ml,=,0.0001098765l


### Invalid Case 2
    서로 다른 단위가 비교대상에 있는 경우

In [53]:
# Physical quantity measured by each accepted unit.
desired_unit_group = {
    "kg": "weight",
    "g": "weight",
    "mg": "weight",
    "km": "length",
    "m": "length",
    "cm": "length",
    "mm": "length",
    "l": "volume",
    "ml": "volume",
}

# Flag rows that compare different quantities (e.g. "0.053kg = 53m").
# A unit missing from the table is flagged as well instead of raising KeyError.
invalid_idx = []
for idx, row in data.iterrows():
    _, unit1 = get_digit_and_unit(row['value1'])
    _, unit2 = get_digit_and_unit(row['value2'])

    group1 = desired_unit_group.get(unit1)
    group2 = desired_unit_group.get(unit2)
    if group1 is None or group2 is None or group1 != group2:
        invalid_idx.append(idx)

data.loc[invalid_idx]

Unnamed: 0,value1,operation,value2
1504,0.053kg,=,53m
1505,53m,=,0.053kg
1514,0.058kg,=,58m
1515,58m,=,0.058kg
1524,0.032kg,=,32m
1525,32m,=,0.032kg
1534,0.07kg,=,70m
1535,70m,=,0.07kg
1544,0.035kg,=,35m
1545,35m,=,0.035kg


In [54]:
# Remove the flagged rows and renumber the remaining ones from 0.
data = data.drop(index=invalid_idx).reset_index(drop=True)
data

Unnamed: 0,value1,operation,value2
0,5.3kg,=,5300g
1,5300g,=,5.3kg
2,9.8m,=,980cm
3,980cm,=,9.8m
4,0.058km,=,58m
...,...,...,...
10243,0.0003210987l,=,0.3210987ml
10244,0.2109876ml,=,0.0002109876l
10245,0.0002109876l,=,0.2109876ml
10246,0.1098765ml,=,0.0001098765l


### Invalid Case 3
    너무 긴 소수점이나 너무 큰 숫자는 제거

In [37]:
# (exclusive upper bound, label) of every finite bucket, in ascending order.
# Each bucket starts where the previous one ends; the first starts at 0.
_DIGIT_BUCKETS = (
    (1e-10, "[0, 1e-10]"),
    (1e-9, "[1e-10, 1e-9]"),
    (1e-8, "[1e-9, 1e-8]"),
    (1e-7, "[1e-8, 1e-7]"),
    (1e-6, "[1e-7, 1e-6]"),
    (1e-5, "[1e-6, 1e-5]"),
    (1e-4, "[1e-5, 1e-4]"),
    (1e-3, "[1e-4, 1e-3]"),
    (1e-2, "[1e-3, 1e-2]"),
    (1e-1, "[1e-2, 1e-1]"),
    (1, "[1e-1, 1]"),
    (10, "[1, 10]"),
    (100, "[10, 100]"),
    (1000, "[100, 1000]"),
    (10000, "[1000, 10000]"),
    (100000, "[10000, 100000]"),
    (1000000, "[100000, 1000000]"),
    (1e7, "[1000000, 1e7]"),
    (1e8, "[1e7, 1e8]"),
    (1e9, "[1e8, 1e9]"),
    (1e10, "[1e9, 1e10]"),
)


def categorize_digit(digit):
    """Return the label of the order-of-magnitude bucket that contains `digit`.

    Buckets are half-open: the lower bound is included and the upper bound is
    excluded, so 1 falls in "[1, 10]" and 10 falls in "[10, 100]". Everything
    at or above 1e10 is labelled "[1e10, ..]".

    Args:
        digit: A non-negative number.

    Returns:
        The bucket label as a string.

    Raises:
        ValueError: If `digit` is negative or NaN. Such values belong to no
            bucket; previously they were silently labelled "[1e10, ..]".
    """
    # `not digit >= 0` is true for negatives and for NaN (all NaN comparisons are False).
    if not digit >= 0:
        raise ValueError(f"digit must be a non-negative number, got {digit!r}")

    for upper_bound, label in _DIGIT_BUCKETS:
        if digit < upper_bound:
            return label
    return "[1e10, ..]"

# Bucket labels in ascending order; every counter starts at zero.
range_labels = [
    "[0, 1e-10]",
    "[1e-10, 1e-9]",
    "[1e-9, 1e-8]",
    "[1e-8, 1e-7]",
    "[1e-7, 1e-6]",
    "[1e-6, 1e-5]",
    "[1e-5, 1e-4]",
    "[1e-4, 1e-3]",
    "[1e-3, 1e-2]",
    "[1e-2, 1e-1]",
    "[1e-1, 1]",
    "[1, 10]",
    "[10, 100]",
    "[100, 1000]",
    "[1000, 10000]",
    "[10000, 100000]",
    "[100000, 1000000]",
    "[1000000, 1e7]",
    "[1e7, 1e8]",
    "[1e8, 1e9]",
    "[1e9, 1e10]",
    "[1e10, ..]",
]
digit_range_count = OrderedDict((label, 0) for label in range_labels)

# Tally the magnitude bucket of both numbers of every row.
for _, record in data.iterrows():
    first_value, _ = get_digit_and_unit(record["value1"])
    second_value, _ = get_digit_and_unit(record["value2"])

    for number in (first_value, second_value):
        digit_range_count[categorize_digit(number)] += 1

# Report how many numbers fell into each bucket.
for label, total in digit_range_count.items():
    print(f"{label}: {total}")

[0, 1e-10]: 0
[1e-10, 1e-9]: 0
[1e-9, 1e-8]: 0
[1e-8, 1e-7]: 0
[1e-7, 1e-6]: 0
[1e-6, 1e-5]: 0
[1e-5, 1e-4]: 0
[1e-4, 1e-3]: 148
[1e-3, 1e-2]: 614
[1e-2, 1e-1]: 1544
[1e-1, 1]: 1180
[1, 10]: 6296
[10, 100]: 3302
[100, 1000]: 2136
[1000, 10000]: 3910
[10000, 100000]: 544
[100000, 1000000]: 368
[1000000, 1e7]: 454
[1e7, 1e8]: 0
[1e8, 1e9]: 0
[1e9, 1e10]: 0
[1e10, ..]: 0


In [39]:
def _digit_lengths(value):
    """Return (integer-part length, fractional-part length) of `value` in plain decimal form.

    The number is formatted in fixed-point notation first, so string forms such
    as "1e-05" are expanded to "0.00001" before being measured. A value with no
    fractional part yields a fractional length of 0 instead of raising
    IndexError. A leading minus sign is not counted.
    """
    text = format(Decimal(str(value)), "f").lstrip("-")
    integer_part, _, fraction_part = text.partition(".")
    return len(integer_part), len(fraction_part)


# Flag rows where either number has more than `max_digit_length` characters
# before or after the decimal point. Each row is flagged at most once, even
# when several parts are too long.
max_digit_length = 5
invalid_idx = []
for idx, row in data.iterrows():
    value1, unit1 = get_digit_and_unit(row['value1'])
    value2, unit2 = get_digit_and_unit(row['value2'])

    part_lengths = (*_digit_lengths(value1), *_digit_lengths(value2))
    if any(length > max_digit_length for length in part_lengths):
        invalid_idx.append(idx)

data.loc[invalid_idx]

Unnamed: 0,value1,operation,value2
3834,5.874km,=,5874000mm
3835,5874000mm,=,5.874km
3838,7.621g,=,0.007621kg
3839,0.007621kg,=,7.621g
3850,1.093km,=,1093000mm
...,...,...,...
10243,0.0003210987l,=,0.3210987ml
10244,0.2109876ml,=,0.0002109876l
10245,0.0002109876l,=,0.2109876ml
10246,0.1098765ml,=,0.0001098765l


In [40]:
# Remove the flagged rows and renumber the remaining ones from 0.
data = data.drop(index=invalid_idx).reset_index(drop=True)
data

Unnamed: 0,value1,operation,value2
0,5.3kg,=,5300g
1,5300g,=,5.3kg
2,9.8m,=,980cm
3,980cm,=,9.8m
4,0.058km,=,58m
...,...,...,...
8683,45.6789l,=,45678.9ml
8684,90123.4ml,=,90.1234l
8685,90.1234l,=,90123.4ml
8686,6543.21ml,=,6.54321l


### Invalid Case 4
    중복제거

In [41]:
# Keep the first occurrence of every identical row and renumber from 0.
data = data.drop_duplicates().reset_index(drop=True)
data

Unnamed: 0,value1,operation,value2
0,5.3kg,=,5300g
1,5300g,=,5.3kg
2,9.8m,=,980cm
3,980cm,=,9.8m
4,0.058km,=,58m
...,...,...,...
3271,32.1098l,=,32109.8ml
3272,43210.9ml,=,43.2109l
3273,43.2109l,=,43210.9ml
3274,6543.21ml,=,6.54321l


## Save the data

In [42]:
# Split the cleaned rows into one frame per comparison operator,
# each with a fresh 0-based index.
operation_column = data["operation"]
equal_data = data.loc[operation_column == "="].reset_index(drop=True)
greater_data = data.loc[operation_column == ">"].reset_index(drop=True)
less_data = data.loc[operation_column == "<"].reset_index(drop=True)

# Row count of each subset.
for caption, subset in (
    ("Equal: ", equal_data),
    ("Greater: ", greater_data),
    ("Less: ", less_data),
):
    print(caption, subset.shape[0])

Equal:  2006
Greater:  635
Less:  635


In [46]:
# Save the three operation subsets into one Excel workbook, one sheet each:
# "equal", "greater" and "less".

output_path = "/workspace/Data/NumericalNet/20240331_numericalnet_data_from_claude.xlsx"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# The "equal" sheet must hold only the "=" rows (equal_data), not the whole
# `data` frame, which also contains the ">" and "<" rows.
sheets = {
    "equal": equal_data,
    "greater": greater_data,
    "less": less_data,
}

# The engine is chosen on the writer; passing it to to_excel has no effect
# once an ExcelWriter object is supplied.
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)