In [59]:
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import mean_absolute_error
from difflib import SequenceMatcher



def calculate_similarity(exp1, exp2):
    """Score how alike two experiment records are (higher means more alike).

    The score is the ``SequenceMatcher`` ratio of the two ``notes`` strings
    (a value between 0 and 1) minus the sum of the absolute differences of
    the five numeric ``*_value`` fields. The numeric differences are added
    in their raw magnitudes, so they are not normalized across fields.

    Args:
        exp1: Mapping with the ``*_value`` fields and a ``notes`` entry.
        exp2: Mapping with the same layout as ``exp1``.

    Returns:
        The similarity score as a float. Two identical records score 1.0.
    """

    def _as_number(raw):
        # Each value is parsed on its own: an unparsable entry on one side
        # must not zero out the valid number it is being compared against.
        # "NA", None and any other non-numeric value count as 0.
        try:
            return float(raw)
        except (TypeError, ValueError):
            return 0.0

    similarity = 0

    # Numeric field similarity (absolute difference)
    for field in ['yield_strength_value', "ultimate_tensile_strength_value", 'ductility_value', 'hardness_value', 'modulus_value']:
        val1 = _as_number(exp1.get(field, "NA"))
        val2 = _as_number(exp2.get(field, "NA"))
        similarity -= abs(val1 - val2)  # Penalize based on difference

    # Text field similarity (SequenceMatcher); a missing or None note is
    # compared as the empty string instead of raising.
    notes1 = exp1.get("notes") or ""
    notes2 = exp2.get("notes") or ""
    similarity += SequenceMatcher(None, notes1, notes2).ratio()  # Add similarity score

    return similarity

def rmse(gt_values, ext_values):
    """Return the root-mean-square of the element-wise difference.

    Both arguments are converted to NumPy arrays before subtracting.
    """
    reference = np.array(gt_values)
    candidate = np.array(ext_values)
    squared_errors = np.square(reference - candidate)
    return np.sqrt(np.mean(squared_errors))

def calculate_performance(ground_truth, extracted, penalty=1.0):
    """Match extracted records to ground-truth records and score the extraction.

    Records are paired one-to-one with the Hungarian algorithm so that the
    total ``calculate_similarity`` score is maximal. For every numeric field
    the RMSE over the matched pairs is computed and divided by the largest
    absolute ground-truth value of that field. The overall loss is the mean
    of these normalized losses plus ``penalty`` times the relative difference
    in record counts.

    Args:
        ground_truth: List of reference records (mappings).
        extracted: List of extracted records (mappings).
        penalty: Weight applied to the record-count mismatch term.

    Returns:
        A dict with the keys ``"Field RMSE"`` (normalized loss per field),
        ``"Overall Loss"``, ``"Similarity Matrix"`` (rows are ground-truth
        records, columns are extracted records), ``"Unmatched Ground Truth"``
        and ``"Unmatched Extracted"`` (sets of unmatched indices).
        The count-mismatch term is 0 when ``ground_truth`` is empty.
    """
    numeric_fields = (
        'yield_strength_value',
        "ultimate_tensile_strength_value",
        'ductility_value',
        'hardness_value',
        'modulus_value',
    )

    def _field_number(record, field):
        # Same parsing rule as the matching step: anything float() accepts is
        # used (including numeric strings and NumPy scalars); "NA", None and
        # other non-numeric values count as 0.
        try:
            return float(record.get(field, "NA"))
        except (TypeError, ValueError):
            return 0.0

    num_ground_truth = len(ground_truth)
    num_extracted = len(extracted)
    similarity_matrix = np.zeros((num_ground_truth, num_extracted))

    # Calculate similarity matrix
    for i, gt in enumerate(ground_truth):
        for j, ext in enumerate(extracted):
            similarity_matrix[i, j] = calculate_similarity(gt, ext)

    # Optimal matching using Hungarian Algorithm; the matrix is negated
    # because the solver minimizes cost. With an empty side there is nothing
    # to assign, so the solver is skipped.
    if num_ground_truth and num_extracted:
        row_ind, col_ind = linear_sum_assignment(-similarity_matrix)
    else:
        row_ind, col_ind = [], []

    # Create matched pairs and identify unmatched
    matched_pairs = [(ground_truth[i], extracted[j]) for i, j in zip(row_ind, col_ind)]
    unmatched_gt = set(range(num_ground_truth)) - set(row_ind)
    unmatched_ext = set(range(num_extracted)) - set(col_ind)

    # Calculate individual performance metrics (normalized RMSE)
    field_rmse = {}
    normalized_losses = []

    if matched_pairs:
        for field in numeric_fields:
            # Process matched pairs only
            gt_values = [_field_number(gt, field) for gt, _ in matched_pairs]
            ext_values = [_field_number(ext, field) for _, ext in matched_pairs]

            error = rmse(gt_values, ext_values)

            # Normalize by magnitude so negative ground-truth values cannot
            # produce a negative loss.
            scale = max(abs(value) for value in gt_values)
            if scale == 0:
                # No usable ground-truth magnitude: fall back to the extracted
                # values so a non-zero error is not reported as zero loss.
                scale = max(abs(value) for value in ext_values)

            normalized_loss = error / scale if scale != 0 else 0
            field_rmse[field] = normalized_loss
            normalized_losses.append(normalized_loss)

    # Calculate overall loss
    avg_normalized_loss = np.mean(normalized_losses) if normalized_losses else 0
    percent_error = abs(num_extracted - num_ground_truth) / num_ground_truth if num_ground_truth > 0 else 0
    overall_loss = avg_normalized_loss + (percent_error * penalty)

    # Display Results
    results = {
        "Field RMSE": field_rmse,  # Normalized individual metrics
        "Overall Loss": overall_loss,  # Penalized overall performance
        "Similarity Matrix": similarity_matrix,
        "Unmatched Ground Truth": unmatched_gt,
        "Unmatched Extracted": unmatched_ext
    }
    return results


In [60]:
# Scenario: the extraction reproduces both ground-truth records exactly.
ground_truth = [
    {
        "yield_strength_value": "NA",
        "yield_strength_units": "NA",
        "ultimate_tensile_strength_value": 70,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": "NA",
        "ductility_units": "NA",
        "hardness_value": 85,
        "hardness_units": "HV",
        "modulus_value": "NA",
        "modulus_units": "NA",
        "notes": "As-sprayed deposit from as-atomized (naturally aged) Al6061 powder at Tgas=500°C and pgas=3 MPa"
    },
    {
        "yield_strength_value": "NA",
        "yield_strength_units": "NA",
        "ultimate_tensile_strength_value": 107,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": "NA",
        "ductility_units": "NA",
        "hardness_value": 69,
        "hardness_units": "HV",
        "modulus_value": "NA",
        "modulus_units": "NA",
        "notes": "As-sprayed deposit from solution-annealed Al6061 powder at Tgas=500°C and pgas=3 MPa"
    }
]

extracted = [
    {
        "yield_strength_value": "NA",
        "yield_strength_units": "NA",
        "ultimate_tensile_strength_value": 70,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": "NA",
        "ductility_units": "NA",
        "hardness_value": 85,
        "hardness_units": "HV",
        "modulus_value": "NA",
        "modulus_units": "NA",
        "notes": "As-sprayed deposit from as-atomized (naturally aged) Al6061 powder at Tgas=500°C and pgas=3 MPa"
    },
    {
        "yield_strength_value": "NA",
        "yield_strength_units": "NA",
        "ultimate_tensile_strength_value": 107,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": "NA",
        "ductility_units": "NA",
        "hardness_value": 69,
        "hardness_units": "HV",
        "modulus_value": "NA",
        "modulus_units": "NA",
        "notes": "As-sprayed deposit from solution-annealed Al6061 powder at Tgas=500°C and pgas=3 MPa"
    }
]

results = calculate_performance(ground_truth, extracted)

# Print every result entry: its name, its value, then a separator line.
for metric_name, metric_value in results.items():
    print(metric_name, metric_value, '-'*30, sep='\n')

Field RMSE
{'yield_strength_value': 0, 'ultimate_tensile_strength_value': np.float64(0.0), 'ductility_value': 0, 'hardness_value': np.float64(0.0), 'modulus_value': 0}
------------------------------
Overall Loss
0.0
------------------------------
Similarity Matrix
[[  1.         -52.19553073]
 [-52.16201117   1.        ]]
------------------------------
Unmatched Ground Truth
set()
------------------------------
Unmatched Extracted
set()
------------------------------


In [61]:

# Scenario: same two records, but the first extracted record differs slightly
# in yield strength (112 vs 114) and ductility (3.1 vs 3).
ground_truth = [
    {
        "yield_strength_value": 114,
        "yield_strength_units": "MPA",
        "ultimate_tensile_strength_value": 70,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 3,
        "ductility_units": "%",
        "hardness_value": 85,
        "hardness_units": "HV",
        "modulus_value": "NA",
        "modulus_units": "NA",
        "notes": "As-sprayed deposit from as-atomized (naturally aged) Al6061 powder at Tgas=500°C and pgas=3 MPa"
    },
    {
        "yield_strength_value": "NA",
        "yield_strength_units": "NA",
        "ultimate_tensile_strength_value": 107,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 5,
        "ductility_units": "%",
        "hardness_value": 69,
        "hardness_units": "HV",
        "modulus_value": "NA",
        "modulus_units": "NA",
        "notes": "As-sprayed deposit from solution-annealed Al6061 powder at Tgas=500°C and pgas=3 MPa"
    }
]

extracted = [
    {
        "yield_strength_value": 112,
        "yield_strength_units": "MPA",
        "ultimate_tensile_strength_value": 70,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 3.1,
        "ductility_units": "%",
        "hardness_value": 85,
        "hardness_units": "HV",
        "modulus_value": "NA",
        "modulus_units": "NA",
        "notes": "As-sprayed deposit from as-atomized (naturally aged) Al6061 powder at Tgas=500°C and pgas=3 MPa"
    },
    {
        "yield_strength_value": "NA",
        "yield_strength_units": "NA",
        "ultimate_tensile_strength_value": 107,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 5,
        "ductility_units": "%",
        "hardness_value": 69,
        "hardness_units": "HV",
        "modulus_value": "NA",
        "modulus_units": "NA",
        "notes": "As-sprayed deposit from solution-annealed Al6061 powder at Tgas=500°C and pgas=3 MPa"
    }
]
results = calculate_performance(ground_truth, extracted)

# Print every result entry: its name, its value, then a separator line.
for metric_name, metric_value in results.items():
    print(metric_name, metric_value, '-'*30, sep='\n')

Field RMSE
{'yield_strength_value': np.float64(0.012405382126079782), 'ultimate_tensile_strength_value': np.float64(0.0), 'ductility_value': np.float64(0.014142135623730965), 'hardness_value': np.float64(0.0), 'modulus_value': 0}
------------------------------
Overall Loss
0.00530950354996215
------------------------------
Similarity Matrix
[[  -1.1        -168.19553073]
 [-166.06201117    1.        ]]
------------------------------
Unmatched Ground Truth
set()
------------------------------
Unmatched Extracted
set()
------------------------------


In [62]:
# Scenario: five ground-truth records, but only the first four were extracted
# (the "processing set III" record is missing from `extracted`).
ground_truth = [
    {
        "yield_strength_value": 250.3,
        "yield_strength_units": "MPa",
        "ultimate_tensile_strength_value": 268.2,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 1.2,
        "ductility_units": "%",
        "hardness_value": "NA",
        "hardness_units": "NA",
        "modulus_value": 54.5,
        "modulus_units": "GPa",
        "notes": "As-sprayed AL 6061 from processing set I (longitudinal); powder exposed to humid air"
    },
    {
        "yield_strength_value": 251.2,
        "yield_strength_units": "MPa",
        "ultimate_tensile_strength_value": 271.3,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 1.0,
        "ductility_units": "%",
        "hardness_value": "NA",
        "hardness_units": "NA",
        "modulus_value": 55.3,
        "modulus_units": "GPa",
        "notes": "As-sprayed AL 6061 from processing set I (transverse); powder exposed to humid air"
    },
    {
        "yield_strength_value": 230.6,
        "yield_strength_units": "MPa",
        "ultimate_tensile_strength_value": 292.3,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 6.9,
        "ductility_units": "%",
        "hardness_value": "NA",
        "hardness_units": "NA",
        "modulus_value": 71.0,
        "modulus_units": "GPa",
        "notes": "As-sprayed AL 6061 from processing set II (longitudinal); no humid air exposure"
    },
    {
        "yield_strength_value": 231.0,
        "yield_strength_units": "MPa",
        "ultimate_tensile_strength_value": 289.6,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 5.7,
        "ductility_units": "%",
        "hardness_value": "NA",
        "hardness_units": "NA",
        "modulus_value": 70.3,
        "modulus_units": "GPa",
        "notes": "As-sprayed AL 6061 from processing set II (transverse); no humid air exposure"
    },
    {
        "yield_strength_value": 249.5,
        "yield_strength_units": "MPa",
        "ultimate_tensile_strength_value": 289.6,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 3.9,
        "ductility_units": "%",
        "hardness_value": "NA",
        "hardness_units": "NA",
        "modulus_value": 67.5,
        "modulus_units": "GPa",
        "notes": "As-sprayed AL 6061 from processing set III (longitudinal)"
    }
]

extracted = [
    {
        "yield_strength_value": 250.3,
        "yield_strength_units": "MPa",
        "ultimate_tensile_strength_value": 268.2,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 1.2,
        "ductility_units": "%",
        "hardness_value": "NA",
        "hardness_units": "NA",
        "modulus_value": 54.5,
        "modulus_units": "GPa",
        "notes": "As-sprayed AL 6061 from processing set I (longitudinal); powder exposed to humid air"
    },
    {
        "yield_strength_value": 251.2,
        "yield_strength_units": "MPa",
        "ultimate_tensile_strength_value": 271.3,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 1.0,
        "ductility_units": "%",
        "hardness_value": "NA",
        "hardness_units": "NA",
        "modulus_value": 55.3,
        "modulus_units": "GPa",
        "notes": "As-sprayed AL 6061 from processing set I (transverse); powder exposed to humid air"
    },
    {
        "yield_strength_value": 230.6,
        "yield_strength_units": "MPa",
        "ultimate_tensile_strength_value": 292.3,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 6.9,
        "ductility_units": "%",
        "hardness_value": "NA",
        "hardness_units": "NA",
        "modulus_value": 71.0,
        "modulus_units": "GPa",
        "notes": "As-sprayed AL 6061 from processing set II (longitudinal); no humid air exposure"
    },
    {
        "yield_strength_value": 231.0,
        "yield_strength_units": "MPa",
        "ultimate_tensile_strength_value": 289.6,
        "ultimate_tensile_strength_units": "MPa",
        "ductility_value": 5.7,
        "ductility_units": "%",
        "hardness_value": "NA",
        "hardness_units": "NA",
        "modulus_value": 70.3,
        "modulus_units": "GPa",
        "notes": "As-sprayed AL 6061 from processing set II (transverse); no humid air exposure"
    }
]

results = calculate_performance(ground_truth, extracted)

# Print every result entry: its name, its value, then a separator line.
for metric_name, metric_value in results.items():
    print(metric_name, metric_value, '-'*30, sep='\n')

Field RMSE
{'yield_strength_value': np.float64(0.0), 'ultimate_tensile_strength_value': np.float64(0.0), 'ductility_value': np.float64(0.0), 'hardness_value': 0, 'modulus_value': np.float64(0.0)}
------------------------------
Overall Loss
0.2
------------------------------
Similarity Matrix
[[  1.          -4.12048193 -65.16564417 -60.29192547]
 [ -4.10843373   1.         -62.47950311 -57.36981132]
 [-65.16564417 -62.49192547   1.          -4.12820513]
 [-60.27950311 -57.36981132  -4.11538462   1.        ]
 [-37.11985816 -34.4381295  -27.27647059 -22.41343284]]
------------------------------
Unmatched Ground Truth
{4}
------------------------------
Unmatched Extracted
set()
------------------------------
