In [44]:
# Append the parent directory to sys.path so modules located one level up can be imported
import sys

# Guard the append so re-running this cell in the same kernel does not add duplicate entries
if ".." not in sys.path:
    sys.path.append("..")

In [45]:
# Python Imports
import os

# External Library Imports
import pandas as pd

# SciKGExtract Utilities Imports
from scikg_extract.utils.dict_utils import get_value_by_path
from scikg_extract.utils.file_utils import read_json_file

# SciKGExtract Config Imports
from scikg_extract.config.normalization.normalizationConfig import NormalizationConfig

In [46]:
def normalize_property_path(property_path: str) -> str:
    """
    Strip every character from a property path that is neither alphanumeric nor a dot.
    Args:
        property_path (str): The property path, possibly containing wildcard or bracket characters.
    Returns:
        str: The property path reduced to alphanumeric characters and dots.
    """
    # Collect the characters that survive the filter, preserving their order
    kept_characters = []
    for symbol in property_path:
        if symbol == '.' or symbol.isalnum():
            kept_characters.append(symbol)

    # Stitch the surviving characters back into a single string
    return ''.join(kept_characters)

In [47]:
def update_dictionary_with_value_cid(property_value_cid_mapping: dict, data_dict: dict) -> dict:
    """
    Merge one property value and its CIDs into the accumulated property value-CID mapping.
    Args:
        property_value_cid_mapping (dict): The accumulated mapping of property paths to lists of
            {"value": str, "sameAs": list} entries. It is updated in place and also returned.
        data_dict (dict): The new {property_path: value_cid_pair} item to integrate. The item is
            removed from this dictionary with popitem().
    Returns:
        dict: The updated property value-CID mapping dictionary.
    """

    # Nothing to integrate when no item was supplied (popitem() would raise KeyError)
    if not data_dict:
        return property_value_cid_mapping

    # Extract the property path and value-CID pair from the new data dictionary
    property_path, value_cid_pair = data_dict.popitem()

    # Check if the value is present in the value-CID pair (also covers None / non-dict pairs)
    if not isinstance(value_cid_pair, dict) or "value" not in value_cid_pair:
        print(f"Value not found for property path: {property_path}. Skipping update.")
        return property_value_cid_mapping

    # Format the value to string to be used as a key
    value = str(value_cid_pair["value"])

    # Skip if the value is 'Not Found' or empty
    if value in ["Not Found", ""]:
        return property_value_cid_mapping

    # Read the CIDs by key instead of relying on the insertion order of the pair.
    # When there is no "sameAs" key, fall back to the first non-"value" field so that
    # two-field pairs using another key name keep working as before.
    if "sameAs" in value_cid_pair:
        raw_cids = value_cid_pair["sameAs"]
    else:
        raw_cids = next((field for key, field in value_cid_pair.items() if key != "value"), None)

    # Normalize the CIDs to a fresh list so the caller's data is never aliased or mutated
    if raw_cids is None:
        cids = []
    elif isinstance(raw_cids, (list, tuple, set, frozenset)):
        cids = list(raw_cids)
    else:
        cids = [raw_cids]

    # Fetch (or create) the list of entries recorded for this property path
    entries = property_value_cid_mapping.setdefault(property_path, [])

    # If the value exists, merge the new CIDs into it; dict.fromkeys removes duplicates
    # while keeping first-seen order, so the result is deterministic across runs
    value_already_known = False
    for entry in entries:
        # Skip if the value does not match
        if entry["value"] != value:
            continue

        entry["sameAs"] = list(dict.fromkeys([*(entry["sameAs"] or []), *cids]))
        value_already_known = True

    # Otherwise record the value as a new entry for the property path
    if not value_already_known:
        entries.append({"value": value, "sameAs": list(dict.fromkeys(cids))})

    # Return the updated mapping
    return property_value_cid_mapping

In [48]:
def extract_property_value_cid_mapping(data: dict, property_value_cid_mapping: dict, property_names: list[str] = NormalizationConfig.include_paths) -> dict[str, list[dict]]:
    """
    Collect the normalized values found under the given property paths and merge them into the mapping.
    Args:
        data (dict): The extracted structured knowledge data containing normalized values.
        property_value_cid_mapping (dict): The existing property value-CID mapping dictionary to be updated.
        property_names (list[str], optional): List of property paths to extract. Defaults to NormalizationConfig.include_paths.
    Returns:
        dict[str, list[dict]]: The mapping of normalized property paths to their accumulated value-CID entries.
    """

    # Iterate over each property name containing the normalized values
    for property_name in property_names:

        # Look up the matches for the property path; each match is unpacked as a 2-tuple
        # whose first element is forwarded as the value-CID pair (the second is unused here)
        cid_values = get_value_by_path(data, property_name)

        # Keep only the matches that carry a value, so a None sitting next to real
        # values is skipped instead of being handed to the update step
        found_values = [value for value, _ in (cid_values or []) if value is not None]

        # Report and move on when nothing usable was found
        if not found_values:
            print(f"No CID values found for property: {property_name}")
            continue

        # Normalize the property path
        normalized_property_name = normalize_property_path(property_name)

        # Accumulate the CID values for the property with the corresponding property value
        for normalized_value in found_values:
            property_value_cid_mapping = update_dictionary_with_value_cid(property_value_cid_mapping, {normalized_property_name: normalized_value})

    # Return the final property to CID mapping dictionary
    return property_value_cid_mapping

In [49]:
def create_property_cid_dataframe(property_value_cid_mapping: dict) -> pd.DataFrame:
    """
    Create a MultiIndex DataFrame representing the property value-CID mapping.
    Each property contributes a "value" and a "sameAs" column. Entries are sorted by value,
    each CID is shortened to the text after its last "/", and shorter columns are padded
    with empty strings.
    Args:
        property_value_cid_mapping (dict): A dictionary mapping property names to their value-CID pairs.
    Returns:
        pd.DataFrame: A MultiIndex DataFrame with property values and their corresponding CIDs.
            An empty DataFrame is returned when the mapping is empty.
    """

    # An empty mapping yields no columns; return early instead of failing on max() below
    if not property_value_cid_mapping:
        return pd.DataFrame()

    # Create MultiIndex for DataFrame columns: (property, "value") and (property, "sameAs")
    columns = pd.MultiIndex.from_product([list(property_value_cid_mapping), ["value", "sameAs"]])

    # Build two column lists per property, in the same order as the MultiIndex
    column_data = []
    for value_cid_list in property_value_cid_mapping.values():
        # Pair each value with its comma-separated CID identifiers
        pairs = [
            (entry["value"], ", ".join(str(same_as).split("/")[-1] for same_as in (entry["sameAs"] or [])))
            for entry in value_cid_list
        ]

        # Sort the pairs based on values (stable, so equal values keep their order)
        pairs.sort(key=lambda pair: pair[0])

        # Split into separate lists; this also works when the property has no entries
        column_data.append([value for value, _ in pairs])
        column_data.append([cids for _, cids in pairs])

    # Get maximum length of the column lists to pad shorter lists
    max_length = max(len(column) for column in column_data)

    # Pad shorter lists with empty strings to ensure equal length
    padded_columns = [column + [""] * (max_length - len(column)) for column in column_data]

    # Transpose the column lists into rows to match the DataFrame structure
    rows = [list(row) for row in zip(*padded_columns)]

    # Create and return the DataFrame
    return pd.DataFrame(rows, columns=columns)

In [50]:
def construct_normalization_summary(extracted_data_path: str, llm_model: str) -> pd.DataFrame:
    """
    Construct a normalization summary DataFrame from the extracted data.
    The directory tree is walked in sorted order and only files located directly inside a
    directory named after the LLM model are processed.
    Args:
        extracted_data_path (str) : Path to the root directory containing the extracted data JSON files.
        llm_model (str): The LLM model used for extraction; must match the name of the directory holding its files.
    Returns:
        pd.DataFrame: A DataFrame summarizing the normalization results.
    """

    # Initialize an empty dictionary to hold the property to CID mapping
    property_value_cid_mapping = {}

    # Iterate over all files in the extracted data directory
    for root, dirs, files in os.walk(extracted_data_path):

        # Sort sub-directories in place so os.walk visits them in a reproducible order;
        # the order in which properties are first seen decides the column order of the summary
        dirs.sort()

        # Skip if no files found
        if not files:
            continue

        # Check if the current directory corresponds to the specified LLM model
        # (normpath drops a trailing separator, which would otherwise yield an empty name)
        llm = os.path.basename(os.path.normpath(root))
        if llm != llm_model:
            continue

        # Process each file in the directory, in sorted order for reproducibility
        for file in sorted(files):
            # Construct the full file path
            file_path = os.path.join(root, file)
            print(f"Processing file: {file_path}")

            # Read the extracted data from the JSON file
            extracted_data = read_json_file(file_path)

            # Skip content that is not a JSON object, since "processes" is looked up by key
            if not isinstance(extracted_data, dict):
                print(f"Unexpected content in file: {file_path}. Skipping.")
                continue

            # Iterate over each process in the extracted data (a null "processes" counts as empty)
            for process in extracted_data.get("processes") or []:
                # Extract and update the property value-CID mapping
                property_value_cid_mapping = extract_property_value_cid_mapping(process, property_value_cid_mapping)

    # Create and return the normalization summary DataFrame
    return create_property_cid_dataframe(property_value_cid_mapping)

In [None]:
# Model whose extraction results are summarized in this run
llm_model = "gpt-4o"

# Root directory of the extracted data to scan
extracted_data_path = "../results/extracted-data-test/ALD/version4/ZnO-IGZO-papers/experimental-usecase/IGZO"

# Build the normalization summary table for the selected model
property_cid_df = construct_normalization_summary(
    extracted_data_path=extracted_data_path,
    llm_model=llm_model,
)

In [52]:
# Preview the first five rows of the normalization summary
property_cid_df.head(n=5)

Unnamed: 0_level_0,aldSystem.aldMethod.compound,aldSystem.aldMethod.compound,aldSystem.materialDeposited,aldSystem.materialDeposited,reactantSelection.precursor.compound,reactantSelection.precursor.compound,reactantSelection.precursor.precursor,reactantSelection.precursor.precursor,reactantSelection.coReactant.compound,reactantSelection.coReactant.compound,reactantSelection.coReactant.coReactant,reactantSelection.coReactant.coReactant,reactantSelection.carrierGas,reactantSelection.carrierGas,reactantSelection.purgingGas,reactantSelection.purgingGas,processParameters.substrate,processParameters.substrate
Unnamed: 0_level_1,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs
0,AZO,3034285,Al2O3,9989226,Al2O3,9989226,(3-(dimethylamino)propyl)dimethylindium (DADI),"11470276, 7069",Al2O3,9989226,Ar + O2 mixed plasma,,Ar,23968.0,Ar,23968,100-nm-thick SiO2/Si substrates,
1,Al2O3,9989226,HfO2/Al2O3,,AlOx,,(3-Dimethylaminopropyl)-dimethyl indium (DADI),"11470276, 7069",AlOx,,Ar/O2 plasma,,Ar/O2,,Argon,23968,300 nm SiO2/p++ Si substrate,
2,Ga2O3,"5139834, 158605",IGZO,,Ga,5360835,(3-Dimethylaminopropyl)dimethylindium (DADI),"11470276, 7069",Ga2O3,"5139834, 158605",H2O,962.0,Argon,23968.0,Argon (Ar),23968,4-in. n-type Si(100) wafers with a native oxid...,24261.0
3,GaOx,"175760063, 6336273",IZO/IGZO,,Ga2O3,"5139834, 158605",(3-dimethylamimopropryl)-dimethyl indium (DADI),"11470276, 7069",GaOx,"175760063, 6336273",MeOH,887.0,Argon (Ar),23968.0,N2,947,8-in thermally oxidated p+ Si substrate,
4,GaZnO,"175868391, 157835842",InGaZnO,"173033675, 175694768, 175809223",GaO,"175760063, 6336273",(3-dimethylamimopropryl)dimethyl indium (DADI),"11470276, 7069",GaZnO,"175868391, 157835842",N2O plasma,948.0,N2,947.0,Nitrogen,947,Heavily doped p-type silicon substrate with a ...,


In [53]:
# Directory that receives the normalization summary workbook
output_excel_path = "../results/statistics/ALD/version2/ZnO-IGZO-papers/experimental-usecase/IGZO"

# Make sure the whole directory hierarchy is present before writing
os.makedirs(output_excel_path, exist_ok=True)

# The workbook name carries the LLM model the summary was built for
filename = f"normalization_summary_{llm_model}.xlsx"

# Write the summary table into the workbook
property_cid_df.to_excel(os.path.join(output_excel_path, filename))

In [None]:
# Model whose extraction results are summarized in this run
llm_model = "gpt-5-mini"

# Root directory of the extracted data to scan
extracted_data_path = "../results/extracted-data-test/ALD/version4/ZnO-IGZO-papers/experimental-usecase/IGZO"

# Build the normalization summary table for the selected model
property_cid_df = construct_normalization_summary(
    extracted_data_path=extracted_data_path,
    llm_model=llm_model,
)

In [55]:
# Preview the first five rows of the normalization summary
property_cid_df.head(n=5)

Unnamed: 0_level_0,aldSystem.aldMethod.compound,aldSystem.aldMethod.compound,aldSystem.materialDeposited,aldSystem.materialDeposited,reactantSelection.precursor.compound,reactantSelection.precursor.compound,reactantSelection.precursor.precursor,reactantSelection.precursor.precursor,reactantSelection.coReactant.compound,reactantSelection.coReactant.compound,reactantSelection.coReactant.coReactant,reactantSelection.coReactant.coReactant,reactantSelection.purgingGas,reactantSelection.purgingGas,processParameters.substrate,processParameters.substrate,reactantSelection.carrierGas,reactantSelection.carrierGas
Unnamed: 0_level_1,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs
0,AZO,3034285.0,AO-IGZO (InGaZnO),,AZO,3034285.0,(3(dimethylamino)propyl)dimethylindium (DADI),,AZO,3034285.0,Ar/O2 plasma,,Ar,23968,100 nm thermally grown SiO2 on poly-Si substrate,24261.0,Ar,23968.0
1,AZO (Al-doped ZnO),,AZO (Al-doped ZnO),,Al,5359268.0,(3-(dimethylamino)propyl)dimethylindium (DADI),"11470276, 7069",Al,5359268.0,Ar/O2 plasma (oxygen radicals),977.0,Argon,23968,100-nm thermally grown SiO2 substrate,24261.0,Ar (99.999%),23968.0
2,Al2O3,9989226.0,Al2O3,9989226,Al2O3,9989226.0,(3-Dimethylaminopropyl)-dimethyl indium (DADI),"11470276, 7069",Al2O3,9989226.0,H2O,962.0,Argon (Ar),23968,100-nm-thick SiO2/Si substrates,,Ar/O2,
3,AlOx,,HfO2,"292779, 159422",AlOx,,(3-Dimethylaminopropyl)dimethyl indium (DADI),,Al2O3_protection_layer,,H2O (water),962.0,N2,947,150-nm-thick indium tin oxide (ITO)-coated gla...,,Argon,23968.0
4,Ga,5360835.0,IGO,165416128,Ga,5360835.0,(3-dimethylamimopropryl)-dimethyl indium (DADI),"11470276, 7069",Al2O3_spacer,9989226.0,H2O and O3,,N2 (99.999%),947,300 mm silicon wafer,5461123.0,Argon (Ar),23968.0


In [56]:
# Directory that receives the normalization summary workbook
output_excel_path = "../results/statistics/ALD/version2/ZnO-IGZO-papers/experimental-usecase/IGZO"

# Make sure the whole directory hierarchy is present before writing
os.makedirs(output_excel_path, exist_ok=True)

# The workbook name carries the LLM model the summary was built for
filename = f"normalization_summary_{llm_model}.xlsx"

# Write the summary table into the workbook
property_cid_df.to_excel(os.path.join(output_excel_path, filename))

In [None]:
# Model whose extraction results are summarized in this run
llm_model = "gpt-4o"

# Root directory of the extracted data to scan
extracted_data_path = "../results/extracted-data-test/ALD/version4/ZnO-IGZO-papers/experimental-usecase/ZnO"

# Build the normalization summary table for the selected model
property_cid_df = construct_normalization_summary(
    extracted_data_path=extracted_data_path,
    llm_model=llm_model,
)

In [58]:
# Preview the first five rows of the normalization summary
property_cid_df.head(n=5)

Unnamed: 0_level_0,aldSystem.aldMethod.compound,aldSystem.aldMethod.compound,aldSystem.materialDeposited,aldSystem.materialDeposited,reactantSelection.precursor.compound,reactantSelection.precursor.compound,reactantSelection.precursor.precursor,reactantSelection.precursor.precursor,reactantSelection.coReactant.compound,reactantSelection.coReactant.compound,reactantSelection.coReactant.coReactant,reactantSelection.coReactant.coReactant,reactantSelection.carrierGas,reactantSelection.carrierGas,reactantSelection.purgingGas,reactantSelection.purgingGas,processParameters.substrate,processParameters.substrate
Unnamed: 0_level_1,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs
0,Al2O3,9989226,Al2O3,9989226,Al2O3,9989226,(C2Hs)2Zn,11185,Al2O3,9989226,2-methyl-3-buten-2-ol (MBO),78130,Ar,23968,Ar,23968,(0 0 0 1) c-plane sapphire,9989226
1,Ga2O3,"158605, 5139834",Al2O3/ZnO,,Ga2O3,"5139834, 158605","(dimethylbutylamino)trimethylindium (DATI), tr...",,Ga2O3,"5139834, 158605","2-methyl-3-buten-2-ol (MBO, 98%, Sigma-Aldrich...",78130,Argon,23968,Argon,23968,(0 0 0 1) sapphire,9989226
2,In2-xGaxO3,,GZO,11304743,In2-xGaxO3,,Al(CH3)3,16682925,In2-xGaxO3,,Ammonia,222,Argon (Ar),23968,N2,947,(0 0 01) sapphire,9989226
3,In2S3,"16685236, 160966, 23669228",In2-xGaxO3,,In2S3,"16685236, 160966, 23669228",BDMPZ,,In2S3,"16685236, 160966, 23669228",CO2,280,N2,947,N2O,948,(0001) c-plane sapphire,9989226
4,MgO,"14893, 6850729, 14792",Pd,"5951, 23938",MgO,"14893, 6850729, 14792",DEZ,"101667988, 11185",MgO,"14893, 6850729, 14792",D2O,24602,Nitrogen,947,Nitrogen,947,(11-20) sapphire,9989226


In [59]:
# Directory that receives the normalization summary workbook
output_excel_path = "../results/statistics/ALD/version2/ZnO-IGZO-papers/experimental-usecase/ZnO"

# Make sure the whole directory hierarchy is present before writing
os.makedirs(output_excel_path, exist_ok=True)

# The workbook name carries the LLM model the summary was built for
filename = f"normalization_summary_{llm_model}.xlsx"

# Write the summary table into the workbook
property_cid_df.to_excel(os.path.join(output_excel_path, filename))

In [None]:
# Model whose extraction results are summarized in this run
llm_model = "gpt-5-mini"

# Root directory of the extracted data to scan
extracted_data_path = "../results/extracted-data-test/ALD/version4/ZnO-IGZO-papers/experimental-usecase/ZnO"

# Build the normalization summary table for the selected model
property_cid_df = construct_normalization_summary(
    extracted_data_path=extracted_data_path,
    llm_model=llm_model,
)

In [61]:
# Preview the first five rows of the normalization summary
property_cid_df.head(n=5)

Unnamed: 0_level_0,aldSystem.aldMethod.compound,aldSystem.aldMethod.compound,aldSystem.materialDeposited,aldSystem.materialDeposited,reactantSelection.precursor.compound,reactantSelection.precursor.compound,reactantSelection.precursor.precursor,reactantSelection.precursor.precursor,reactantSelection.coReactant.compound,reactantSelection.coReactant.compound,reactantSelection.coReactant.coReactant,reactantSelection.coReactant.coReactant,reactantSelection.carrierGas,reactantSelection.carrierGas,reactantSelection.purgingGas,reactantSelection.purgingGas,processParameters.substrate,processParameters.substrate
Unnamed: 0_level_1,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs,value,sameAs
0,Al-doped ZnO (AZO),,Al-doped ZnO (AZO),,Al,5359268.0,Al(CH3)3 (trimethylaluminum),"173231060, 16693595, 175303567, 173009548, 175...",Al,5359268,"0.31 mol% H2O, 0.89 mol% O2, 3.1e-3 mol% H2O2 ...",,6N high purity N2,947,Ar,23968,(0 0 0 1) c-plane sapphire (Al2O3),9989226.0
1,Al2O3,9989226,Al2O3,9989226.0,Al2O3,9989226.0,BDMPZ ([Zn(DMP)2]),,Al2O3,9989226,1% hydrogen sulfide in N2 (H2S),7833.0,Ar,23968,Ar (900 sccm),23968,(0 0 0 1) sapphire substrates,9989226.0
2,Ga2O3,"5139834, 158605",Al2O3 / ZnO / TiO2,,Al2O3 (for doping),9989226.0,DATI (dimethylbutylamino)trimethylindium,,Al2O3 (for doping),9989226,"21 mol% NH3, 0.64 mol% H2O, 0.74 mol% O2 (A ch...",,Ar (20 sccm),23968,Ar (99.999%),23968,(0001) c-plane sapphire substrate,
3,In2-xGaxO3,,Al2O3/ZnO,,B-dopant,,DEZ,"101667988, 11185",Ga2O3,"5139834, 158605",50% H2O2,784.0,Ar (99.999%),23968,Argon,23968,(0001) sapphire,9989226.0
4,In2S3,"16685236, 160966, 23669228",GZO,11304743.0,Ga-doped ZnO,,DEZ (diethyl zinc),"101667988, 11185",In1.4Ga0.6O3,,CO2,280.0,Argon,23968,"Argon (ALPHAGAZ 2, 99.9999%)",23968,(0002)-oriented sapphire substrate,9989226.0


In [62]:
# Directory that receives the normalization summary workbook
output_excel_path = "../results/statistics/ALD/version2/ZnO-IGZO-papers/experimental-usecase/ZnO"

# Make sure the whole directory hierarchy is present before writing
os.makedirs(output_excel_path, exist_ok=True)

# The workbook name carries the LLM model the summary was built for
filename = f"normalization_summary_{llm_model}.xlsx"

# Write the summary table into the workbook
property_cid_df.to_excel(os.path.join(output_excel_path, filename))