In [10]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import itertools

def get_answer_options(model1, model2):
    """Collect the answer-option values stored in two models.

    An answer option is any entry whose key contains ``"indicator_Text_"``.

    Args:
        model1: Mapping of attribute name to value for the first model.
        model2: Mapping of attribute name to value for the second model.

    Returns:
        A pair ``(options1, options2)``: the answer-option values of
        ``model1`` and of ``model2``, each in the key order of its mapping.
    """
    option_marker = "indicator_Text_"

    def collect(source):
        # Keep the value of every key that carries the answer-option marker.
        return [source[key] for key in list(source.keys()) if option_marker in key]

    return collect(model1), collect(model2)

# Load the pretrained BERT tokenizer and encoder once, at module level.
# calculate_answer_similarity (below) reads `tokenizer` and `model` as globals.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def _embed_answer_option(text):
    """Return the mean-pooled BERT embedding of ``text`` as a NumPy array."""
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        return model(**tokens).last_hidden_state.mean(dim=1).squeeze().detach().numpy()


def calculate_answer_similarity(options1, options2, threshold=0.87):
    """Greedily pair the answer options of two models by embedding similarity.

    Every option is embedded with the module-level BERT ``tokenizer``/``model``
    and every (option from ``options1``, option from ``options2``) pair is
    scored with cosine similarity. Pairs are then visited from the highest
    score to the lowest; a pair scoring at least ``threshold`` is accepted
    when neither of its options has been used yet.

    Args:
        options1: Answer-option texts of the first model.
        options2: Answer-option texts of the second model.
        threshold: Minimum cosine similarity for two options to count as a
            match. Defaults to 0.87, the value that used to be hard-coded.

    Returns:
        A pair ``(chosen_matches, not_matches)``. ``chosen_matches`` is a list
        of ``(option_1, option_2)`` tuples; ``not_matches`` lists every option
        (from either side) that is not part of an accepted pair.
    """
    # Embed each option exactly once. Previously all of options2 was re-embedded
    # for every single entry of options1.
    embedded1 = [(option, _embed_answer_option(option)) for option in options1]
    embedded2 = [(option, _embed_answer_option(option)) for option in options2]

    match_scores = []
    for option_data1, embedding1 in embedded1:
        for option_data2, embedding2 in embedded2:
            similarity_score = cosine_similarity(
                embedding1.reshape(1, -1), embedding2.reshape(1, -1)
            )[0][0]
            match_scores.append({
                'option_1': option_data1,
                'option_2': option_data2,
                'similarity_score': similarity_score,
            })

    sorted_possible_matches = sorted(match_scores, key=lambda x: x['similarity_score'], reverse=True)

    chosen_matches = []
    not_matches = []
    # Options (from either side) that already ended up in one of the two result lists.
    chosen_options = set()

    for possible_match in sorted_possible_matches:
        option_1 = possible_match['option_1']
        option_2 = possible_match['option_2']

        if possible_match['similarity_score'] >= threshold:
            if option_1 not in chosen_options and option_2 not in chosen_options:
                chosen_matches.append((option_1, option_2))
                chosen_options.add(option_1)
                chosen_options.add(option_2)
        else:
            if option_1 not in chosen_options:
                not_matches.append(option_1)
                chosen_options.add(option_1)
            if option_2 not in chosen_options:
                not_matches.append(option_2)
                chosen_options.add(option_2)

    # An option can still be unassigned here: all of its pairs scored above the
    # threshold but its partners were already taken, or the other side was
    # empty so no pair existed at all. Report it as unmatched instead of
    # silently dropping it.
    for option in [*options1, *options2]:
        if option not in chosen_options:
            not_matches.append(option)
            chosen_options.add(option)

    return chosen_matches, not_matches


In [15]:
%%time

from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import itertools
from pint import UnitRegistry

# Load the BERT tokenizer and model. This repeats the loading done in the
# earlier cell with the same checkpoint name; the similarity functions defined
# in this cell read `tokenizer` and `model` as globals.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Unit registry used by generate_back_conversion_description to convert units.
ureg = UnitRegistry()


def generate_back_conversion_description(original_unit, converted_unit):
    """Describe the conversion from ``original_unit`` to ``converted_unit``.

    Args:
        original_unit: Unit expression handed to the module-level pint
            registry ``ureg``.
        converted_unit: Unit expression the parsed quantity is converted to.

    Returns:
        A string of the form ``"Formula: <original> to <converted> = <result>"``,
        or ``None`` when the expression cannot be parsed or converted.
    """
    try:
        # Attempt to find a back conversion formula
        back_conversion = ureg(original_unit).to(converted_unit)
        return f"Formula: {original_unit} to {converted_unit} = {back_conversion}"
    except Exception:
        # Best effort by design: any parsing/conversion failure means "no
        # formula". Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate so the run can still be interrupted.
        return None
    
    
def process_transformation_formula(model1, model2, output_file):
    """Write the unit-conversion formula lines for a matched pair.

    ``choose_model`` picks which of the two models is kept. For the model that
    was not picked, a conversion from the kept model's pre-unit / post-unit to
    its own pre-unit / post-unit is attempted with
    ``generate_back_conversion_description``. Each successful conversion is
    written as an ``indicator_FormulaIndicator1`` / ``indicator_FormulaIndicator2``
    pair of lines. If no conversion succeeds, both lines are written with the
    value ``"Not found"``. A blank line always terminates the entry.

    Args:
        model1: Attribute mapping of the first matched model.
        model2: Attribute mapping of the second matched model.
        output_file: Writable text file object receiving the lines.
    """
    chosen_model = choose_model(model1, model2)

    preunit = chosen_model.get('indicator_PreUnit', '')
    postunit = chosen_model.get('indicator_PostUnit', '')

    # The model(s) choose_model did not pick. Identity is used on purpose:
    # choose_model returns one of the two objects it was given, and an equality
    # test would also exclude a different model with identical content.
    other_models = [candidate for candidate in (model1, model2) if candidate is not chosen_model]

    preunit_found = False
    postunit_found = False

    # Adding Formula fragment2 for PreUnit
    for other in other_models:
        if "indicator_PreUnit" in other:
            back_conversion_description_preunit = generate_back_conversion_description(
                preunit, other.get('indicator_PreUnit', '')
            )
            if back_conversion_description_preunit:
                # Values are wrapped in balanced double quotes, matching the
                # PostUnit lines below (the opening quote used to be missing).
                output_file.write(f'indicator_FormulaIndicator1: "{preunit} * 1"\n')
                output_file.write(f'indicator_FormulaIndicator2: "{back_conversion_description_preunit}"\n')
                preunit_found = True

    # Adding Formula fragment2 for PostUnit
    for other in other_models:
        if "indicator_PostUnit" in other:
            # NOTE(review): only this value has surrounding quotes stripped; the
            # chosen model's units and the PreUnit values are used as-is.
            post_unit_value = other.get('indicator_PostUnit', '').strip('\"')
            back_conversion_description_postunit = generate_back_conversion_description(
                postunit, post_unit_value
            )
            if back_conversion_description_postunit:
                output_file.write(f'indicator_FormulaIndicator1: "{postunit} * 1"\n')
                output_file.write(f'indicator_FormulaIndicator2: "{back_conversion_description_postunit}"\n')
                postunit_found = True

    # Check if neither preunit nor postunit is found
    if not preunit_found and not postunit_found:
        output_file.write('indicator_FormulaIndicator1: "Not found"\n')
        output_file.write('indicator_FormulaIndicator2: "Not found"\n')

    # Separate each pair with an additional newline
    output_file.write("\n")

        
    
# Function to calculate similarity score between two "topic_model" sections
def calculate_similarity_attributes(model1, model2):
    """Score two models on their ``topic_Description`` and ``indicator_Name``.

    Each attribute value is embedded with the module-level BERT
    ``tokenizer``/``model`` (mean over the last hidden state) and the two
    models' embeddings are compared with cosine similarity.

    Args:
        model1: Attribute mapping of the first model.
        model2: Attribute mapping of the second model.

    Returns:
        A pair ``(overall_similarity, similarity_scores)`` where
        ``similarity_scores`` is the list of per-attribute cosine similarities
        (``topic_Description`` first, then ``indicator_Name``) and
        ``overall_similarity`` is their arithmetic mean.
    """
    compared_attributes = ("topic_Description", "indicator_Name")

    def embed(encoded_text):
        # Mean-pool the last hidden state into a single vector per text.
        with torch.no_grad():
            hidden_states = model(**encoded_text).last_hidden_state
            return hidden_states.mean(dim=1).squeeze().detach().numpy()

    similarity_scores = []
    for attribute in compared_attributes:
        text1, text2 = model1[attribute], model2[attribute]

        encoded1 = tokenizer(text1, return_tensors="pt", padding=True, truncation=True)
        encoded2 = tokenizer(text2, return_tensors="pt", padding=True, truncation=True)

        pairwise = cosine_similarity([embed(encoded1)], [embed(encoded2)])
        similarity_scores.append(pairwise[0][0])

    # Overall score is the plain average of the per-attribute scores.
    overall_similarity = sum(similarity_scores) / len(similarity_scores)

    return overall_similarity, similarity_scores


def calculate_similarity_question(model1, model2):
    """Score two models on their ``indicator_Description`` text.

    Both descriptions are embedded with the module-level BERT
    ``tokenizer``/``model`` (mean over the last hidden state) and compared
    with cosine similarity.

    Args:
        model1: Attribute mapping of the first model.
        model2: Attribute mapping of the second model.

    Returns:
        A pair ``(overall_similarity, similarity_scores)``;
        ``similarity_scores`` is a one-element list holding the cosine
        similarity and ``overall_similarity`` is the mean of that list.
    """
    attribute = "indicator_Description"
    description1 = model1[attribute]
    description2 = model2[attribute]

    encoded1 = tokenizer(description1, return_tensors="pt", padding=True, truncation=True)
    encoded2 = tokenizer(description2, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        vector1 = model(**encoded1).last_hidden_state.mean(dim=1).squeeze().detach().numpy()
        vector2 = model(**encoded2).last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    # Kept as a list so callers can concatenate it with other score lists.
    similarity_scores = [cosine_similarity([vector1], [vector2])[0][0]]

    overall_similarity = sum(similarity_scores) / len(similarity_scores)

    return overall_similarity, similarity_scores



# Function to parse "fragment_model" sections from a data file and assign names
def parse_fragment_models(file_path, name_prefix):
    """Parse blank-line separated ``key: value`` sections from a text file.

    Args:
        file_path: Path of the file to read.
        name_prefix: Prefix for the generated ``"name"`` attribute; sections
            are numbered from 1 in file order (``<name_prefix>1``, ...).

    Returns:
        A list with one dict per non-empty section. Keys and values are
        stripped of surrounding whitespace; only the first ``:`` of a line
        separates key from value. Every dict also gets a ``"name"`` entry.

    Raises:
        ValueError: If a non-blank line contains no ``:`` separator.
    """
    with open(file_path, "r") as file:
        data = file.read()

    fragment_models = []

    # Sections are separated by blank lines.
    for section in data.split("\n\n"):
        lines = [line for line in section.split("\n") if line.strip()]
        if not lines:
            # Trailing or repeated blank lines yield empty sections; these
            # used to crash the key/value unpacking below.
            continue

        attributes = {}
        for line in lines:
            key, separator, value = line.partition(":")
            if not separator:
                raise ValueError(
                    f"Expected 'key: value' in {file_path!r} but found {line!r}"
                )
            attributes[key.strip()] = value.strip()

        # Assign names with prefixes, numbering only the sections actually kept.
        attributes["name"] = f"{name_prefix}{len(fragment_models) + 1}"
        fragment_models.append(attributes)

    return fragment_models

# Function to check if the combination of data types is valid
def is_data_type_valid(model1, model2):
    """Tell whether the data types of two models may be combined.

    Args:
        model1: Attribute mapping with an ``indicator_DataType`` entry.
        model2: Attribute mapping with an ``indicator_DataType`` entry.

    Returns:
        ``False`` when the two data types form one of the incompatible
        combinations listed below (in either order), ``True`` otherwise.
    """
    # Each incompatible combination is listed once; both orders are checked.
    incompatible_pairs = (
        ("double", "boolean"),
        ("integer", "boolean"),
        ("text", "boolean"),
        ("text", "date"),
        ("integer", "date"),
        ("double", "date"),
        ("boolean", "date"),
        ("boolean", "multipleChoice"),
        ("singleChoice", "multipleChoice"),
    )
    first_type = model1['indicator_DataType']
    second_type = model2['indicator_DataType']

    is_blocked = (
        (first_type, second_type) in incompatible_pairs
        or (second_type, first_type) in incompatible_pairs
    )
    return not is_blocked

# Function to choose the appropriate model based on data type rules
def choose_model(model1, model2):
    """Pick which of two models is kept, based on their data types.

    Args:
        model1: Attribute mapping with an ``indicator_DataType`` entry.
        model2: Attribute mapping with an ``indicator_DataType`` entry.

    Returns:
        ``model1`` or ``model2`` (the very object that was passed in). When
        both models share a known data type, ``model1`` is returned.

    Raises:
        Exception: If the combination of data types is not covered by the rules.
    """
    known_types = {"text", "integer", "double", "date", "boolean", "multipleChoice", "singleChoice"}

    # (yielding type, preferred type): whichever model carries the preferred
    # type is returned, regardless of argument order.
    preferences = {
        ("text", "integer"),
        ("text", "double"),
        ("text", "singleChoice"),
        ("text", "multipleChoice"),
        ("integer", "double"),
        ("integer", "singleChoice"),
        ("integer", "multipleChoice"),
        ("double", "singleChoice"),
        ("double", "multipleChoice"),
        ("date", "singleChoice"),
        ("date", "multipleChoice"),
        ("singleChoice", "boolean"),
        ("multipleChoice", "singleChoice"),
    }

    first_type = model1['indicator_DataType']
    second_type = model2['indicator_DataType']
    data_types = (first_type, second_type)

    if first_type == second_type and first_type in known_types:
        return model1
    if (first_type, second_type) in preferences:
        return model2
    if (second_type, first_type) in preferences:
        return model1

    raise Exception(f'Unknown data type combination found {data_types}')

# Function to get formatted string for writing to files
def get_formatted_write_string(model):
    """Serialise a model as one ``key:value`` line per attribute.

    Args:
        model: Attribute mapping to serialise.

    Returns:
        The concatenation of ``"<key>:<value>\\n"`` for every entry, in the
        mapping's iteration order (empty string for an empty mapping).
    """
    return "".join(f"{key}:{model[key]}\n" for key in model.keys())

def _replace_answer_options(target_model, answers):
    """Replace every ``indicator_Text_*`` entry of ``target_model`` with ``answers``.

    Existing keys containing ``"indicator_Text_"`` are removed and ``answers``
    are stored under ``indicator_Text_1``, ``indicator_Text_2``, ... in order.
    The mapping is modified in place.
    """
    for stale_key in [key for key in target_model.keys() if "indicator_Text_" in key]:
        del target_model[stale_key]
    for position, answer in enumerate(answers, start=1):
        target_model["indicator_Text_" + str(position)] = answer


def _write_match(output_file, formatted_string, match):
    """Write one matched pair (kept model, both source names, formulas) to ``output_file``."""
    output_file.write(formatted_string)
    # Additional information from "indicator_Name" about Method A and Method B
    output_file.write(f"indicator_Indicator1: {match[0]['indicator_Name']}\n")
    output_file.write(f"indicator_Indicator2: {match[1]['indicator_Name']}\n")
    # Writes the transformation formula lines followed by a blank line.
    process_transformation_formula(match[0], match[1], output_file)


# Main function to perform matching and merging and write output to files
def match_and_merge_output(data1_path, data2_path, threshold_indicator, threshold_question, all_scores_file_path, merged_file_path, matched_file_path, unmatched_file_path):
    """Match the models of two files, merge the matches and write the results.

    Every model of ``data1_path`` is compared with every model of
    ``data2_path``. A pair is a candidate when each ``topic_Description`` /
    ``indicator_Name`` score reaches ``threshold_indicator``, the
    ``indicator_Description`` score reaches ``threshold_question``, both
    models have the same ``indicator_Indicator_type`` and their data types are
    compatible. Candidates are accepted greedily from the highest average
    score down, each model being used at most once. For choice-type pairs the
    first model's answer options are rewritten in place from the matched
    answers.

    Args:
        data1_path: Input file parsed with the name prefix ``"Model_A"``.
        data2_path: Input file parsed with the name prefix ``"Model_B"``.
        threshold_indicator: Minimum score for ``topic_Description`` and
            ``indicator_Name``.
        threshold_question: Minimum score for ``indicator_Description``.
        all_scores_file_path: Output file receiving the scores of every pair
            that meets both thresholds.
        merged_file_path: Output file receiving the merged matches followed by
            all unmatched models.
        matched_file_path: Output file receiving the merged matches only.
        unmatched_file_path: Output file receiving the unmatched models only.
    """
    fragment_models1 = parse_fragment_models(data1_path, "Model_A")
    fragment_models2 = parse_fragment_models(data2_path, "Model_B")

    # Accepted (model_A, model_B) pairs, in order of acceptance.
    chosen_matches = []

    # Open files for writing
    with open(all_scores_file_path, "w") as all_scores_file, \
            open(merged_file_path, "w") as merged_file, \
            open(matched_file_path, "w") as matched_file, \
            open(unmatched_file_path, "w") as unmatched_file:

        # List to store matched models and their similarity scores
        matched_models = []

        # Compare and merge "topic_model" sections
        for model1, model2 in itertools.product(fragment_models1, fragment_models2):
            # The scores are computed once per pair and reused below; they used
            # to be recomputed for every pair that passed the thresholds.
            _, attribute_scores_indicator = calculate_similarity_attributes(model1, model2)
            _, attribute_scores_question = calculate_similarity_question(model1, model2)

            meets_thresholds = (
                all(score >= threshold_indicator for score in attribute_scores_indicator)
                and all(score >= threshold_question for score in attribute_scores_question)
            )
            if not meets_thresholds:
                continue

            # Write the similarity scores of this pair to the scores file
            all_scores_file.write(f"Similarity Scores for {model1['name']} and {model2['name']}:\n")
            for attribute, score in zip(["topic_Description", "indicator_Name"], attribute_scores_indicator):
                all_scores_file.write(f"{attribute} Score: {score:.2f}\n")
            for attribute, score in zip(["indicator_Description"], attribute_scores_question):
                all_scores_file.write(f"{attribute} Score: {score:.2f}\n")

            # Calculate the average of the individual scores
            average_individual_scores = sum(attribute_scores_indicator + attribute_scores_question) / (len(attribute_scores_indicator) + len(attribute_scores_question))
            all_scores_file.write(f"Overall Similarity (Average of Individual Scores): {average_individual_scores:.2f}\n\n")

            # Check if Indicator_types are equal to each other and if the data types are valid
            if model1['indicator_Indicator_type'] == model2['indicator_Indicator_type'] and is_data_type_valid(model1, model2):
                # Append matched models and their similarity scores to the list
                matched_models.append({'model_A': model1, 'model_B': model2, 'similarity_score': average_individual_scores})
            else:
                # NOTE(review): this is also printed when only the indicator
                # types differ, although the message mentions data types only.
                print(f"Do not match: Model 1 datatype: {model1['indicator_DataType']} Model 2 datatype: {model2['indicator_DataType']}")

        # Sort matched models by similarity score in descending order
        sorted_possible_matches = sorted(matched_models, key=lambda x: x['similarity_score'], reverse=True)

        # Iterate through sorted matches and select unique models
        for possible_match in sorted_possible_matches:
            model1 = possible_match['model_A']
            model2 = possible_match['model_B']

            if any(model1 in pair or model2 in pair for pair in chosen_matches):
                # One of the two models is already part of a better match.
                continue

            data_types = (model1['indicator_DataType'], model2['indicator_DataType'])

            if data_types == ('multipleChoice', 'multipleChoice'):
                # Merge the answer options: matched answers first, then the
                # answers that found no partner.
                options_model1, options_model2 = get_answer_options(model1, model2)
                matches_answers, not_match_answers = calculate_answer_similarity(options_model1, options_model2)

                merged_answers = [answer_pair[0] for answer_pair in matches_answers]
                merged_answers.extend(not_match_answers)
                _replace_answer_options(model1, merged_answers)

                chosen_matches.append((model1, model2))

            elif data_types == ('singleChoice', 'singleChoice'):
                options_model1, options_model2 = get_answer_options(model1, model2)
                matches_answers, not_match_answers = calculate_answer_similarity(options_model1, options_model2)

                # Single-choice models are only merged when every answer found
                # a partner; otherwise the pair is skipped entirely.
                if not not_match_answers:
                    _replace_answer_options(model1, [answer_pair[0] for answer_pair in matches_answers])
                    chosen_matches.append((model1, model2))

            else:
                chosen_matches.append((model1, model2))

        # Write matched models to merged_file and matched_file, and unmatched models to unmatched_file
        for match in chosen_matches:
            chosen_model = choose_model(match[0], match[1])
            formatted_string = get_formatted_write_string(chosen_model)

            _write_match(merged_file, formatted_string, match)
            _write_match(matched_file, formatted_string, match)
            matched_file.write("\n")

        # Named so it does not shadow the module-level BERT `model`.
        for leftover_model in itertools.chain(fragment_models1, fragment_models2):
            if not any(leftover_model in pair for pair in chosen_matches):
                formatted_string = get_formatted_write_string(leftover_model)
                merged_file.write(formatted_string)
                merged_file.write("\n")
                unmatched_file.write(formatted_string)
                unmatched_file.write("\n")

    print("Process completed successfully.")

# Paths for input data and output files.
# data1_path / data2_path are the two model files to compare (parsed with the
# name prefixes "Model_A" and "Model_B" respectively).
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable base directory.
data1_path = "/Users/Documents/Main/Match_and_Merge/Method_Model_A.txt"
data2_path = "/Users/Documents/Main/Match_and_Merge/Method_Model_B.txt"
all_scores_file_path = "/Users/Documents/Main/Match_and_Merge/Similarity_scores.txt"
merged_file_path = "/Users/Documents/Main/Postprocessing/Merged_models.txt"
matched_file_path = "/Users/Documents/Main/Match_and_Merge/Matched.txt"
# NOTE(review): the doubled "/" below looks like a typo — confirm.
unmatched_file_path = "/Users/Documents/Main/Match_and_Merge//Unmatched.txt"

# Thresholds (minimum cosine similarity):
# threshold_indicator applies to each of topic_Description and indicator_Name,
# threshold_question applies to indicator_Description.
threshold_indicator = 0.65
threshold_question = 0.87

# Call the main function to perform matching and write output to files
match_and_merge_output(data1_path, data2_path, threshold_indicator, threshold_question, all_scores_file_path, merged_file_path, matched_file_path, unmatched_file_path)


Do not match: Model 1 datatype: multipleChoice Model 2 datatype: singleChoice
Do not match: Model 1 datatype: text Model 2 datatype: double
Do not match: Model 1 datatype: double Model 2 datatype: singleChoice
Process completed successfully.
CPU times: user 1min 13s, sys: 926 ms, total: 1min 14s
Wall time: 1min 15s
