In [14]:
import os
import tarfile
import numpy as np
# import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import corner 
from tqdm import tqdm
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.gridspec as gridspec
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

# Palette used for the individual models (colors that Pablo likes).
colors = [
    "#1f77b4",  # Vivid blue
    "#ff7f0e",  # Bright orange
    "#2ca02c",  # Rich green
    "#d62728",  # Strong red
    "#9467bd",  # Deep purple
    # "#8c564b" (brownish-pink) is intentionally left out
    "#e377c2",  # Pink
    "#7f7f7f",  # Medium gray
    "#bcbd22",  # Lime green
    "#17becf",  # Cyan
    "#393b79",  # Dark blue
    "#637939",  # Olive green
    "#8c6d31",  # Bronze
    # "#843c39" (dark red) and "#ad494a" (reddish brown) are intentionally left out
    "#d6616b",  # Soft red
    "#e7ba52",  # Golden yellow
    "#7b4173",  # Dark purple
    "#a55194",  # Mauve
    "#ce6dbd",  # Light purple
]

# Palette indexed below to pick the train / test colors.
colors_sets = [
    "#ff7f0e",
    "#1f77b4",
    "#2ca02c",
    "#d62728",
    "#9467bd",
    "#8c564b",
    "#e377c2",
    "#7f7f7f",
    "#bcbd22",
    "#17becf",
    "#1f77b4",  # muted blue
    "r",
]

# Matplotlib marker codes.
markers = [
    "o",  # Circle
    "^",  # Triangle up
    "s",  # Square
    "P",  # Plus (filled)
    "*",  # Star
    "X",  # X (filled)
    "D",  # Diamond
    "H",  # Hexagon
]

# Same list object as `colors`, under the name used for model plots.
colors_models = colors

# Element singled out in this notebook (Z = 50, symbol "Sn").
Selected_element = 50
Selected_element_name = "Sn"

# Per-split plot styling: color, marker, marker size and transparency.
color_train = colors_sets[9]
color_validation = "orange"
color_test = colors_sets[3]

marker_train = "s"
marker_validation = "*"
marker_test = "o"

size_train = 30
size_validation = 80
size_test = 35

alpha_train = 0.8
alpha_validation = 0.9
alpha_test = 0.4

# All the models that provide both charge radii and masses.
models = [
    "ME2", "MEdelta", "PC1", "NL3S", "SKMS",
    "SKP", "SLY4", "SV", "UNEDF0", "UNEDF1",
]
# Models actually included in the analysis (currently every model above).
models_selected = [
    "ME2", "MEdelta", "PC1", "NL3S", "SKMS",
    "SKP", "SLY4", "SV", "UNEDF0", "UNEDF1",
]
key_list = models_selected

# Names of the observables; only `num_properties` of them are handled here.
heterogeneous_data_type = [
    "BE", "ChRad", "CPn", "CPp", "PEn", "PEp",
    "QDB2n", "QDB2p", "QDB4n", "QDB4p", "MRadN", "MRadP",
]
num_properties = 2

### Some functions that I use for data processing

In [3]:
"""
The functions that I define in here is basically the separting algorithm in which we have used
for separting our dataset into training, validating, and testing region
"""
def separate_points_random(list1,random_chance):
    """
    Randomly split the entries of list1 into a train group and a test group.

    One number is drawn from np.random.rand() per entry, in order; the entry
    goes to the train group when the draw is <= random_chance and to the test
    group otherwise.

    :param list1: Indexable sequence of points.
    :param random_chance: Threshold each random draw is compared against.
    :return: Four numpy arrays - train points, test points, and the positions
             (indices into list1) of the train and of the test points.
    """
    kept_points, kept_positions = [], []
    held_points, held_positions = [], []

    for position in range(len(list1)):
        entry = list1[position]
        # Exactly one draw per entry, so the random stream is consumed in order.
        if np.random.rand() <= random_chance:
            target_points, target_positions = kept_points, kept_positions
        else:
            target_points, target_positions = held_points, held_positions
        target_points.append(entry)
        target_positions.append(position)

    return (
        np.array(kept_points),
        np.array(held_points),
        np.array(kept_positions),
        np.array(held_positions),
    )

def separate_points_distance(list1, list2, distance):
    """
    Split the points of list1 by whether they lie near any point of list2.

    A point is "near" when its Euclidean distance (np.linalg.norm of the
    difference) to at least one point of list2 is <= distance.

    :param list1: Indexable sequence of points to split.
    :param list2: Iterable of reference points.
    :param distance: The threshold distance to determine proximity.
    :return: Four numpy arrays - near points (train), remaining points (test),
             and the positions in list1 of the train and of the test points.
    """
    near_points, near_positions = [], []
    far_points, far_positions = [], []

    for position in range(len(list1)):
        candidate = list1[position]
        # any() stops at the first reference point that is close enough.
        is_near = any(
            np.linalg.norm(np.array(candidate) - np.array(reference)) <= distance
            for reference in list2
        )
        if is_near:
            near_points.append(candidate)
            near_positions.append(position)
        else:
            far_points.append(candidate)
            far_positions.append(position)

    return (
        np.array(near_points),
        np.array(far_points),
        np.array(near_positions),
        np.array(far_positions),
    )

def separate_points_distance_allSets(list1, list2, distance1, distance2):
    """
    Split the points of list1 into three groups by proximity to list2.

    A point goes to the training group when it is within distance1 of at least
    one point of list2; otherwise to the validation group when it is within
    distance2 of at least one point of list2; otherwise to the test group.
    Distances are Euclidean (np.linalg.norm of the difference).

    :param list1: Indexable sequence of points to split.
    :param list2: Iterable of reference points.
    :param distance1: Threshold distance for the training group.
    :param distance2: Threshold distance for the validation group.
    :return: Six numpy arrays - train, validation and test points, followed by
             the positions in list1 of the train, validation and test points.
    """
    groups = {name: ([], []) for name in ("train", "validation", "test")}

    def has_neighbour(candidate, radius):
        # True as soon as one reference point lies within `radius`.
        return any(
            np.linalg.norm(np.array(candidate) - np.array(reference)) <= radius
            for reference in list2
        )

    for position in range(len(list1)):
        candidate = list1[position]
        # The wider radius is only examined when the tighter one fails.
        if has_neighbour(candidate, distance1):
            label = "train"
        elif has_neighbour(candidate, distance2):
            label = "validation"
        else:
            label = "test"
        members, positions = groups[label]
        members.append(candidate)
        positions.append(position)

    return (
        np.array(groups["train"][0]),
        np.array(groups["validation"][0]),
        np.array(groups["test"][0]),
        np.array(groups["train"][1]),
        np.array(groups["validation"][1]),
        np.array(groups["test"][1]),
    )

In [5]:
def NZ_synchronization(models_data_sets, models_selected):
    """
    Find the isotopes (N, Z) that are present in every selected model.

    The isotopes of the first selected model are used as the starting
    candidates; the candidates are then filtered against each selected model
    in turn, keeping only the (N, Z) pairs that the model also contains.

    :param models_data_sets: Mapping model name -> mapping with at least the
        entries 'N' and 'Z' (array-likes of equal length offering ``tolist()``,
        e.g. pandas Series).
    :param models_selected: Non-empty sequence of model names to include.
    :return: List of length-2 numpy arrays ``[N, Z]``, one per common isotope.
        The list is empty when no isotope is shared by all selected models.
    """
    # Candidates: the isotopes of the first model, one [N, Z] row each.
    first_model = models_data_sets[models_selected[0]]
    filtered_NZ = np.array([first_model['N'].tolist(), first_model['Z'].tolist()]).T

    for model in models_selected:
        # Hoisted out of the inner loop: the model's columns do not change.
        model_N = np.asarray(models_data_sets[model]['N'])
        model_Z = np.asarray(models_data_sets[model]['Z'])
        # Keep the nuclei that are also contained in this model.
        filtered_NZ_new = [
            isotope for isotope in filtered_NZ
            if ((isotope[0] == model_N) & (isotope[1] == model_Z)).any()
        ]
        # Updated candidates for the next model. This array is empty (and
        # simply iterates zero times) when nothing is left in common.
        filtered_NZ = np.array(filtered_NZ_new)

    # NOTE: the unused DataFrame that used to be built here indexed
    # filtered_NZ.T[0] and raised IndexError on an empty intersection.
    return filtered_NZ_new

def filtered_NZ_extraction(filtered_NZ):
    """
    Select the valid domain X for the model to work on.

    An entry of filtered_NZ is kept when its first two components are both
    >= 8 and both even (in this notebook these components are the neutron and
    proton numbers).

    :param filtered_NZ: Iterable of indexable entries with at least two components.
    :return: numpy array holding the kept entries, in their original order.
    """
    kept = []
    for nucleus in filtered_NZ:
        first, second = nucleus[0], nucleus[1]
        large_enough = (first >= 8) & (second >= 8)
        even_even = (first % 2 == 0) & (second % 2 == 0)
        if large_enough & even_even:
            kept.append(nucleus)
    return np.array(kept)

def selected_models_data_sets_extraction(models_data_sets, models_selected, filtered_NZ_df, property):
    """
    Build a dataframe of the models' predictions on the filtered isotopes.

    The result starts as a copy of filtered_NZ_df and gains one column per
    selected model, holding that model's value of `property` for the isotope
    of the same row.

    :param models_data_sets: Mapping model name -> data convertible to a
        DataFrame with the columns 'N', 'Z' and `property`.
    :param models_selected: Sequence of model names; one output column each.
    :param filtered_NZ_df: DataFrame with the columns 'N' and 'Z' defining the
        rows of the result. It is not modified.
    :param property: Name of the column to extract from every model (e.g. 'BE').
    :return: DataFrame with the columns of filtered_NZ_df followed by one
        column per model. An isotope that a model does not contain gets NaN in
        that model's column.
    """
    # Explicit copy so that adding columns can never write into the caller's frame.
    selected_models_data_sets = pd.DataFrame(filtered_NZ_df).copy()
    isotope_keys = filtered_NZ_df[['N', 'Z']]
    for model in models_selected:
        model_df = pd.DataFrame(models_data_sets[model])[['N', 'Z', property]]
        # One row per isotope, so the left merge below cannot add rows.
        model_df = model_df.drop_duplicates(subset = ['N', 'Z'])
        # A left merge keeps one output row per input row, in the input order.
        # The previous inner merge dropped the isotopes missing from a model,
        # which shifted every later value onto the wrong isotope.
        merged_df = pd.merge(isotope_keys, model_df, on = ['N', 'Z'], how = 'left')
        # Assign by position: merged_df has a fresh 0..n-1 index that need not
        # match the index of filtered_NZ_df.
        selected_models_data_sets[model] = merged_df[property].to_numpy()
    return selected_models_data_sets


In [6]:
""""
This cell helps you to extract a dictionary that contains all the nuclei with BE and charge
radius predictions of all the models we have"
"""
models_data_sets = {}
for model in models:
    Data_Values = pd.read_hdf("../data./selected_data.h5", key = model)
    models_data_sets[model] = {"N" : Data_Values["N"], "Z" : Data_Values["Z"], \
                               "BE" : Data_Values["BE"], 'ChRad': Data_Values['ChRad']}

"""
This part extracts all the isotopes that exist in all models and satisfy our condition: In this case, 
N and Z is bigger than 8 and are even.
"""
merged_NZ = NZ_synchronization(models_data_sets, models_selected)
filtered_NZ = filtered_NZ_extraction(merged_NZ)
filtered_NZ_df = pd.DataFrame({'N' : filtered_NZ.T[0], 'Z' : filtered_NZ.T[1]})

In [7]:
"""
This is the data that I store in the "data" folder on the "Model0thorgonalization" repo - the
3-11-2025-check-point

"""
truth_CR_df = pd.read_csv(r'C:/Users/congn/OneDrive/Desktop/An Le Materials/ModelOrthogonalization/data/charge_radii.csv') # I download the radii data separately
"""
This is just to rename the columns so that I can merge data set later
"""
truth_CR_df.rename(columns = {'z' : 'Z', 'n' : 'N'}, inplace = True) 

In [8]:
# The stable isotopes are the reference points that we use for separating our
# data into train, validation, and test sets.
# NOTE(review): the file is presumably laid out with the same column order as
# the other (N, Z) arrays in this notebook - confirm.
stable_coordinates_full=np.loadtxt("../Stable-Isotopes.txt")
# Apply the same filter as for the model domain: keep rows whose first two
# entries are both 8 or above and both even.
stable_coordinates = filtered_NZ_extraction(stable_coordinates_full) 

In [9]:
# Dataframes of the models' predictions on the common isotope domain:
# one for binding energies ('BE') and one for charge radii ('ChRad').
selected_models_data_sets_mass = selected_models_data_sets_extraction(
    models_data_sets, models_selected, filtered_NZ_df, 'BE')
selected_models_data_sets_radius = selected_models_data_sets_extraction(
    models_data_sets, models_selected, filtered_NZ_df, 'ChRad')

# ---- Binding energies: attach the experimental values ----
# "../data/" replaces the former "../data./": the stray dot only resolved on
# Windows, which strips trailing periods from path components.
truth_BE = pd.read_hdf("../data/selected_data.h5", key = 'AME2020')
truth_BE_df = pd.DataFrame(truth_BE)
# The inner merge keeps only isotopes that have an experimental value; the
# experimental column is then renamed from 'BE' to 'truth'.
selected_models_data_sets_mass = pd.merge(
    selected_models_data_sets_mass, truth_BE[['N', 'Z', 'BE']],
    on = ['N', 'Z'], how = 'inner',
).rename(columns = {'BE' : 'truth'})
# Move 'truth' to the third position: N, Z, truth, then the models.
cols_mass = list(selected_models_data_sets_mass.keys())
cols_mass.remove('truth')
cols_mass.insert(2, 'truth')
selected_models_data_sets_mass = selected_models_data_sets_mass[cols_mass]

# After merging with the experimental data, redefine the isotope domain.
filtered_NZ_df_mass = selected_models_data_sets_mass[['N', 'Z']]
filtered_NZ_mass = np.array(filtered_NZ_df_mass)

# ---- Charge radii: same steps, using the 'radius_val' column ----
selected_models_data_sets_radius = pd.merge(
    selected_models_data_sets_radius, truth_CR_df[['N', 'Z', 'radius_val']],
    how = 'inner', on = ['N', 'Z'],
).rename(columns = {'radius_val' : 'truth'})
# Move 'truth' to the third position: N, Z, truth, then the models.
cols_radius = list(selected_models_data_sets_radius.keys())
cols_radius.remove('truth')
cols_radius.insert(2, 'truth')
selected_models_data_sets_radius = selected_models_data_sets_radius[cols_radius]
# Drop any row that contains a NaN value and renumber the rows.
selected_models_data_sets_radius = selected_models_data_sets_radius.dropna().reset_index(drop = True)

# After merging with the experimental data, redefine the isotope domain.
filtered_NZ_df_radius = selected_models_data_sets_radius[['N', 'Z']]
filtered_NZ_radius = np.array(filtered_NZ_df_radius)


In [None]:
"""
After gettting the dataset that we want to work with, we will want to separate the data into training, validating, and testing sets
This could be achieved with the separating algorithm function that I defined above. The code above extracts these data for charge radii
"""

distance1=1.75
distance2=2.5

training_set_radius, validation_set_radius, test_set_radius, train_coordinates_radius, validation_coordinates_radius,test_coordinates_radius=separate_points_distance_allSets(filtered_NZ_radius, stable_coordinates, distance1,distance2)

In [None]:
"""
After gettting the dataset that we want to work with, we will want to separate the data into training, validating, and testing sets
This could be achieved with the separating algorithm function that I defined above. The code above extracts these data for mass
"""

distance1=2
distance2=3

training_set_mass, validation_set_mass, test_set_mass, train_coordinates_mass, validation_coordinates_mass,test_coordinates_mass=separate_points_distance_allSets(filtered_NZ_mass, stable_coordinates, distance1,distance2)

### Ideas for you to consider for the pybmc Dataset class
+ What I want to achieve is an efficient class that can swiftly extract: a dataset that contains all the models' predictions extracted from the source; the isotopes that exist in all models (i.e. the domain X that the model mixing will work on); a function that extracts a subset of this domain (depending on the condition that we want to impose); and the data-separation algorithm that extracts the training, validation, and test datasets. In this notebook, I have separate functions that provide all of these utilities, but they are still rudimentary functions that need a lot of individual adjustment.
+ A few problems with this: I have 2 data files that I extract from: selected_data.h5 (contains all the models) and charge_radii.csv (contains the experimental data for charge radii). I do not yet know how to write a function in the Dataset class that can extract both of these effectively and then merge the experimental data with the models (I had to write a very long piece of code in cell 7 to do this). I also do not know how to generalize filtered_NZ_extraction (which corresponds to the get_subset method) so that it works with different conditions. Lastly, what I wrote here only accounts for the case of nuclear models, so it does not generalize to cases in which users bring in their own models and then perform this data organization.
+ You can start working on this and give me any ideas on how to fix it (even if you think that something is not necessary, or that we should not approach the problem this way, please discuss).