In [1]:
import numpy as np
import scipy
import matplotlib.pyplot as plt
import pandas as pd
import xarray as xr
import os
import yaml
import itertools
import shutil
from datetime import datetime
from pathlib import Path
from openpyxl.styles import PatternFill
from openpyxl import load_workbook



In [2]:
# Load the BICC site list only; nothing else is read here (the removed
# GISP2-GRIP linkage included).

# Only the first line of the parameter file is parsed, and it has to define
# 'list_sites'.
params = '/Users/quinnmackay/Documents/GitHub/BICC/Paleochrono BICC Work/Paleochrono BICC Experiment/BICC2025/parameters.yml'
with open(params, 'r') as f:
    first_line = f.readline()

params_load = yaml.safe_load(first_line)
list_sites = params_load['list_sites']

# Every unordered pair of sites, labelled "A-B" in list_sites order.
pairs = list(
    itertools.starmap("{}-{}".format, itertools.combinations(list_sites, 2))
)

pairs

['DF-EDC',
 'DF-EDML',
 'DF-NGRIP',
 'DF-GRIP',
 'DF-WDC',
 'DF-GISP2',
 'EDC-EDML',
 'EDC-NGRIP',
 'EDC-GRIP',
 'EDC-WDC',
 'EDC-GISP2',
 'EDML-NGRIP',
 'EDML-GRIP',
 'EDML-WDC',
 'EDML-GISP2',
 'NGRIP-GRIP',
 'NGRIP-WDC',
 'NGRIP-GISP2',
 'GRIP-WDC',
 'GRIP-GISP2',
 'WDC-GISP2']

In [3]:
# Build one wide table of tie points: one column per core, one row per horizon.
# Each pair file contributes (depth in core, depth in comparison_core) rows, which are
# either attached to an existing row whose depth lies within 0.1 or appended as a new row.
# NOTE(review): this empty frame and the 'error' column added two lines below are
# discarded when big_table is replaced by the first pair's data further down; after that
# the 'error' column only exists once a 'multi-core validated' label has been assigned.
big_table = pd.DataFrame()
# NOTE(review): error_margin is never read in this cell; every tolerance check below
# uses the literal 0.1 instead. Confirm which value is intended.
error_margin = 0.2
big_table['error'] = ''

for core in list_sites: # loop through each core
    for comparison_core in list_sites: # candidate partner core (every site; filtered by the test below)
        pair = f"{core}-{comparison_core}"
        # `pairs` holds each combination once, in list_sites order, so this test also
        # rejects the reversed ordering of a pair.
        # NOTE(review): nothing here checks that the file exists on disk; a missing
        # pair folder makes open() raise rather than being skipped.
        if core != comparison_core and pair in pairs: # distinct cores whose ordered label is listed in `pairs`
            file = Path(f'/Users/quinnmackay/Documents/GitHub/BICC/Paleochrono BICC Work/Paleochrono BICC Experiment/BICC2025/{pair}/iceice_synchro_horizons.txt')

            # Count the leading lines that start with '#'.
            with open(file, 'r') as f:
                skip = 0
                for line in f:
                    if line.startswith('#'):
                        skip += 1
                    else:
                        break

            # skiprows=skip+1 drops those leading '#' lines plus one further line
            # (presumably a column-header row -- TODO confirm). Only the first two
            # tab-separated columns are kept and named after the two cores.
            load_data = pd.read_csv(file, sep='\t', comment='#', skiprows=skip+1, names=[core, comparison_core], usecols=[0,1])

            # The very first pair seeds the table.
            if core == list_sites[0] and comparison_core == list_sites[1]:
                big_table = load_data.copy()
                continue
        
            # Case 1: comparison_core has no column yet, so rows can only be matched on
            # the depth in `core`.
            # NOTE(review): no branch handles a `core` that is itself missing from
            # big_table.columns; such a pair's data would not be merged at all.
            if core in big_table.columns and comparison_core not in big_table.columns:
                for idx, row in load_data.iterrows():
                    depth_core = row[core]

                    # Compute absolute distance from current depth
                    diff = (big_table[core] - depth_core).abs()

                    # Mask for rows within ±0.1
                    mask = diff <= 0.1

                    # If any row is close enough, pick the closest and update it
                    # (the first such assignment also creates the comparison_core column)
                    if mask.any():
                        closest_idx = diff[mask].idxmin()  # index of closest match
                        big_table.loc[closest_idx, comparison_core] = row[comparison_core]

                    if not mask.any():
                        # If no close match, append new row
                        new_row = pd.Series({core: depth_core, comparison_core: row[comparison_core]})
                        big_table = pd.concat([big_table, new_row.to_frame().T], ignore_index=True)
                continue
                
            # Case 2: both cores already have a column, so each tie point is looked up
            # independently on both depths.
            if core in big_table.columns and comparison_core in big_table.columns:
                for idx, row in load_data.iterrows():
                    depth1_core = row[core]
                    depth2_core = row[comparison_core]

                    # Compute absolute distance from current depth
                    diff1 = (big_table[core] - depth1_core).abs()
                    diff2 = (big_table[comparison_core] - depth2_core).abs()

                    # Mask for rows within ±0.1
                    mask1 = diff1 <= 0.1
                    mask2 = diff2 <= 0.1

                    # Both depths have a nearby row
                    if mask1.any() and mask2.any():
                        closest_idx1 = diff1[mask1].idxmin()  # index of closest match
                        closest_idx2 = diff2[mask2].idxmin()  # index of closest match

                        # The two depths land on different rows: fill only the cells
                        # that are still empty; existing values are left alone.
                        if closest_idx1 != closest_idx2:
                            if pd.isna(big_table.loc[closest_idx1, comparison_core]):
                                big_table.loc[closest_idx1, comparison_core] = row[comparison_core]
                            if pd.isna(big_table.loc[closest_idx2, core]):
                                big_table.loc[closest_idx2, core] = row[core]
                            continue

                        # Both depths land on the same row: the tie point is already
                        # present, so the row is only labelled.
                        if closest_idx1 == closest_idx2:
                            #existing match
                            big_table.loc[closest_idx1, 'error'] = 'multi-core validated'
                            continue
                    
                    # Exactly one of the two depths has a nearby row.
                    # NOTE(review): unlike the branch above, these assignments are not
                    # guarded by pd.isna, so a value already stored in the matched row
                    # is overwritten. Confirm that this is intended.
                    if mask1.any() != mask2.any():
                        if mask1.any():
                            closest_idx = diff1[mask1].idxmin()  # index of closest match
                            big_table.loc[closest_idx, comparison_core] = row[comparison_core]
                        if mask2.any():
                            closest_idx = diff2[mask2].idxmin()  # index of closest match
                            big_table.loc[closest_idx, core] = row[core]
                        continue

                    if not mask1.any() and not mask2.any():
                        # If no close match, append new row
                        new_row = pd.Series({core: row[core], comparison_core: row[comparison_core]})
                        big_table = pd.concat([big_table, new_row.to_frame().T], ignore_index=True)
                        continue

                continue


In [4]:
# Collapse rows that describe the same horizon: for every core, any other row
# whose depth in that core lies within 0.1 of the current row is folded into
# the current row and then removed from the table.
big_table.reset_index(drop=True, inplace=True)

out = 0      # rows folded into another row
change = 0   # folds whose absorbed row carried the 'multi-core validated' label

for core in list_sites:
    # iterrows() still yields rows that were dropped after the iteration began,
    # hence the membership test at the top of the loop body.
    for idx, row in big_table.iterrows():
        if idx not in big_table.index:
            continue

        diff = (big_table[core] - row[core]).abs()
        mask = diff <= 0.1   # rows within 0.1 of this row's depth in `core`
        mask[idx] = False    # a row is never merged with itself
        if not mask.any():
            continue

        # One pass per candidate, nearest first.
        for _ in range(mask.sum()):
            candidates = diff[mask]
            if candidates.empty:
                break

            closest_idx = candidates.idxmin()
            mask[closest_idx] = False   # handled from here on, whichever path is taken
            if closest_idx not in big_table.index:
                continue

            # Fill the gaps of the kept row from the row about to be removed;
            # values the kept row already has are never replaced.
            for col in row.index:
                donor_value = big_table.loc[closest_idx, col]
                if pd.isna(big_table.loc[idx, col]) and not pd.isna(donor_value):
                    big_table.loc[idx, col] = donor_value

            # The validation label is carried over to the kept row.
            if big_table.loc[closest_idx, 'error'] == 'multi-core validated':
                change += 1
                big_table.loc[idx, 'error'] = 'multi-core validated'

            out += 1
            big_table = big_table.drop(index=closest_idx)

print(f'Number of row merges: {out}')
print(f"Number of changes to 'multi-core validated': {change}")
        

Number of row merges: 109
Number of changes to 'multi-core validated': 63


In [5]:
# Write the full tie-point table to Excel, sorted by depth in the first site,
# then colour each row according to the label in its 'error' column.
excel_path = '/Users/quinnmackay/Desktop/temp/big_bicc_table.xlsx'

# Make sure the output folder exists so to_excel does not fail on a fresh machine.
Path(excel_path).parent.mkdir(parents=True, exist_ok=True)

sorted_big_table = big_table.sort_values(by=list_sites[0]).reset_index(drop=True)
sorted_big_table.to_excel(excel_path, index=False)

# Row fill per 'error' label.
# NOTE(review): only 'multi-core validated' is assigned in the cells above; the
# other two labels look like leftovers from an earlier version -- confirm.
error_colors = {
    'new_row_from_separate_matches': PatternFill(start_color='FF0000', end_color='FF0000', fill_type='solid'),      # Red
    'new_row_from_partial': PatternFill(start_color='ffbb00', end_color='ffbb00', fill_type='solid'),           # Amber
    'multi-core validated': PatternFill(start_color='00FF00', end_color='00FF00', fill_type='solid'),          # Green
}

# Re-open the file that was just written so the fills can be applied.
wb = load_workbook(excel_path)
ws = wb.active

# Position of the 'error' column in the sheet (openpyxl columns are 1-based).
# Looked up on the frame that was actually written, so the index always
# matches the sheet layout.
error_col_idx = list(sorted_big_table.columns).index('error') + 1

# Row 1 is the header, so data starts at row 2.
for row_idx in range(2, ws.max_row + 1):
    cell = ws.cell(row=row_idx, column=error_col_idx)
    error_type = cell.value
    if error_type and error_type in error_colors:
        fill = error_colors[error_type]
        # Colour every cell in the row except the label cell itself.
        for col_idx in range(1, ws.max_column + 1):
            if col_idx != error_col_idx:
                ws.cell(row=row_idx, column=col_idx).fill = fill

ws.freeze_panes = 'A2'   # keep the header row visible while scrolling
wb.save(excel_path)

In [6]:
# Write the reduced table (rows with at least three depth values) to Excel and
# colour each row according to the label in its 'error' column.
excel_path = '/Users/quinnmackay/Desktop/temp/3_min_big_bicc_table.xlsx'

# Make sure the output folder exists so to_excel does not fail on a fresh machine.
Path(excel_path).parent.mkdir(parents=True, exist_ok=True)

# Count only the depth columns toward the three-value minimum. Without `subset`
# the 'error' label itself counts as a value, so a row holding two depths plus
# a label would be kept.
depth_cols = [col for col in sorted_big_table.columns if col != 'error']
sorted_big_table = sorted_big_table.dropna(thresh=3, subset=depth_cols, axis=0)
sorted_big_table = sorted_big_table.sort_values(by=list_sites[0]).reset_index(drop=True)
sorted_big_table.to_excel(excel_path, index=False)

# Row fill per 'error' label.
# NOTE(review): only 'multi-core validated' is assigned in the cells above; the
# other two labels look like leftovers from an earlier version -- confirm.
error_colors = {
    'new_row_from_separate_matches': PatternFill(start_color='FF0000', end_color='FF0000', fill_type='solid'),      # Red
    'new_row_from_partial': PatternFill(start_color='ffbb00', end_color='ffbb00', fill_type='solid'),           # Amber
    'multi-core validated': PatternFill(start_color='00FF00', end_color='00FF00', fill_type='solid'),          # Green
}

# Re-open the file that was just written so the fills can be applied.
wb = load_workbook(excel_path)
ws = wb.active

# Position of the 'error' column in the sheet (openpyxl columns are 1-based).
# Looked up on the frame that was actually written, so the index always
# matches the sheet layout.
error_col_idx = list(sorted_big_table.columns).index('error') + 1

# Row 1 is the header, so data starts at row 2.
for row_idx in range(2, ws.max_row + 1):
    cell = ws.cell(row=row_idx, column=error_col_idx)
    error_type = cell.value
    if error_type and error_type in error_colors:
        fill = error_colors[error_type]
        # Colour every cell in the row except the label cell itself.
        for col_idx in range(1, ws.max_column + 1):
            if col_idx != error_col_idx:
                ws.cell(row=row_idx, column=col_idx).fill = fill

ws.freeze_panes = 'A2'   # keep the header row visible while scrolling
wb.save(excel_path)