In [126]:
import numpy as np
import scipy
import matplotlib.pyplot as plt
import pandas as pd
import xarray as xr
import os
import yaml
import itertools
import shutil
from datetime import datetime
from pathlib import Path
from openpyxl.styles import PatternFill
from openpyxl import load_workbook
from collections import defaultdict
import sys
from openpyxl.styles import Border, Side
from openpyxl.styles import Alignment, Font


In [None]:
# cores list of sites for AA

list_sites = ['EDC', 'WDC', 'EDML', 'DF', 'TALDICE']
project = 'AA_14Cols'

# get all link combos
pairs = [f"{a}-{b}" for a, b in itertools.combinations(list_sites, 2)]

error_margin = 0.1
big_error_margin = 0.25
pairs

['EDC-WDC',
 'EDC-EDML',
 'EDC-DF',
 'EDC-TALDICE',
 'WDC-EDML',
 'WDC-DF',
 'WDC-TALDICE',
 'EDML-DF',
 'EDML-TALDICE',
 'DF-TALDICE']

In [None]:
big_table = pd.DataFrame()

for core in list_sites: # loop through each core
    for comparison_core in list_sites: # loop through each core other than the initial load
        pair = f"{core}-{comparison_core}"
        if core != comparison_core and pair in pairs: # make sure not the same core and we skip non-existent linkages
            file = Path(f'/Users/quinnmackay/Documents/GitHub/BICC/Antarctic Chronology Accuracy Project/{project}/{pair}/iceice_synchro_horizons.txt')

            if not file.exists():
                continue

            with open(file, 'r') as f:
                skip = sum(1 for line in f if line.startswith('#'))

            load_data = pd.read_csv(file, sep='\t', comment='#', skiprows=skip+1, names=[core, comparison_core], usecols=[0,1])

            # rename to create unique columns for this pair
            load_data = load_data.rename(columns={
                core: f"{pair}_{core}",
                comparison_core: f"{pair}_{comparison_core}"
            })

            # append rows (block)
            big_table = pd.concat([big_table, load_data],
                                  axis=0,
                                  ignore_index=True)


In [129]:
core_groups = defaultdict(list)
matching_groups = defaultdict(list)

for col in big_table.columns:
    suffix = col.split("_")[-1]
    core_groups[suffix].append(col) #group cols by suffix
    
    match = col.split("_")[0]
    core1 = match.split("-")[0]
    core2 = match.split("-")[1]

    if core1 == suffix:
        matching_core = core2
    elif core2 == suffix:
        matching_core = core1

    matching_groups[suffix].append(f"{match}_{matching_core}")

update_check = 0
refresh = 1
while refresh > 0:
    refresh = 0
    for core, assoc_cols in core_groups.items():
        matching_cols = matching_groups[core]

        for col, match_col in zip(assoc_cols, matching_cols):
            for col_check in assoc_cols:
                if col == col_check:
                    continue

                col_updates = {}
                match_updates = {}

                for index, value in big_table[col].items():
                    diff = (big_table[col_check] - value).abs()
                    matching_indices = diff[diff <= error_margin].index

                    for match_idx in matching_indices:
                        col_updates[match_idx] = big_table[col].at[index]
                        match_updates[match_idx] = big_table[match_col].at[index]
            
                for match_idx, new_val in col_updates.items():
                    if pd.isna(big_table.at[match_idx, col]):
                        big_table.at[match_idx, col] = new_val
                        update_check+=1
                        refresh+=1
                for match_idx, new_val in match_updates.items():
                    if pd.isna(big_table.at[match_idx, match_col]):
                        big_table.at[match_idx, match_col] = new_val
            
    print(f'total updates made: {update_check} (+{refresh})')

duplicates_mask = big_table.duplicated(keep='first')
num_dupe = (len(duplicates_mask[duplicates_mask == True]))
big_table_cleaned = big_table.drop_duplicates(keep='first').reset_index(drop=True)
print(f'Reduced table by {num_dupe/len(big_table)*100:.2f}% due to duplicates')

# Do evaluation for errors
within_row_errors = []
within_row_errors_core = []
within_row_big_errors = []
within_row_big_errors_core = []
for index, row in big_table_cleaned.iterrows(): # Iterate over every row in the table
    for core, columns in core_groups.items(): # For each core and its associated list of column names
        values = []
        for col in columns: # Collect the values for this core on this row
            values.append(row[col])
        values = [v for v in values if not pd.isna(v)] # Remove NaN values so they don't interfere with comparison
        if len(values) >= 2:
            diff = abs(max(values) - min(values))
            if diff >= error_margin and index not in within_row_errors:
                within_row_errors.append(index)
                within_row_errors_core.append(core)
                #print(f"Row {index} core {core} values: {values} diff={diff}")
            elif diff >= big_error_margin and index not in within_row_big_errors:
                within_row_big_errors.append(index)
                within_row_big_errors_core.append(core)
print(f'Identified rows with within-row errors, {len(within_row_errors)} total')
print(f'Identified rows with big within-row errors, {len(within_row_big_errors)} total')

rename_map = {}
for suffix, cols in core_groups.items():
    for col in cols:
        rename_map[col] = suffix  # rename to suffix only
big_table_cleaned.rename(columns=rename_map, inplace=True)
print('Renamed all columns to their suffix')

index_v = True
min_cols_per = {}

min_cols_export = 0
excel_path = '/Users/quinnmackay/Desktop/temp/bicc_aa_14cols_merged.xlsx'
big_table_cleaned = big_table_cleaned[big_table_cleaned.notna().sum(axis=1) >= min_cols_export]
big_table_cleaned.to_excel(excel_path, index=index_v)
min_cols_per[excel_path] = min_cols_export
print(f'Exported cleaned table to excel at {excel_path}')

min_cols_export = 3
excel_path = '/Users/quinnmackay/Desktop/temp/bicc_aa_14cols_merged_3plus.xlsx'
filtered_big_table = big_table_cleaned[big_table_cleaned.notna().sum(axis=1) >= min_cols_export]
filtered_big_table.to_excel(excel_path, index=index_v)
min_cols_per[excel_path] = min_cols_export
print(f'Exported 3+ filtered table to excel at {excel_path}')

total updates made: 9618 (+9618)
total updates made: 10061 (+443)
total updates made: 10069 (+8)
total updates made: 10069 (+0)
Reduced table by 45.18% due to duplicates
Identified rows with within-row errors, 116 total
Identified rows with big within-row errors, 12 total
Renamed all columns to their suffix
Exported cleaned table to excel at /Users/quinnmackay/Desktop/temp/bicc_aa_14cols_merged.xlsx
Exported 3+ filtered table to excel at /Users/quinnmackay/Desktop/temp/bicc_aa_14cols_merged_3plus.xlsx


In [130]:
excel_paths = list(min_cols_per.keys())

for excel_path in excel_paths:
    wb = load_workbook(excel_path)
    ws = wb.active
    print(f"Loaded workbook {excel_path} for styling")

    # Load headers
    headers = [cell.value for cell in ws[1]]

   # error coloring and error column
    max_columns = ws.max_column
    if index_v:
        headers_to_color = headers[1:]
        start_col = 2   # Excel column index: 1 = index col, 2 = real col 1
        ws["A1"].value = "Index"
        ws["A1"].font = Font(bold=True)
        ws["A1"].alignment = Alignment(horizontal="center", vertical="center")

        for row_idx in range(2, ws.max_row + 1):
            cell = ws.cell(row=row_idx, column=1)
            cell.font = Font(bold=False)
            cell.alignment = Alignment(horizontal="center", vertical="center")
            if cell.value in within_row_errors:
                cell.fill = PatternFill(start_color="ffd966", end_color="ffd966", fill_type='solid')

                within_idx = within_row_errors.index(cell.value)
                error_cell = ws.cell(row=row_idx, column=max_columns + 1)
                existing_str = str(error_cell.value) if error_cell.value is not None else ""
                if existing_str == "":
                    error_cell.value = within_row_errors_core[within_idx]
                else:
                    error_cell.value = existing_str + "," + within_row_errors_core[within_idx]
            if cell.value in within_row_big_errors:
                cell.fill = PatternFill(start_color="e06666", end_color="e06666", fill_type='solid')

                within_idx = within_row_big_errors.index(cell.value)
                error_cell = ws.cell(row=row_idx, column=max_columns + 1)
                existing_str = str(error_cell.value) if error_cell.value is not None else ""
                if existing_str == "":
                    error_cell.value = within_row_big_errors_core[within_idx]
                else:
                    error_cell.value = existing_str + "," + within_row_big_errors_core[within_idx]
        print(f'Added error corrections for {os.path.basename(excel_path)}')

    else:
        headers_to_color = headers
        start_col = 1

    # Define distinct light colors
    colors = [
        "FFB3BA", "FFDFBA", "FFFFBA", "BAFFC9", "BAE1FF",
        "D7BAFF", "FFC3F7", "BAFFD9", "FFE0BA", "D0BAFF"
    ]

    colors_note = [
    "#FFB3BA", "#FFDFBA", "#FFFFBA", "#BAFFC9", "#BAE1FF",
    "#D7BAFF", "#FFC3F7", "#BAFFD9", "#FFE0BA", "#D0BAFF"]

    # Assign colors to unique header names
    color_map = {}
    for col_name in headers_to_color:
        if col_name not in color_map:
            color_map[col_name] = colors[len(color_map) % len(colors)]

    # Apply fill color to each column
    col_idx = start_col
    for col_name in headers_to_color:

        fill = PatternFill(
            start_color=color_map[col_name],
            end_color=color_map[col_name],
            fill_type='solid'
        )

        # Color header
        ws.cell(row=1, column=col_idx).fill = fill

        # Color all data rows
        for row_idx in range(2, ws.max_row + 1):
            ws.cell(row=row_idx, column=col_idx).fill = fill

        col_idx += 1

    # #Define a thick border on the left side of a column 
    thick_side = Side(border_style="thick", color="000000") 
    thick_left_border = Border(left=thick_side)
    # Loop through columns starting at column 3 (Excel index), applying thick border every 2 columns 
    for col_idx in range(2, ws.max_column + 2, 2): # 3, 5, 7, 9 ... 
        for row_idx in range(1, ws.max_row + 1):
            ws.cell(row=row_idx, column=col_idx).border = thick_left_border

    # Medium thick on bottom of row 1
    medium_border = 1 if index_v else 0
    medium_side = Side(border_style="medium", color="000000")
    bottom_border = Border(bottom=medium_side)
    for col_idx in range(1, ws.max_column + medium_border):
        ws.cell(row=1, column=col_idx).border = bottom_border

    #freeze top row
    ws.freeze_panes = "A2"

    #add error column header
    if index_v:
        error_col_cell = ws.cell(row=1, column=max_columns + 1)
        error_col_cell.value = "Error Cores"
        error_col_cell.font = Font(bold=True)
        error_col_cell.alignment = Alignment(horizontal="center", vertical="center")

    #rename sheet
    ws.title = "Ice Core Depth Comparison"

    #create second page for legend/stats
    legend_sheet = wb.create_sheet(title="Legend & Stats")
    legend_sheet["A1"] = "Legend and Stats"
    legend_sheet["A1"].font = Font(size=14, bold=True)
    legend_sheet["A1"].alignment = Alignment(horizontal="center")
    legend_sheet.merge_cells('A1:D1')  # Merge first row for title

    # Add legend entries (example)
    legend = {
        "ffd966": "Rows flagged with values differing by > 0.1 but all less than < 0.25",
        "e06666": "Rows flagged with values maximum differing by > 0.25",
    }

    legend_row = 3
    legend_sheet[f"A{legend_row}"] = "Legend"
    legend_sheet[f"A{legend_row}"].font = Font(bold=True)

    legend_row +=1
    for key, desc in legend.items():
        cell = legend_sheet[f"A{legend_row}"]
        cell.fill = PatternFill(start_color=key, end_color=key, fill_type='solid')  # Set fill style
        legend_sheet[f"B{legend_row}"] = desc
        legend_row += 1

    # Add some stats (example)
    stats = {
        "Total Rows": ws.max_row - 1,  # assuming ws is your main sheet, -1 for header
        "Total Rows with Errors": f"{len(within_row_errors)} ({len(within_row_errors) / (ws.max_row - 1) * 100:.2f}%)",
        "Total Minor Errors (excl. Major)": f"{len(within_row_errors) - len(within_row_big_errors)} ({(len(within_row_errors) - len(within_row_big_errors)) / (ws.max_row - 1) * 100:.2f}%)",
        "Total Major Errors": f"{len(within_row_big_errors)} ({len(within_row_big_errors) / (ws.max_row - 1) * 100:.2f}%)",
        "Minimum Columns per Row": f"{min_cols_per[excel_path]}",
    }

    stats_row = legend_row + len(legend)
    legend_sheet[f"A{stats_row}"] = "Statistics"
    legend_sheet[f"A{stats_row}"].font = Font(bold=True)

    stats_row +=1
    for stat, val in stats.items():
        legend_sheet[f"A{stats_row}"] = stat
        legend_sheet[f"B{stats_row}"] = val
        stats_row += 1

    #adjust readability
    legend_sheet.column_dimensions['A'].width = 25  # wider column A in legend

    # Save workbook
    wb.save(excel_path)
    print(f"Styled and saved workbook at {excel_path}")


Loaded workbook /Users/quinnmackay/Desktop/temp/bicc_aa_14cols_merged.xlsx for styling
Added error corrections for bicc_aa_14cols_merged.xlsx
Styled and saved workbook at /Users/quinnmackay/Desktop/temp/bicc_aa_14cols_merged.xlsx
Loaded workbook /Users/quinnmackay/Desktop/temp/bicc_aa_14cols_merged_3plus.xlsx for styling
Added error corrections for bicc_aa_14cols_merged_3plus.xlsx
Styled and saved workbook at /Users/quinnmackay/Desktop/temp/bicc_aa_14cols_merged_3plus.xlsx
