In [None]:
import pandas as pd

# Annotate the per-cell metadata table with experiment-level ('day', 'Sorting')
# and patient-level columns taken from the patient overview sheet.

# Load the CSV files
# NOTE(review): the recorded output of this cell shows a (truncated) warning
# pointing at this read_csv call. If it is a DtypeWarning about mixed-type
# columns, pass an explicit dtype=... (or low_memory=False) -- confirm first.
meta_old = pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/Master_dissertation_data/metadata_after_merge.csv",
                        header=0, sep=",", index_col=0)

patientoverview = pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/Master_dissertation_data/Metadata_table_clean.csv",
                              header=0, sep=";")

# Save the original index: DataFrame.merge returns a frame with a fresh
# integer index, so the original row labels are re-attached at the end.
meta_old_index = meta_old.index

# Work on a copy of meta_old so the raw table stays untouched
meta = meta_old.copy()

# Extract SABxxx from the 'orig.ident' column in meta.
# expand=False returns a Series (one capture group) rather than a one-column DataFrame.
meta['exp'] = meta['orig.ident'].str.extract(r'(SAB\d{3})', expand=False)

# Attach the experiment-level columns with a left join on 'exp'.
exp_info = patientoverview[['exp', 'day', 'Sorting']].drop_duplicates()
meta = meta.merge(exp_info, on='exp', how='left')

# A left join never drops left rows, so a different row count means some 'exp'
# matched several distinct ('day', 'Sorting') rows and rows were duplicated.
# Fail here -- before the slow per-row loop and before anything is saved --
# instead of only printing a warning and leaving `meta` with the wrong index.
if len(meta) != len(meta_old_index):
    raise ValueError(
        f"Row count mismatch! meta has {len(meta)} rows, but the original had {len(meta_old_index)}. "
        "patientoverview has more than one distinct ('day', 'Sorting') combination "
        "for at least one 'exp' present in meta."
    )

# Build both lookups in a single pass over patientoverview.
# When a key occurs more than once, the last row wins.
patient_dict = {}  # (exp, MultiSeq.ID) -> overview row
exp_dict = {}      # exp -> overview row, used when MultiSeq.ID is missing
for _, overview_row in patientoverview.iterrows():
    patient_dict[(overview_row['exp'], overview_row['MultiSeq.ID'])] = overview_row
    exp_dict[overview_row['exp']] = overview_row

# Patient-level columns copied from patientoverview into meta
PATIENT_COLS = ["Patient", "sample ID", "sample type", "sample nr", "Nr cells sorted"]

# Ensure new columns exist before filling them in the loop
for col in PATIENT_COLS:
    meta[col] = pd.NA  # Initialize columns to avoid missing key errors

# Loop through each row in meta and assign matching values.
# NOTE(review): this is a per-row Python loop; it could be restricted to the
# distinct (exp, MultiSeq.ID) pairs, but the NaN-key lookup behaviour would
# have to be re-verified before doing so.
for index, row in meta.iterrows():
    exp_val = row['exp']
    multi_val = row['MultiSeq.ID']

    # If MultiSeq.ID is "Negative" or "Doublet", keep NA values
    if multi_val in ["Negative", "Doublet"]:
        continue  # Skip to the next iteration

    key = (exp_val, multi_val)

    # First try exact match on exp + MultiSeq.ID
    if key in patient_dict:
        patient_row = patient_dict[key]
    # If MultiSeq.ID is NaN, use only exp to fetch values
    elif pd.isna(multi_val) and exp_val in exp_dict:
        patient_row = exp_dict[exp_val]
    else:
        continue  # Skip if no matching data is found

    # Copy values to the meta table
    for col in PATIENT_COLS:
        meta.at[index, col] = patient_row[col]

# Row count was verified right after the merge, so the original labels line up.
meta.index = meta_old_index

# Print unique exp values with the newly added columns for verification
print(meta[['exp', 'MultiSeq.ID', 'Patient', 'sample ID', 'sample type', 'sample nr', 'Nr cells sorted', "Sorting", "day"]].drop_duplicates())


  meta_old = pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/Master_dissertation_data/metadata_after_merge.csv",


                              exp MultiSeq.ID  Patient    sample ID  \
SAB001_AAACGGGGTGTGGCTC-1  SAB001     Doublet     <NA>         <NA>   
SAB001_AAACGGGTCTCTAAGG-1  SAB001    Hashtag7     HC5   230309-3012   
SAB001_ACATCAGCATGATCCA-1  SAB001    Hashtag5  AML0028   AMBM120215   
SAB001_ACTGTCCCATCACAAC-1  SAB001    Hashtag1      HC1  230309-1218   
SAB001_AGAATAGTCTGCTGCT-1  SAB001    Negative     <NA>         <NA>   
SAB001_GTAACGTGTGGACGAT-1  SAB001    Hashtag2      HC2  230313-3523   
SAB002_AAACCTGAGAACAACT-1  SAB002         NaN  AML0028   AMBM120215   
SAB003_AAACCTGAGAGCAATT-1  SAB003         NaN  AML0028   AMBM120215   
SAB004_AAACCTGAGACAAGCC-1  SAB004         NaN  AML1744   AMBM101017   
SAB005_AAACCTGAGACTAAGT-1  SAB005         NaN  AML1744   AMBM101017   
SAB006_AAACGGGCACAGACTT-1  SAB006     Doublet     <NA>         <NA>   
SAB006_AACACGTAGCTGCCCA-1  SAB006    Hashtag3  AML0009  AMPB170815    
SAB006_AAGGAGCTCTCCGGTT-1  SAB006    Hashtag5  AML0028   AMBM120215   
SAB006

In [None]:
# Persist the annotated metadata table as CSV.
# index=True writes the row labels of `meta` as the first column of the file.
final_metadata_path = r"/content/drive/MyDrive/Colab Notebooks/Master_dissertation_data/Final_full_metadata.csv"
meta.to_csv(final_metadata_path, index=True)

# Only reached when to_csv returned without raising.
print("Updated metadata saved successfully!")

Updated metadata saved successfully!
