## Improve original dataset

In [None]:
import pandas as pd
import numpy as np
import os
import re

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Paths
# DATASET_1_PATH: Excel workbook whose first two sheets are read below
#                 (sheet 0 -> IVM/GV rows, sheet 1 -> native MII siblings).
# DATASET_2_PATH: Excel workbook whose first column is parsed for rescue-MII IDs.
# OUTPUT_PATH:    destination of the merged CSV written in the export step.
# NOTE(review): these are absolute, machine-specific paths; adjust before
# running on another machine.
DATASET_1_PATH = "/home/phd2/Scrivania/CorsoRepo/embryo_valencia/dataset.xlsx"
DATASET_2_PATH = "/home/phd2/Scrivania/CorsoRepo/embryo_valencia/DB_MII_to_blasto.xlsx"
OUTPUT_PATH    = "/home/phd2/Scrivania/CorsoRepo/embryo_valencia/dataset_final_merged.csv"

# Columns config
# Every per-source frame is reduced to exactly these columns, in this order,
# before the frames are concatenated and exported.
NEW_COLUMNS = [
    "dish_well",            # Unique ID, built as "<pdb name>_<well number>"
    "BLASTO_NY",            # 0 = No, 1 = Yes
    "GV",                   # -1=MII (Native), 0-2=IVM (0=GV, 1=MI, 2=MII), 3=Rescue MII
    "sibling",              # 1=MII (Native), 0=IVM/Rescue
    "timing_GVBD",          # hours; NaN when not available
    "timing_extrusion_PB",  # hours; NaN when not available
    "Note"                  # Classification (e.g., GV_to_M2, rM2_to_blasto)
]

In [None]:
# ==========================================
# 2. HELPER FUNCTIONS
# ==========================================
def extract_pdb_info(raw_string):
    """Split a raw cell of the form ``"<name>.pdb<trailing text>"`` in two.

    Parameters
    ----------
    raw_string : object
        Raw cell value. It is converted with ``str`` before parsing; missing
        values (as detected by ``pd.isna``) are accepted.

    Returns
    -------
    tuple of (str, str)
        ``(name, remainder)`` where ``name`` is the text in front of the first
        ``.pdb`` occurrence and ``remainder`` is whatever follows it, both
        stripped of surrounding whitespace. Missing input yields ``("", "")``;
        text without a ``.pdb`` marker yields ``(text, "")``.
    """
    if pd.isna(raw_string):
        return "", ""

    text = str(raw_string).strip()

    # The extension sits outside the capture groups, so it is dropped whatever
    # its letter case (".pdb", ".PDB", ".Pdb", ...). The lazy first group makes
    # the split happen at the first occurrence of the extension.
    match = re.search(r'(.*?)\.pdb(.*)', text, re.IGNORECASE)
    if match is None:
        return text, ""

    return match.group(1).strip(), match.group(2).strip()

def map_gv_numeric(stage_string):
    """Translate a maturation-stage label into its numeric GV code.

    The label is upper-cased and searched for stage tokens: a label containing
    "MII" maps to 2, otherwise one containing "MI" maps to 1, otherwise one
    containing "GV" maps to 0.

    Parameters
    ----------
    stage_string : str or None
        Stage text; may be empty, ``None`` or NaN.

    Returns
    -------
    int or float
        2, 1 or 0 as described above; ``np.nan`` when the label is empty,
        missing, or contains none of the tokens.
    """
    if not stage_string or pd.isna(stage_string):
        return np.nan

    label = stage_string.upper()

    # Order matters: "MI" is a substring of "MII", so "MII" is tested first.
    for token, code in (("MII", 2), ("MI", 1), ("GV", 0)):
        if token in label:
            return code

    return np.nan

def clean_time(val):
    """Convert a raw timing cell to a float, or ``np.nan`` when unusable.

    Parameters
    ----------
    val : object
        Raw cell value. It is converted with ``str``; a decimal comma is
        accepted (``"4,8"`` is read as ``4.8``).

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the value is missing, is the
        placeholder ``"-"``, is blank, or is not numeric text.
    """
    if pd.isna(val):
        return np.nan

    text = str(val).strip().replace(',', '.')
    if text in ('-', ''):
        return np.nan

    try:
        return float(text)
    except ValueError:
        # Only a parse failure means "not a number"; anything else
        # (e.g. KeyboardInterrupt) must not be swallowed here.
        return np.nan

In [None]:
# ==========================================
# 3. PROCESSING DATASET 1 - SHEET 1 (IVM/GVs)
# ==========================================
# Builds `df1_final` (columns = NEW_COLUMNS) from the first sheet of
# DATASET_1_PATH. On any error the message is printed and `df1_final`
# falls back to an empty DataFrame so the later sections can still run.
print("--- Processing Dataset 1: Sheet 1 (GVs) ---")
try:
    df1 = pd.read_excel(DATASET_1_PATH, sheet_name=0, header=None)

    # The sheet is read without a header row, so the ID column is located by
    # content: the first column in which more than five cells mention ".pdb".
    pdb_col_idx = next(
        (
            col for col in df1.columns
            if df1[col].astype(str).str.contains(r'\.pdb', case=False, na=False).sum() > 5
        ),
        -1,
    )
    if pdb_col_idx == -1:
        raise ValueError("Column .pdb not found in Sheet 1")

    # Keep the ID column plus the four columns to its right, then only the
    # rows whose first cell actually mentions ".pdb".
    df1_subset = df1.iloc[:, pdb_col_idx : pdb_col_idx + 5].copy()
    df1_subset.columns = ['raw_combined', 'raw_well', 'raw_gvbd', 'raw_pb', 'raw_blasto']
    is_pdb_row = df1_subset['raw_combined'].astype(str).str.contains(r'\.pdb', case=False, na=False)
    df1_clean = df1_subset.loc[is_pdb_row].copy()

    # 1. Split the combined cell into (name, stage text)
    extracted_data = df1_clean['raw_combined'].apply(extract_pdb_info)
    df1_clean['clean_name'] = extracted_data.apply(lambda pair: pair[0])
    df1_clean['stage_info'] = extracted_data.apply(lambda pair: pair[1])

    # 2. Stage code and sibling flag (every row of this sheet gets sibling = 0)
    df1_clean['sibling'] = 0
    df1_clean['GV'] = df1_clean['stage_info'].apply(map_gv_numeric)

    # 3. Well number, timings and blastocyst outcome.
    #    Non-numeric wells and outcomes are coerced to 0.
    df1_clean['well_clean'] = (
        pd.to_numeric(df1_clean['raw_well'], errors='coerce')
        .fillna(0)
        .astype(int)
        .astype(str)
    )
    df1_clean['BLASTO_NY'] = (
        pd.to_numeric(df1_clean['raw_blasto'], errors='coerce')
        .fillna(0)
        .astype(int)
    )
    df1_clean['timing_GVBD'] = df1_clean['raw_gvbd'].apply(clean_time)
    df1_clean['timing_extrusion_PB'] = df1_clean['raw_pb'].apply(clean_time)

    # 4. Classification note derived from the GV code
    def get_ivm_note(gv_code):
        """Return the note label for a GV code (0, 1, 2), else "Unknown_IVM"."""
        labels = {0: "GV_to_GV", 1: "GV_to_M1", 2: "GV_to_M2"}
        return labels.get(gv_code, "Unknown_IVM")

    df1_clean['Note'] = df1_clean['GV'].apply(get_ivm_note)

    # 5. Unique ID: "<name>_<well>"
    df1_clean['dish_well'] = df1_clean['clean_name'] + "_" + df1_clean['well_clean']

    df1_final = df1_clean[NEW_COLUMNS]
    print(f"Sheet 1 processed: {len(df1_final)} rows.")
except Exception as e:
    print(f"Error Sheet 1: {e}")
    df1_final = pd.DataFrame()

# ==========================================
# 4. PROCESSING DATASET 1 - SHEET 2 (Native MII)
# ==========================================
# Builds `df2_final` (columns = NEW_COLUMNS) from the second sheet of
# DATASET_1_PATH. On any error the message is printed and `df2_final`
# falls back to an empty DataFrame so the later sections can still run.
print("--- Processing Dataset 1: Sheet 2 (Siblings) ---")
try:
    df2 = pd.read_excel(DATASET_1_PATH, sheet_name=1)

    # Columns are located by a case-insensitive keyword in their header;
    # the first matching column wins.
    cols = df2.columns.astype(str).str.lower()
    found_columns = {}
    for keyword in ('pdb', 'well', 'blasto'):
        matches = df2.columns[cols.str.contains(keyword)]
        if len(matches) == 0:
            # Explicit message instead of an opaque "index 0 is out of bounds"
            raise ValueError(f"No column containing '{keyword}' found in Sheet 2")
        found_columns[keyword] = matches[0]
    c_pdb = found_columns['pdb']
    c_well = found_columns['well']
    c_blasto = found_columns['blasto']

    # Rows without a PDB name cannot produce a valid ID. Without this filter a
    # missing cell would be stringified to "nan" and yield IDs such as "nan_0".
    pdb_text = df2[c_pdb].astype(str).str.strip()
    has_name = df2[c_pdb].notna() & pdb_text.ne('')
    df2_clean = df2.loc[has_name].copy()
    skipped_rows = len(df2) - len(df2_clean)
    if skipped_rows:
        print(f"Sheet 2: skipped {skipped_rows} rows without a PDB name.")

    # 1. Cleaning
    # The extension is removed regardless of letter case, matching the
    # case-insensitive handling used for Sheet 1.
    df2_clean['clean_name'] = (
        df2_clean[c_pdb]
        .astype(str)
        .str.replace(r'\.pdb', '', case=False, regex=True)
        .str.strip()
    )
    # Non-numeric wells are coerced to 0
    df2_clean['well_clean'] = (
        pd.to_numeric(df2_clean[c_well], errors='coerce')
        .fillna(0)
        .astype(int)
        .astype(str)
    )

    # 2. Logic: every row of this sheet is a native MII sibling
    df2_clean['GV'] = -1
    df2_clean['sibling'] = 1
    # Non-numeric outcomes are coerced to 0
    df2_clean['BLASTO_NY'] = pd.to_numeric(df2_clean[c_blasto], errors='coerce').fillna(0).astype(int)
    df2_clean['timing_GVBD'] = np.nan
    df2_clean['timing_extrusion_PB'] = np.nan

    # 3. Generate Note
    # Logic: M2 (Native) -> Blasto status
    df2_clean['Note'] = df2_clean['BLASTO_NY'].apply(lambda x: "M2_to_blasto" if x == 1 else "M2_to_no_blasto")

    # 4. Create ID: "<name>_<well>"
    df2_clean['dish_well'] = df2_clean['clean_name'] + "_" + df2_clean['well_clean']

    df2_final = df2_clean[NEW_COLUMNS]
    print(f"Sheet 2 processed: {len(df2_final)} rows.")
except Exception as e:
    print(f"Error Sheet 2: {e}")
    df2_final = pd.DataFrame()

# ==========================================
# 5. PROCESSING DATASET 2 (Rescue MII)
# ==========================================
# Builds `df3_final` (columns = NEW_COLUMNS) from the first column of
# DATASET_2_PATH. That column mixes section headers and IDs: a cell
# containing "BLASTO" sets the outcome for the IDs that follow it.
# On any error the message is printed and `df3_final` falls back to an
# empty DataFrame.
print("--- Processing Dataset 2 (Rescue MII) ---")
try:
    raw_d2 = pd.read_excel(DATASET_2_PATH, header=None)
    raw_list = raw_d2.iloc[:, 0].dropna().astype(str).tolist()

    parsed_rows = []
    current_blasto_status = None  # unknown until the first header is seen

    for item in raw_list:
        clean_item = item.strip()
        upper_item = clean_item.upper()

        # Header rows: "NO BLASTO" -> 0, any other text with "BLASTO" -> 1
        if "BLASTO" in upper_item:
            current_blasto_status = 0 if "NO BLASTO" in upper_item else 1
            continue

        # Anything before the first header has no known outcome
        if current_blasto_status is None:
            continue

        new_id = re.sub(r'_wells_', '_', clean_item, flags=re.IGNORECASE)

        # Only entries containing "D20" are kept as IDs
        if 'D20' not in new_id:
            continue

        if current_blasto_status == 1:
            note_val = "rM2_to_blasto"
        else:
            note_val = "rM2_to_no_blasto"

        parsed_rows.append(dict(
            dish_well=new_id,
            BLASTO_NY=current_blasto_status,
            GV=3,  # Code for Rescue
            sibling=0,
            timing_GVBD=np.nan,
            timing_extrusion_PB=np.nan,
            Note=note_val,
        ))

    df3_final = pd.DataFrame(parsed_rows)

    # With no parsed rows the frame has no columns; add them as NaN so the
    # selection below works.
    for missing_col in [name for name in NEW_COLUMNS if name not in df3_final.columns]:
        df3_final[missing_col] = np.nan

    df3_final = df3_final[NEW_COLUMNS]
    print(f"Dataset 2 processed: {len(df3_final)} rows.")

except Exception as e:
    print(f"Error Dataset 2: {e}")
    df3_final = pd.DataFrame()

# ==========================================
# 6. MERGE AND EXPORT
# ==========================================
# Concatenates the three per-source frames, drops unusable and duplicate IDs,
# writes the result to OUTPUT_PATH and prints a short summary.

# A source that failed to load (or produced no rows) is an empty frame.
# Leave those out of the concat, and stop with a clear message when nothing
# was loaded at all instead of failing later with KeyError: 'dish_well'.
loaded_frames = [frame for frame in (df1_final, df2_final, df3_final) if not frame.empty]
if not loaded_frames:
    raise ValueError("No source produced any rows; nothing to merge or export.")

df_total = pd.concat(loaded_frames, ignore_index=True)

# Discard rows whose ID is missing or too short (3 characters or fewer)
df_total = df_total[df_total['dish_well'].notna()]
df_total = df_total[df_total['dish_well'].astype(str).str.len() > 3]

# Handle Duplicates: the first occurrence wins, i.e. frames earlier in the
# concat order take precedence.
duplicates = df_total['dish_well'].duplicated(keep='first')
if duplicates.sum() > 0:
    print(f"\nWarning: {duplicates.sum()} duplicate IDs found. Keeping first occurrence.")
    df_total = df_total[~duplicates]

# os.path.dirname() is "" for a bare file name, and os.makedirs("") raises,
# so only create the folder when the path actually has a directory part.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_total.to_csv(OUTPUT_PATH, index=False)

print("\n=== DONE ===")
print(f"File saved to: {OUTPUT_PATH}")

print("\nDistribution of 'Note':")
print(df_total['Note'].value_counts())

print("\nPreview:")
print(df_total.head())

--- Processing Dataset 1: Sheet 1 (GVs) ---
Sheet 1 processed: 96 rows.
--- Processing Dataset 1: Sheet 2 (Siblings) ---
Sheet 2 processed: 60 rows.
--- Processing Dataset 2 (Rescue MII) ---
Dataset 2 processed: 51 rows.


=== DONE ===
File saved to: /home/phd2/Scrivania/CorsoRepo/embryo_valencia/dataset_final_merged.csv

Distribution of 'Note':
Note
GV_to_M2            47
rM2_to_no_blasto    41
M2_to_blasto        40
GV_to_GV            36
M2_to_no_blasto     20
GV_to_M1            12
rM2_to_blasto        8
Name: count, dtype: int64

Preview:
                  dish_well  BLASTO_NY  GV  sibling  timing_GVBD  \
0  D2016.11.14_S1895_I106_6          1   2        0          4.8   
1  D2016.11.14_S1895_I106_7          0   2        0          4.8   
2  D2017.09.15_S0770_I631_2          0   2        0          NaN   
3  D2017.09.15_S0770_I631_3          1   2        0          NaN   
4  D2017.09.15_S0770_I631_4          0   2        0          1.2   

   timing_extrusion_PB      Note  
0     

In [12]:
# Show rows at positions 10-19 of the merged table under a "Preview:" heading
print("\nPreview:", df_total.iloc[10:20], sep="\n")


Preview:
                        dish_well  BLASTO_NY  GV  sibling  timing_GVBD  \
10  D2019.02.15_S012161_I0631_D_1          0   2        0          0.6   
11  D2019.02.15_S012161_I0631_D_2          0   2        0          5.6   
12  D2019.02.15_S012161_I0631_D_5          1   2        0          4.6   
13  D2019.02.15_S012161_I0631_D_6          0   2        0          NaN   
14   D2019.03.14_S00126_I0758_D_3          0   2        0          7.1   
15   D2019.03.14_S00126_I0758_D_6          0   2        0          9.6   
16   D2019.03.14_S00126_I0758_D_7          0   2        0          9.2   
17   D2019.05.26_S01883_I0406_D_9          0   2        0          1.4   
18  D2019.05.26_S01883_I0406_D_10          0   2        0          2.4   
19  D2019.05.26_S01883_I0406_D_11          0   2        0          0.9   

    timing_extrusion_PB      Note  
10                 13.8  GV_to_M2  
11                 20.8  GV_to_M2  
12                 20.6  GV_to_M2  
13                 17.3  GV_to_