## Improve original dataset

In [5]:
import pandas as pd
import numpy as np
import os
import re

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Directory that holds the input workbook and receives the output CSV.
# It can be overridden with the EMBRYO_DATA_DIR environment variable so the
# notebook also runs on machines other than the original author's; when the
# variable is unset the original location is used unchanged.
DATA_DIR = os.environ.get(
    "EMBRYO_DATA_DIR", "/home/phd2/Scrivania/CorsoRepo/embryo_valencia"
)
DATASET_PATH = os.path.join(DATA_DIR, "dataset.xlsx")
DATASET_FINAL_PATH = os.path.join(DATA_DIR, "dataset_final.csv")

# Columns (and their order) written to the final CSV.
NEW_COLUMNS = [
    "dish_well",            # unique ID
    "BLASTO_NY",            # 0/1
    "GV",                   # -1 = native MII (Sheet 2), 0-2 = IVM (Sheet 1)
    "sibling",              # 1 = native MII (Sheet 2), 0 = IVM (Sheet 1)
    "timing_GVBD",          # hours
    "timing_extrusion_PB"   # hours
]

In [6]:
# ==========================================
# 2. FUNZIONI UTILI
# ==========================================

def extract_pdb_info(raw_string):
    """
    Split a raw cell value into the PDB file name and the text that follows it.

    The value is cut at the first ``.pdb`` extension, matched case-insensitively
    (``.pdb``, ``.PDB``, ``.Pdb``, ...). The extension itself is discarded.

    Args:
        raw_string: Cell value. Anything ``pd.isna`` reports as missing is accepted.

    Returns:
        A ``(name, remainder)`` tuple of stripped strings:
        - ``("", "")`` when the value is missing;
        - ``(text, "")`` when no ``.pdb`` extension is present;
        - otherwise the text before the extension and the text after it.
    """
    if pd.isna(raw_string):
        return "", ""
    s = str(raw_string).strip()
    # Capture the name *before* the extension so that its removal is as
    # case-insensitive as the match itself.
    match = re.search(r'(.*?)\.pdb(.*)', s, re.IGNORECASE)
    if match is None:
        return s, ""
    return match.group(1).strip(), match.group(2).strip()

def map_gv_numeric(stage_string):
    """
    Translate a maturation-stage label into its numeric code.

    The lookup is a case-insensitive substring search:
        0: GV to GV
        1: GV to MI
        2: GV to MII

    Returns ``np.nan`` when the label is empty, missing, or contains none of
    the known stage tokens.
    """
    if not stage_string or pd.isna(stage_string):
        return np.nan

    label = stage_string.upper()
    # Order matters: "MII" contains "MI", so the longer token is tested first.
    stage_codes = (("MII", 2), ("MI", 1), ("GV", 0))
    for token, code in stage_codes:
        if token in label:
            return code
    return np.nan

def clean_time(val):
    """
    Convert a raw timing cell to a float.

    A decimal comma is accepted (``"4,8"`` -> ``4.8``). Missing values, the
    placeholders ``"-"`` and ``""``, and any text that cannot be parsed as a
    number all yield ``np.nan``.
    """
    if pd.isna(val):
        return np.nan
    s = str(val).strip().replace(',', '.')
    if s in ('-', ''):
        return np.nan
    try:
        return float(s)
    except ValueError:
        # Only a parse failure means "not a number"; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        return np.nan


In [7]:
# ==========================================
# 3. PROCESS SHEET 1 (IVM / GVs)
# ==========================================
print("--- Elaborazione Sheet 1 (GVs) ---")
try:
    df1 = pd.read_excel(DATASET_PATH, sheet_name=0, header=None)

    # The sheet is read without a header row, so the PDB column is located by
    # content: the first column with more than 5 cells containing ".pdb".
    pdb_col_idx = next(
        (
            col
            for col in df1.columns
            if df1[col].astype(str).str.contains(r'\.pdb', case=False, na=False).sum() > 5
        ),
        -1,
    )
    if pdb_col_idx == -1:
        raise ValueError("Colonna .pdb non trovata in Sheet 1")

    # Keep the PDB column and the four columns to its right, then retain only
    # the rows whose first cell actually mentions a ".pdb" file.
    df1_subset = df1.iloc[:, pdb_col_idx : pdb_col_idx + 5].copy()
    df1_subset.columns = ['raw_combined', 'raw_well', 'raw_gvbd', 'raw_pb', 'raw_blasto']
    is_pdb_row = df1_subset['raw_combined'].astype(str).str.contains(r'\.pdb', case=False, na=False)
    df1_clean = df1_subset[is_pdb_row].copy()

    # 1. Split the combined cell into file name and stage text
    extracted_data = df1_clean['raw_combined'].apply(extract_pdb_info)
    df1_clean['clean_name'] = extracted_data.map(lambda pair: pair[0])
    df1_clean['stage_info'] = extracted_data.map(lambda pair: pair[1])

    # 2. GV code (0, 1, 2) derived from the stage text
    df1_clean['GV'] = df1_clean['stage_info'].apply(map_gv_numeric)

    # 3. Sibling flag (rule: Sheet 1 = 0)
    df1_clean['sibling'] = 0

    # 4. Remaining conversions; non-numeric well / blasto cells become 0
    well_numbers = pd.to_numeric(df1_clean['raw_well'], errors='coerce')
    df1_clean['well_clean'] = well_numbers.fillna(0).astype(int).astype(str)
    df1_clean['timing_GVBD'] = df1_clean['raw_gvbd'].apply(clean_time)
    df1_clean['timing_extrusion_PB'] = df1_clean['raw_pb'].apply(clean_time)
    blasto_numbers = pd.to_numeric(df1_clean['raw_blasto'], errors='coerce')
    df1_clean['BLASTO_NY'] = blasto_numbers.fillna(0).astype(int)

    df1_final = df1_clean[
        ['clean_name', 'well_clean', 'GV', 'sibling', 'BLASTO_NY', 'timing_GVBD', 'timing_extrusion_PB']
    ]
    print(f"Sheet 1 processato: {len(df1_final)} righe.")

except Exception as e:
    # Best effort: report the failure and continue with an empty frame.
    print(f"Errore Sheet 1: {e}")
    df1_final = pd.DataFrame()

# ==========================================
# 4. PROCESS SHEET 2 (MII / Siblings)
# ==========================================
print("--- Elaborazione Sheet 2 (Siblings) ---")
try:
    df2 = pd.read_excel(DATASET_PATH, sheet_name=1)

    # Locate the required columns by a case-insensitive keyword in the header,
    # failing with an explicit message when one of them is absent.
    cols = df2.columns.astype(str).str.lower()
    found = {}
    for keyword in ('pdb', 'well', 'blasto'):
        hits = df2.columns[cols.str.contains(keyword)]
        if len(hits) == 0:
            raise ValueError(f"Colonna '{keyword}' non trovata in Sheet 2")
        found[keyword] = hits[0]
    c_pdb, c_well, c_blasto = found['pdb'], found['well'], found['blasto']

    # Drop rows without a PDB name up front: str(NaN) is the literal "nan",
    # which would otherwise yield a bogus "nan_<well>" identifier downstream.
    df2_clean = df2[df2[c_pdb].notna()].copy()

    # 1. Clean the name (extension removed case-insensitively, as for Sheet 1)
    df2_clean['clean_name'] = (
        df2_clean[c_pdb]
        .astype(str)
        .str.replace(r'\.pdb', '', case=False, regex=True)
        .str.strip()
    )
    # Missing or non-numeric wells become "0"
    df2_clean['well_clean'] = pd.to_numeric(df2_clean[c_well], errors='coerce').fillna(0).astype(int).astype(str)

    # 2. GV code (rule: Sheet 2 = -1)
    df2_clean['GV'] = -1

    # 3. Sibling flag (rule: Sheet 2 = 1)
    df2_clean['sibling'] = 1

    # 4. Remaining conversions; this sheet provides no timing values
    df2_clean['BLASTO_NY'] = pd.to_numeric(df2_clean[c_blasto], errors='coerce').fillna(0).astype(int)
    df2_clean['timing_GVBD'] = np.nan
    df2_clean['timing_extrusion_PB'] = np.nan

    df2_final = df2_clean[['clean_name', 'well_clean', 'GV', 'sibling', 'BLASTO_NY', 'timing_GVBD', 'timing_extrusion_PB']]
    print(f"Sheet 2 processato: {len(df2_final)} righe.")

except Exception as e:
    # Best effort: report the failure and continue with an empty frame.
    print(f"Errore Sheet 2: {e}")
    df2_final = pd.DataFrame()

# ==========================================
# 5. MERGE AND SAVE
# ==========================================

# A sheet whose processing failed is left as an empty frame; skip it so the
# concatenation only sees real data, and stop with a clear message when
# nothing at all could be processed.
processed_frames = [frame for frame in (df1_final, df2_final) if not frame.empty]
if not processed_frames:
    raise RuntimeError("Nessun dato disponibile: elaborazione di entrambi gli sheet fallita.")
df_total = pd.concat(processed_frames, ignore_index=True)

# Build the unique ID
df_total['dish_well'] = df_total['clean_name'] + "_" + df_total['well_clean']

# Final selection: drop rows that have no dish name. Testing the name itself
# (rather than the length of the combined ID) also removes nameless rows whose
# well number happens to be long, and keeps legitimately short IDs.
has_name = df_total['clean_name'].fillna("").str.strip().ne("")
df_output = df_total.loc[has_name, NEW_COLUMNS]

# dish_well is meant to be unique: surface any violation instead of hiding it.
duplicate_ids = int(df_output['dish_well'].duplicated().sum())
if duplicate_ids:
    print(f"ATTENZIONE: {duplicate_ids} valori di dish_well duplicati.")

# Save (a path without a directory component needs no makedirs call)
output_dir = os.path.dirname(DATASET_FINAL_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_output.to_csv(DATASET_FINAL_PATH, index=False)

print("\n=== FATTO ===")
print(f"File salvato in: {DATASET_FINAL_PATH}")
print("\nDistribuzione GV (dovresti vedere -1, 0, 1, 2):")
print(df_output['GV'].value_counts())
print("\nDistribuzione Sibling (dovresti vedere 0 e 1):")
print(df_output['sibling'].value_counts())
print("\nAnteprima dati:")
print(df_output.head())

--- Elaborazione Sheet 1 (GVs) ---
Sheet 1 processato: 96 righe.
--- Elaborazione Sheet 2 (Siblings) ---
Sheet 2 processato: 60 righe.

=== FATTO ===
File salvato in: /home/phd2/Scrivania/CorsoRepo/embryo_valencia/dataset_final.csv

Distribuzione GV (dovresti vedere -1, 0, 1, 2):
GV
-1    60
 2    47
 0    37
 1    12
Name: count, dtype: int64

Distribuzione Sibling (dovresti vedere 0 e 1):
sibling
0    96
1    60
Name: count, dtype: int64

Anteprima dati:
                  dish_well  BLASTO_NY  GV  sibling  timing_GVBD  \
0  D2016.11.14_S1895_I106_6          1   2        0          4.8   
1  D2016.11.14_S1895_I106_7          0   2        0          4.8   
2  D2017.09.15_S0770_I631_2          0   2        0          NaN   
3  D2017.09.15_S0770_I631_3          1   2        0          NaN   
4  D2017.09.15_S0770_I631_4          0   2        0          1.2   

   timing_extrusion_PB  
0                 20.3  
1                 16.1  
2                 20.7  
3                 16.9  
4    