In [None]:
# 7. FINAL LABELING (REVISED: COMPLETE SOURCES & RELAXED BOUNDARIES)
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

print("Performing Final Labeling (Mode: 4 Sources + Relaxed FLAME)...")

# Guard clause: this cell needs the merged frame from the earlier steps.
if 'df_gabungan' not in locals():
    raise ValueError("df_gabungan not found!")
# Work on a copy so the merged frame is only replaced once labeling finishes.
df_final = df_gabungan.copy()

# 1. Ensure Numeric Types
# Unparseable entries become NaN (errors='coerce'); absent columns are skipped.
cols_numeric = ['evolstage_flame', 'phot_g_mean_mag', 'parallax', 'bp_rp0', 'abs_G0']
present_numeric = [name for name in cols_numeric if name in df_final.columns]
for name in present_numeric:
    df_final[name] = pd.to_numeric(df_final[name], errors='coerce')

# --- A. SOURCE-BASED LABELING LOGIC (HIGHEST PRIORITY) ---
# Rows that come from a dedicated catalogue/query are labelled by provenance
# alone; everything else starts as 'Other' and is handled further down.

df_final['evolutionary_phase'] = 'Other'
source_str = df_final['dataset_source'].astype(str).str.lower()

# Build every provenance mask first (all matching is on the lower-cased name).
# 1. Supergiant (Scientific Papers)
mask_sg = source_str.str.contains('supergiant|scientific|pantaleoni|messineo|hohle', regex=True)
# 2. White Dwarf (Gentile Fusillo Catalog)
mask_wd = source_str.str.contains('gentile|wd', regex=True)
# 3. Giant (Specific Giant Query)
mask_src_giant = source_str.str.contains('giant_query', regex=True)

# Apply in this fixed order: when a source name matches several patterns,
# the later entry overwrites the earlier one.
provenance_labels = (
    (mask_sg, 'Supergiant'),
    (mask_wd, 'White Dwarf'),
    (mask_src_giant, 'Giant'),
)
for provenance_mask, phase_name in provenance_labels:
    df_final.loc[provenance_mask, 'evolutionary_phase'] = phase_name

# --- B. FLAME-BASED LOGIC (FOR MAIN DATASET) ---
# Only rows still marked 'Other' after the source-based pass are eligible.
mask_target = df_final['evolutionary_phase'].eq('Other')

# Range Definitions (inclusive bounds): 100-419 MS, 420-489 Sub-Giant, >=490 Giant.
# The Giant threshold is 490 so it cannot overlap the Sub-Giant upper bound 489.
flame_stage = df_final['evolstage_flame']

# A. MAIN SEQUENCE (Extended to 419)
mask_ms = flame_stage.ge(100) & flame_stage.le(419)
# B. SUB-GIANT (Narrowed: 420 - 489)
mask_sub = flame_stage.ge(420) & flame_stage.le(489)
# C. GIANT (Starts from 490 / Base of RGB)
mask_giant_flame = flame_stage.ge(490)

# NaN stage codes compare False everywhere and therefore stay 'Other'.
flame_labels = (
    (mask_ms, 'Main Sequence'),
    (mask_sub, 'Sub-Giant'),
    (mask_giant_flame, 'Giant'),
)
for stage_mask, phase_name in flame_labels:
    df_final.loc[mask_target & stage_mask, 'evolutionary_phase'] = phase_name

# --- C. PHYSICS RESCUE (SAFETY NET) ---
# Filling in data where FLAME code is NaN but HR Diagram position is distinct
# Only rows still labelled 'Other' are touched. The two rescue steps are
# order-dependent: the Main Sequence mask is evaluated after the Giant rescue
# has been written back to the frame.

# Rescue Giant (Bright & Red)
# Requires abs_G0 < 3.0 AND bp_rp0 > 1.0; a NaN in either column compares
# False, so such rows are left as 'Other'.
mask_rescue_giant = (
    (df_final['evolutionary_phase'] == 'Other') &
    (df_final['abs_G0'] < 3.0) &
    (df_final['bp_rp0'] > 1.0)
)
df_final.loc[mask_rescue_giant, 'evolutionary_phase'] = 'Giant'

# Rescue MS (Remaining faint stars)
# Open interval 3.5 < abs_G0 < 15.0 with no colour constraint.
# NOTE(review): unlabelled rows with 3.0 <= abs_G0 <= 3.5, and bright rows with
# bp_rp0 <= 1.0, are matched by neither rescue and get dropped below - confirm
# that this gap is intended.
mask_rescue_ms = (
    (df_final['evolutionary_phase'] == 'Other') &
    (df_final['abs_G0'] > 3.5) &
    (df_final['abs_G0'] < 15.0)
)
df_final.loc[mask_rescue_ms, 'evolutionary_phase'] = 'Main Sequence'

# Cleanup: Drop remaining 'Other' (usually noise/errors)
# The surviving rows keep their original index labels (no reset_index here).
df_final = df_final[df_final['evolutionary_phase'] != 'Other'].copy()

# --- D. REGRESSION TARGET UNIFICATION (FINAL MERGE) ---
print("\nUnifying Regression Targets (Mass & Age)...")

# White-dwarf rows take their targets from the WD catalogue columns instead
# of FLAME; the label column is not modified in this section.
mask_wd_final = df_final['evolutionary_phase'] == 'White Dwarf'

# (unified target, FLAME source column, white-dwarf source column)
target_sources = (
    ('mass_final', 'mass_flame', 'mass_wd'),
    ('age_final', 'age_flame', 'age_wd_cooling'),
)
for target, flame_col, wd_col in target_sources:
    # Start empty, then fill from FLAME (primary source for non-WD stars).
    df_final[target] = np.nan
    df_final[target] = df_final[target].fillna(df_final[flame_col])
    # WD rows are overwritten unconditionally, even where the WD value is NaN.
    df_final.loc[mask_wd_final, target] = df_final.loc[mask_wd_final, wd_col]

# Check label availability: True only when both targets are present.
df_final['has_labels'] = df_final[['mass_final', 'age_final']].notnull().all(axis=1)

# --- E. REPORT & VISUALIZATION ---
# Print the per-class row counts, draw a quick-look HR diagram, then publish
# the labelled frame back to `df_gabungan` for the following cells.
print("\nCLASS DISTRIBUTION REPORT (FINAL):")
print("-" * 40)
print(df_final['evolutionary_phase'].value_counts())
print("-" * 40)

# HR Diagram Visualization
plt.figure(figsize=(10, 8))
hue_order = ['White Dwarf', 'Main Sequence', 'Sub-Giant', 'Giant', 'Supergiant']
palette_dict = {
    'Main Sequence': 'tab:blue',
    'Sub-Giant': 'orange',
    'Giant': 'tab:red',
    'Supergiant': 'purple',
    'White Dwarf': 'cyan'
}

# Sampling for faster plotting: at most 10,000 rows per class.
# The draw is seeded so that re-running the cell reproduces the same figure,
# matching the seeded sampling used by the other plotting cells.
sample_df = df_final.groupby('evolutionary_phase').apply(
    lambda x: x.sample(min(len(x), 10000), random_state=42)
).reset_index(drop=True)

sns.scatterplot(
    data=sample_df,
    x='bp_rp0', y='abs_G0',
    hue='evolutionary_phase', hue_order=hue_order,
    palette=palette_dict, s=10, alpha=0.6, edgecolor='none'
)

# Smaller magnitudes are brighter, so the y axis is flipped (bright on top).
plt.gca().invert_yaxis()
plt.title("HR Diagram: 4 Integrated Datasets")
plt.xlabel("Color ($G_{BP} - G_{RP}$)")
plt.ylabel("Absolute Magnitude ($M_G$)")
plt.legend(title='Evolutionary Phase', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Update df_gabungan
df_gabungan = df_final.copy()

In [None]:
# 8. DATA VISUALIZATION: FINAL VISUALIZATION SUITE (SCIENTIFIC PUBLICATION QUALITY)
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os
import shutil
from astropy.coordinates import SkyCoord
from astropy import units as u
from mpl_toolkits.mplot3d import Axes3D

# --- 0. JOURNAL AESTHETICS CONFIGURATION ---
# One batched update of the global Matplotlib defaults so every figure in this
# cell uses the same publication-style look.
plt.rcParams.update({
    # Serif fonts (formal); the second entry is listed as an alternative
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif'],
    # Thicker axis lines and major ticks
    'axes.linewidth': 1.2,
    'xtick.major.width': 1.2,
    'ytick.major.width': 1.2,
    # Text sizes
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'legend.fontsize': 11,
    # Disable grid by default
    'axes.grid': False,
})

# --- 1. SETUP & LOAD DATA ---
# Recreate the output folder from scratch so it only contains this run's files.
output_dir = "/kaggle/working/Final_Thesis_Visualizations"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)
print(f"Output Folder: {output_dir}")

print("Loading Data & Coordinates...")
if 'df_gabungan' in locals():
    # Plot from a copy; the in-memory master frame is left untouched.
    df = df_gabungan.copy()
else:
    try:
        df = pd.read_csv("FINAL_MASTER_DATASET.csv")
    except Exception as exc:
        # Still best-effort (fall back to an empty frame), but no longer a bare
        # `except:` - KeyboardInterrupt/SystemExit propagate and the actual
        # cause is reported instead of being discarded.
        print("Error: Dataset not found.")
        print(f"   -> Reason: {exc}")
        df = pd.DataFrame()

if not df.empty:
    # 1. TRANSLATE LABELS & RENAME COLUMN
    # Every label spelling that can reach this cell is mapped onto the class
    # names used by `palette_dict` / `hue_order` below.
    label_translation = {
        'Deret Utama': 'Main Sequence',
        'Sub-Raksasa': 'Subgiant',
        'Raksasa': 'Red Giant',
        'Maharaksasa': 'Supergiant',
        'Katai Putih': 'White Dwarf',
        # English spellings written by the labeling cell; without these two
        # entries those classes have no palette colour in the plots below.
        'Sub-Giant': 'Subgiant',
        'Giant': 'Red Giant',
    }

    # Create a standardized English column
    target_col = 'fase_evolusi_final'
    if target_col in df.columns:
        df['evolutionary_phase'] = df[target_col].replace(label_translation)
    elif 'evolutionary_phase' in df.columns:
        # Already renamed upstream: still normalise the spelling.
        df['evolutionary_phase'] = df['evolutionary_phase'].replace(label_translation)
    else:
        df['evolutionary_phase'] = 'Unknown'

    # 2. COORDINATE CALCULATIONS
    # Keep rows with parallax > 0.1 only; the distance is taken as 1000/parallax
    # (assumes parallax is in milliarcseconds - TODO confirm).
    df_geo = df[df['parallax'] > 0.1].copy()
    c = SkyCoord(
        ra=df_geo['ra'].values*u.deg,
        dec=df_geo['dec'].values*u.deg,
        distance=(1000/df_geo['parallax'].values)*u.pc,
        frame='icrs'
    ).galactic

    # Cartesian components in the galactic frame, plus longitude/latitude in
    # radians (longitude wrapped at 180 deg) for the Aitoff projections.
    df_geo['X'], df_geo['Y'], df_geo['Z'] = c.cartesian.x.value, c.cartesian.y.value, c.cartesian.z.value
    df_geo['l_rad'], df_geo['b_rad'] = c.l.wrap_at(180*u.deg).radian, c.b.radian

    # The geometric columns live only in `df_geo`; `df` is not widened.

    print(f"Data Ready: {len(df):,} rows.")

    # 3. SCIENTIFIC PALETTE (High Contrast)
    palette_dict = {
        'Main Sequence': 'tab:blue',
        'Subgiant': 'tab:orange',
        'Red Giant': 'tab:red',
        'Supergiant': 'tab:purple',
        'White Dwarf': 'tab:cyan'
    }
    hue_order = ['White Dwarf', 'Main Sequence', 'Subgiant', 'Red Giant', 'Supergiant']

    # ==========================================================================
    # PLOT 1: HR DIAGRAM (CLEAN)
    # ==========================================================================
    print("[1/11] Plotting HR Diagram...")
    plt.figure(figsize=(10, 8))
    # Sample for performance if dataset is huge: at most 100,000 rows per class.
    # Seeded (like the other samplers in this cell) so the saved figure is
    # reproducible between runs.
    sample_hr = df.groupby('evolutionary_phase').apply(
        lambda x: x.sample(min(len(x), 100000), random_state=42)
    ).reset_index(drop=True)

    sns.scatterplot(
        data=sample_hr, x='bp_rp0', y='abs_G0',
        hue='evolutionary_phase', hue_order=hue_order, palette=palette_dict,
        s=3, alpha=0.6, marker='o',
        edgecolor='none', linewidth=0
    )
    # Smaller magnitudes are brighter, so the y axis is flipped.
    plt.gca().invert_yaxis()
    plt.title("Hertzsprung-Russell Diagram", fontweight='bold')
    plt.xlabel("Color Index ($G_{BP} - G_{RP}$)")
    plt.ylabel("Absolute Magnitude ($M_G$)")
    plt.legend(title="Evolutionary Phase", markerscale=3, frameon=True, framealpha=0.9, loc='upper right')
    sns.despine()
    plt.savefig(f"{output_dir}/1_HR_Diagram.png", dpi=300, bbox_inches='tight')
    plt.close()

    # ==========================================================================
    # PLOT 2: CLASS DISTRIBUTION (CLEAN)
    # ==========================================================================
    print("[2/11] Plotting Class Distribution...")
    plt.figure(figsize=(10, 6))
    # One horizontal bar per evolutionary phase, sized by its row count.
    class_counts = df['evolutionary_phase'].value_counts()

    ax = sns.barplot(
        x=class_counts.values, y=class_counts.index,
        palette=palette_dict, edgecolor='none', linewidth=0
    )

    # Log x axis so sparse classes remain visible next to the dominant one.
    ax.set_xscale('log')
    ax.set_title("Stellar Class Distribution (Log Scale)", fontweight='bold')
    ax.set_xlabel("Number of Stars ($N$)")
    # Annotate each bar with its count.
    ax.bar_label(ax.containers[0], fmt='%.0f', padding=5, fontsize=10)
    sns.despine(left=True, bottom=False)
    plt.savefig(f"{output_dir}/2_Class_Distribution.png", dpi=300, bbox_inches='tight')
    plt.close()

    # ==========================================================================
    # PLOT 3: MASS HISTOGRAM (CLEAN STEP)
    # ==========================================================================
    print("[3/11] Plotting Mass Histogram...")
    plt.figure(figsize=(10, 6))
    # Strictly positive masses only (required for the log axis). NaN compares
    # False against 0, so no separate dropna is needed.
    df_mass = df[df['mass_final'] > 0]

    sns.histplot(
        data=df_mass, x='mass_final', hue='evolutionary_phase',
        palette=palette_dict, hue_order=hue_order,
        element="step", fill=True,
        linewidth=0,
        log_scale=True, common_norm=False, alpha=0.7
    )
    plt.title("Stellar Mass Distribution", fontweight='bold')
    # Raw string: "\o" is not a valid Python escape and triggers a
    # SyntaxWarning on recent interpreters; the rendered label is unchanged.
    plt.xlabel(r"Mass ($M_{\odot}$)")
    plt.ylabel("Count")
    sns.despine()
    plt.savefig(f"{output_dir}/3_Mass_Distribution.png", dpi=300, bbox_inches='tight')
    plt.close()

    # ==========================================================================
    # PLOT 4: AGE HISTOGRAM (CLEAN STEP)
    # ==========================================================================
    # Per-class histogram of `age_final` on a log axis, saved to the output folder.
    print("[4/11] Plotting Age Histogram...")
    plt.figure(figsize=(10, 6))
    # Keep ages above 0.001 only; the comparison already excludes NaN, so the
    # trailing dropna is a no-op safeguard.
    df_age = df[df['age_final'] > 0.001].dropna(subset=['age_final'])

    # common_norm=False: each class is normalised on its own rather than jointly.
    sns.histplot(
        data=df_age, x='age_final', hue='evolutionary_phase',
        palette=palette_dict, hue_order=hue_order,
        element="step", fill=True,
        linewidth=0,
        log_scale=True, common_norm=False, alpha=0.7
    )
    plt.title("Stellar Age Distribution", fontweight='bold')
    plt.xlabel("Age (Gyr)")
    sns.despine()
    plt.savefig(f"{output_dir}/4_Age_Distribution.png", dpi=300, bbox_inches='tight')
    # Close the figure so it is not kept open for the rest of the cell.
    plt.close()

    # ==========================================================================
    # PLOT 5: GALACTIC MAP 2D (CLEAN)
    # ==========================================================================
    print("[5/11] Plotting 2D Galactic Map...")
    plt.figure(figsize=(12, 8))
    ax = plt.subplot(111, projection='aitoff')

    # Seeded subsample of at most 200,000 rows. Missing masses are coloured
    # with the sample median so every point has a numeric colour value.
    sample_map = df_geo.sample(min(len(df_geo), 200000), random_state=42)
    colors = sample_map['mass_final'].fillna(sample_map['mass_final'].median())

    sc = ax.scatter(
        sample_map['l_rad'], sample_map['b_rad'],
        c=colors, cmap='magma', s=1, alpha=0.7,
        edgecolors='none', linewidths=0
    )
    ax.set_title("Galactic Spatial Distribution (Mass Weighted)", fontweight='bold')

    # Clean tick labels
    ax.tick_params(axis='x', colors='gray', labelsize=8)
    ax.tick_params(axis='y', colors='gray', labelsize=8)

    cb = plt.colorbar(sc, ax=ax, orientation='horizontal', pad=0.1, shrink=0.5)
    # Raw string: "\o" is not a valid Python escape and triggers a
    # SyntaxWarning on recent interpreters; the rendered label is unchanged.
    cb.set_label(r"Mass ($M_{\odot}$)")
    cb.outline.set_linewidth(0)
    plt.savefig(f"{output_dir}/5_Galactic_Map_2D.png", dpi=300, bbox_inches='tight')
    plt.close()

    # ==========================================================================
    # PLOT 6: CORRELATION MATRIX (CLEAN)
    # ==========================================================================
    # Pairwise correlation heatmap of the photometric/astrometric columns and
    # the two regression targets.
    print("[6/11] Plotting Correlation Matrix...")
    plt.figure(figsize=(9, 8))
    features = ['phot_g_mean_mag', 'phot_bp_mean_mag', 'phot_rp_mean_mag', 'bp_rp0', 'parallax', 'ruwe', 'abs_G0', 'mass_final', 'age_final']

    # Calculate correlation only on available columns
    available_features = [f for f in features if f in df.columns]
    corr = df[available_features].corr()
    # Boolean upper triangle (diagonal included): those cells are hidden so each
    # pair appears once.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    sns.heatmap(
        corr, mask=mask, annot=True, fmt=".2f",
        cmap='coolwarm', vmin=-1, vmax=1,
        linewidths=0,
        square=True, cbar_kws={"shrink": .7, "ticks": [-1, -0.5, 0, 0.5, 1]}
    )
    plt.title("Feature Correlation Matrix", fontweight='bold')
    plt.savefig(f"{output_dir}/6_Correlation_Matrix.png", dpi=300, bbox_inches='tight')
    plt.close()

    # ==========================================================================
    # PLOT 7: GALACTIC MAP 3D (CLEAN)
    # ==========================================================================
    print("[7/11] Plotting 3D Galaxy Map...")
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111, projection='3d')

    # Seeded subsample of at most 10,000 stars.
    sample_3d = df_geo.sample(min(len(df_geo), 10000), random_state=42)

    # `df_geo` was sliced from `df` after the label column was created, so the
    # sample already carries its own labels. Reading them directly avoids a
    # `df.loc[index]` re-lookup, which returns extra rows (and a colour array of
    # the wrong length) whenever the merged frame has repeated index labels.
    labels_3d = sample_3d['evolutionary_phase']
    # Labels without a palette entry are drawn in neutral gray rather than
    # producing NaN colour values that Matplotlib cannot parse.
    colors_3d = labels_3d.map(palette_dict).fillna('tab:gray')

    ax.scatter(
        sample_3d['X'], sample_3d['Y'], sample_3d['Z'],
        c=colors_3d.tolist(), s=2, alpha=0.5,
        edgecolors='none', linewidths=0
    )

    # Clean 3D panes
    ax.xaxis.pane.fill = False; ax.yaxis.pane.fill = False; ax.zaxis.pane.fill = False
    ax.grid(False)

    ax.set_title("3D Heliocentric Distribution", fontweight='bold')
    ax.set_xlabel("X (pc)"); ax.set_ylabel("Y (pc)"); ax.set_zlabel("Z (pc)")
    ax.set_xlim(-3000, 3000); ax.set_ylim(-3000, 3000); ax.set_zlim(-1500, 1500)

    # Custom Legend for 3D: one proxy marker per palette entry, because the
    # scatter itself was drawn with explicit per-point colours.
    from matplotlib.lines import Line2D
    legend_elements = [Line2D([0], [0], marker='o', color='w', markerfacecolor=c, label=l, markersize=8) for l, c in palette_dict.items()]
    ax.legend(handles=legend_elements, loc='upper right', frameon=False)
    plt.savefig(f"{output_dir}/7_Galactic_Map_3D.png", dpi=300, bbox_inches='tight')
    plt.close()

    # ==========================================================================
    # PLOT 8: VERTICAL STRUCTURE (CLEAN STEP)
    # ==========================================================================
    print("[8/11] Plotting Vertical Structure...")
    plt.figure(figsize=(12, 6))
    # Stars with |Z| < 2000. `df_geo` already contains 'evolutionary_phase'
    # (it was sliced from `df` after that column was created), so the labels
    # are not re-assigned via `df.loc[...]`; that lookup raises when the merged
    # frame has repeated index labels.
    data_z = df_geo[np.abs(df_geo['Z']) < 2000].copy()

    # Per-class density histogram (each class normalised independently).
    sns.histplot(
        data=data_z, x='Z', hue='evolutionary_phase',
        palette=palette_dict, hue_order=hue_order,
        element="step", fill=True,
        linewidth=0,
        stat="density", common_norm=False, bins=100, alpha=0.6
    )
    plt.yscale('log')
    # Dashed reference line at Z = 0.
    plt.axvline(0, color='black', linestyle='--', linewidth=0.8, alpha=0.5, label='Galactic Plane')
    plt.title("Vertical Distribution ($Z$-Height)", fontweight='bold')
    plt.xlabel("Vertical Distance (pc)"); plt.ylabel("Density (Log)")
    plt.xlim(-1500, 1500)
    sns.despine()
    plt.savefig(f"{output_dir}/8_Vertical_Structure.png", dpi=300, bbox_inches='tight')
    plt.close()

    # ==========================================================================
    # PLOT 9: SKY DENSITY MAP (CLEAN)
    # ==========================================================================
    # Hexagonally binned star counts over the sky, in galactic longitude/latitude.
    print("[9/11] Plotting Sky Density Map...")
    fig = plt.figure(figsize=(14, 8))
    ax = fig.add_subplot(111, projection='aitoff')

    # bins='log' colours by log-scaled counts; mincnt=1 leaves empty cells blank.
    # Uses every row of df_geo (no subsampling here).
    hb = ax.hexbin(
        df_geo['l_rad'], df_geo['b_rad'],
        gridsize=150, cmap='plasma', bins='log', mincnt=1,
        edgecolors='face', linewidths=0
    )
    ax.set_title("Sky Density Map (Galactic Coordinates)", fontweight='bold', pad=20)
    # Text marker at (l, b) = (0, 0), labelled 'GC' (presumably the Galactic Centre).
    ax.text(0, 0, '+ GC', ha='center', va='center', color='cyan', fontweight='bold', fontsize=12)
    ax.grid(False)

    cb = plt.colorbar(hb, ax=ax, orientation='horizontal', pad=0.1, shrink=0.5)
    cb.set_label('Log Star Count ($log_{10} N$)')
    cb.outline.set_linewidth(0)
    plt.savefig(f"{output_dir}/9_Sky_Density.png", dpi=300, bbox_inches='tight')
    plt.close()

    # ==========================================================================
    # PLOT 10 & 11: TEMP & MAG HISTOGRAM (CLEAN STEP)
    # ==========================================================================
    # Two per-class histograms with a log-scaled count axis: effective
    # temperature (plot 10) and absolute magnitude (plot 11).
    print("[10 & 11] Plotting Temp & Mag Histograms...")

    # Temp
    plt.figure(figsize=(10, 6))
    # Rows with 0 < teff_gspphot < 30000 are binned; the visible x range is then
    # limited to 0-20000 below, so the hottest bins are computed but not shown.
    df_teff = df[(df['teff_gspphot'] > 0) & (df['teff_gspphot'] < 30000)]
    sns.histplot(
        data=df_teff, x='teff_gspphot', hue='evolutionary_phase',
        palette=palette_dict, hue_order=hue_order,
        element="step", fill=True, linewidth=0,
        log_scale=(False, True), common_norm=False, bins=50, alpha=0.7
    )
    plt.title("Effective Temperature Distribution", fontweight='bold')
    plt.xlabel("Teff (K)"); plt.xlim(0, 20000)
    sns.despine()
    plt.savefig(f"{output_dir}/10_Temp_Distribution.png", dpi=300, bbox_inches='tight')
    plt.close()

    # Mag
    plt.figure(figsize=(10, 6))
    # Only rows with a defined absolute magnitude.
    df_abs = df[df['abs_G0'].notnull()]
    sns.histplot(
        data=df_abs, x='abs_G0', hue='evolutionary_phase',
        palette=palette_dict, hue_order=hue_order,
        element="step", fill=True, linewidth=0,
        log_scale=(False, True), common_norm=False, bins=50, alpha=0.7
    )
    plt.title("Luminosity Distribution", fontweight='bold')
    # x axis inverted so that smaller (brighter) magnitudes sit on the right.
    plt.xlabel("Absolute Magnitude $G$"); plt.gca().invert_xaxis()
    sns.despine()
    plt.savefig(f"{output_dir}/11_Mag_Distribution.png", dpi=300, bbox_inches='tight')
    plt.close()

    # ZIP ALL
    # Packs the contents of output_dir into
    # /kaggle/working/Final_Thesis_Visualizations.zip (make_archive appends ".zip").
    shutil.make_archive("/kaggle/working/Final_Thesis_Visualizations", 'zip', output_dir)
    print("\nDONE! Please download 'Final_Thesis_Visualizations.zip'.")

# Reached when the load step produced an empty frame: nothing is plotted.
else:
    print("Error: Data Empty.")

In [None]:
# 9. PREPROCESSING & FEATURE ENGINEERING (REVISED: TEFF INCLUDED)
# ==============================================================================
import numpy as np
import pandas as pd
from sklearn.utils import resample, class_weight
import gc

RANDOM_STATE = 42
print("Starting Preprocessing (Mode: Scientific, Kinematics & Memory Safe)...")

# --- 1. MEMORY EFFICIENT DATA LOADING ---
# NOTE: when `df_gabungan` exists, `df` is bound to that same object (no copy),
# so the in-place edits below are also visible through `df_gabungan`.
if 'df_gabungan' in locals():
    df = df_gabungan
    print(f"   -> Using df_gabungan ({len(df):,} rows).")
else:
    try:
        df = pd.read_csv("FINAL_MASTER_DATASET.csv")
        print(f"   -> Loaded CSV ({len(df):,} rows).")
    except Exception as exc:
        # Dummy fallback for testing purposes only.
        # Still best-effort, but no longer a bare `except:` - KeyboardInterrupt
        # and SystemExit propagate, and the real cause is reported.
        print("Warning: File not found, creating dummy dataframe...")
        print(f"   -> Reason: {exc}")
        df = pd.DataFrame(columns=['phot_g_mean_mag', 'phot_bp_mean_mag', 'phot_rp_mean_mag',
                                   'parallax', 'dataset_source', 'teff_gspphot'])

# OPTIMIZATION: Downcast float64 to float32 to save RAM
cols_float = df.select_dtypes(include=['float64']).columns
df[cols_float] = df[cols_float].astype('float32')
gc.collect()

# ==============================================================================
# A. WHITE DWARF (WD) COLUMN DIAGNOSTICS & REPAIR
# ==============================================================================
print("\nDiagnosing WD Data...")
# Rows whose source name mentions the WD catalogue (case-insensitive).
wd_mask = df['dataset_source'].astype(str).str.contains('Gentile|WD', case=False, regex=True)

# Column Name Standardizer
# canonical name -> alternative spellings, tried in this order. Only the first
# spelling present is renamed, and only when the canonical column is absent.
column_aliases = {
    'mass_wd': ('MassH', 'mass'),
    'age_wd_cooling': ('AgeH', 'age'),
    'teff_gspphot': ('teff_val', 'teff'),
}
rename_map = {}
for canonical, candidates in column_aliases.items():
    if canonical in df.columns:
        continue
    found = next((alias for alias in candidates if alias in df.columns), None)
    if found is not None:
        rename_map[found] = canonical

if rename_map:
    print(f"   -> Renaming columns: {rename_map}")
    df.rename(columns=rename_map, inplace=True)

# Ensure target columns exist (all-NaN placeholders when nothing could be renamed)
for required in ('mass_wd', 'age_wd_cooling'):
    if required not in df.columns:
        df[required] = np.nan

# ==============================================================================
# B. FEATURE ENGINEERING & PHYSICS FILTERS
# ==============================================================================
print("\nCalculating Photometric, Astrometric & Kinematic Features...")

# 1. Target Consolidation
if 'mass_final' not in df.columns: df['mass_final'] = np.nan
if 'age_final' not in df.columns: df['age_final'] = np.nan

# Merge Targets (WD + FLAME)
# WD rows are overwritten with the WD-catalogue values first; whatever is still
# NaN afterwards (in any row) is then filled from the FLAME columns if present.
df.loc[wd_mask, 'mass_final'] = df.loc[wd_mask, 'mass_wd']
df.loc[wd_mask, 'age_final'] = df.loc[wd_mask, 'age_wd_cooling']
if 'mass_flame' in df.columns: df['mass_final'] = df['mass_final'].fillna(df['mass_flame'])
if 'age_flame' in df.columns: df['age_final'] = df['age_final'].fillna(df['age_flame'])

# 2. Log Transform Targets (Positive values only)
# np.where evaluates log10 on every element before selecting, so zero or
# negative inputs would emit divide/invalid RuntimeWarnings even though their
# results are discarded. The warnings are silenced for these two lines only;
# the stored values are unchanged.
with np.errstate(divide='ignore', invalid='ignore'):
    df['log_mass'] = np.where(df['mass_final'] > 0, np.log10(df['mass_final']), np.nan)
    df['log_age'] = np.where(df['age_final'] > 0, np.log10(df['age_final']), np.nan)

# 3. Input Features (Vectorized)
# Color Indices
# Plain differences of the mean magnitudes.
# NOTE(review): 'bp_rp0' is overwritten here from the raw BP/RP magnitudes; if an
# upstream step stored a differently derived value under this name (the labeling
# cell reads an existing 'bp_rp0'), it is replaced - confirm this is intended.
# NOTE(review): if `df` is still the same object as `df_gabungan` at this point,
# these three columns are written into `df_gabungan` as well.
df['bp_rp0'] = df['phot_bp_mean_mag'] - df['phot_rp_mean_mag']
df['bp_g'] = df['phot_bp_mean_mag'] - df['phot_g_mean_mag']
df['g_rp'] = df['phot_g_mean_mag'] - df['phot_rp_mean_mag']

# --- PARALLAX FILTER ---
# Keeps parallax > 0 only; NaN parallaxes compare False and are dropped too.
# From here on `df` is a filtered copy, so `wd_mask` (built on the unfiltered
# frame) no longer lines up with it row-for-row.
print("   -> Removing negative/zero parallax...")
mask_valid_parallax = df['parallax'] > 0.0
df = df[mask_valid_parallax].copy()

# 4. Calculate Absolute Magnitude (Distance Modulus)
# Formula: M = m - 5 * log10(d) + 5
# Distance is taken as 1000/parallax (assumes parallax in mas - TODO confirm).
# NOTE(review): no extinction term appears in this formula, and any existing
# 'abs_G0' column is overwritten.
df['distance_pc'] = 1000.0 / df['parallax']
df['abs_G0'] = df['phot_g_mean_mag'] - 5 * np.log10(df['distance_pc']) + 5

# --- [KINEMATIC FEATURES] ---
# Computed only when both proper-motion components are present; otherwise
# 'log_v_tan' is created as an all-NaN column.
if 'pmra' in df.columns and 'pmdec' in df.columns:
    df['pm_total'] = np.sqrt(df['pmra']**2 + df['pmdec']**2)
    # Tangential Velocity equation: v_tan = 4.74 * mu / p
    df['v_tan'] = 4.74 * df['pm_total'] / df['parallax']
    # Log transform for better distribution (+1 to avoid log(0))
    df['log_v_tan'] = np.log10(df['v_tan'] + 1)
else:
    print("Warning: PMRA/PMDEC columns incomplete. Skipping kinematics.")
    df['log_v_tan'] = np.nan

# Galactic Coordinate Normalization
# Missing l/b are replaced by 0 before scaling: l is divided by 360 and b is
# shifted by +90 then divided by 180.
# NOTE(review): requires columns 'l' and 'b' to exist (KeyError otherwise), and
# filling gaps with 0 places those stars at l = 0, b = 0 - confirm acceptable.
df[['l', 'b']] = df[['l', 'b']].fillna(0)
df['l_norm'] = df['l'] / 360.0
df['b_norm'] = (df['b'] + 90) / 180.0

# 5. Evolutionary Phase Labeling (Standardized to English)
print("Labeling Evolutionary Phases...")
df['evolutionary_phase'] = 'Other'

# Priority 1: Source-based labeling
# Matching runs on a string view of the source column so a non-string dtype
# cannot break the `.str` accessor (missing values become 'nan', which matches
# none of the patterns). The Supergiant pattern lists the same catalogue names
# as the final-labeling cell, so rows from those sources are not silently left
# as 'Other' here. Rules are applied in order; a later match overwrites.
source_names = df['dataset_source'].astype(str)
provenance_rules = (
    ('Scientific|Supergiant|Pantaleoni|Messineo|Hohle', 'Supergiant'),
    ('Gentile|WD', 'White Dwarf'),
    ('Giant_Query', 'Red Giant'),
)
for source_pattern, phase_name in provenance_rules:
    df.loc[source_names.str.contains(source_pattern, case=False, na=False), 'evolutionary_phase'] = phase_name

# Priority 2: FLAME-based labeling
# Only rows still 'Other' are eligible; `between` bounds are inclusive.
mask_target = df['evolutionary_phase'] == 'Other'
if 'evolstage_flame' in df.columns:
    df.loc[mask_target & df['evolstage_flame'].between(100, 419), 'evolutionary_phase'] = 'Main Sequence'
    df.loc[mask_target & df['evolstage_flame'].between(420, 489), 'evolutionary_phase'] = 'Sub-Giant'
    df.loc[mask_target & (df['evolstage_flame'] >= 490), 'evolutionary_phase'] = 'Red Giant'

# Encoding Labels to Integers for Neural Network
# Rows left as 'Other' map to NaN here.
# NOTE(review): there is no HR-position rescue step in this cell, so stars
# without a source match or FLAME stage stay 'Other' - confirm intended.
label_map = {'Main Sequence': 0, 'Sub-Giant': 1, 'Red Giant': 2, 'Supergiant': 3, 'White Dwarf': 4}
df['label_code'] = df['evolutionary_phase'].map(label_map)

# ==============================================================================
# C. DATASET SPLITTING (MAJOR REVISION)
# ==============================================================================

# 1. Mandatory Features (Pure Photometry)
base_feats = ['bp_rp0', 'bp_g', 'g_rp', 'abs_G0', 'parallax', 'ruwe', 'l_norm', 'b_norm']

# 2. Additional Features (Regression Only)
# 'teff_gspphot' is listed so that it persists through the column filtering.
reg_feats = ['mh_gspphot', 'log_v_tan', 'teff_gspphot']

# Validate feature existence: warn about absent columns and continue with
# whichever regression features are actually present.
available_reg_feats = [feat for feat in reg_feats if feat in df.columns]
missing_reg_feats = set(reg_feats) - set(available_reg_feats)
if missing_reg_feats:
    print(f"Warning: Some regression features are missing: {missing_reg_feats}")
    reg_feats = available_reg_feats

# Collect all columns to persist (de-duplicated through a set)
cols_to_keep = list(set(base_feats + reg_feats + ['label_code', 'log_mass', 'log_age', 'dataset_source']))

# Final Cleaning: a row survives only with complete base photometry and a class
# code; regression-only columns may still contain NaN at this stage.
required_cols = base_feats + ['label_code']
df_clean = df.dropna(subset=required_cols)[cols_to_keep].copy()
df_clean['label_code'] = df_clean['label_code'].astype('int8')

# Drop the working frame's name and collect garbage before building the splits.
del df
gc.collect()

print(f"\nTotal Clean Data (Basic Photometry): {len(df_clean):,} rows")

# --- A. CLASSIFICATION DATASET (PURE PHOTOMETRY) ---
print("\n--- A. Creating CLASSIFICATION Dataset ---")
TARGET_MAX = 20000  # per-class row cap

# Classes above the cap are down-sampled (seeded); smaller classes are kept whole.
frames_cls = []
for label_code in range(5):
    group = df_clean.loc[df_clean['label_code'] == label_code]
    over_cap = len(group) > TARGET_MAX
    frames_cls.append(
        group.sample(n=TARGET_MAX, random_state=RANDOM_STATE) if over_cap else group
    )

# Concatenate, shuffle all rows (seeded) and renumber before writing to disk.
df_classification = pd.concat(frames_cls).sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
df_classification.to_parquet("df_cls_final.parquet", index=False)
print(f"Classification Data Ready: {len(df_classification):,} rows.")

# Release the intermediates before the regression split is built.
del df_classification, frames_cls, group
gc.collect()

# --- B. REGRESSION DATASET (HYBRID: PHOTOMETRY + KINEMATICS + METALLICITY + TEFF) ---
print("\n--- B. Creating REGRESSION Dataset ---")
target_classes = [0, 1, 2] # Main Sequence, Sub-Giant, Red Giant

# Row filter: class must be one of the three targets and both log targets finite.
reg_conditions = df_clean['label_code'].isin(target_classes)
for target_name in ('log_mass', 'log_age'):
    reg_conditions &= np.isfinite(df_clean[target_name])

# Each optional regression feature that made it into df_clean must be finite too
# (this is where NaN Teff / metallicity / kinematics rows are removed).
for feature_name in ('mh_gspphot', 'log_v_tan', 'teff_gspphot'):
    if feature_name in df_clean.columns:
        reg_conditions &= np.isfinite(df_clean[feature_name])

df_reg = df_clean[reg_conditions].copy()

# Limit Regression Data (seeded down-sample when above the cap)
MAX_REG = 1000000
if len(df_reg) > MAX_REG:
    print(f"   Sampling Regression Data to {MAX_REG} (from total {len(df_reg):,})...")
    df_reg = df_reg.sample(n=MAX_REG, random_state=RANDOM_STATE)

df_reg.to_parquet("df_reg_flame_final.parquet", index=False)
print(f"Regression Data Ready: {len(df_reg):,} rows (Teff included).")

# Both frames are on disk now; free them.
del df_reg, df_clean
gc.collect()

print("\nDone. Missing column issues have been resolved.")

In [None]:
# 10. MASTER DATASET VISUALIZATION (OPTIONAL: RUN THIS SECTION ONLY IF YOU WANT THE OVERVIEW FIGURE)
# ==============================================================================
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from astropy.coordinates import SkyCoord
from astropy import units as u

# --- CONFIGURATION: JOURNAL AESTHETICS ---
# The style sheet is applied FIRST: `plt.style.use` writes its own rcParams, so
# calling it after the overrides below would replace any of them that the
# sheet also defines.
plt.style.use('seaborn-v0_8-white') # Clean white background
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman', 'DejaVu Serif']
plt.rcParams['axes.linewidth'] = 1.2
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 14

# 1. LOAD DATA
# Prefer the in-memory master frame (copied, so plotting cannot alter it);
# otherwise fall back to the CSV export, and finally to an empty frame.
if 'df_gabungan' in locals():
    df = df_gabungan.copy()
else:
    try:
        df = pd.read_csv("FINAL_MASTER_DATASET.csv")
    except Exception as exc:
        # Still best-effort, but no longer a bare `except:` - KeyboardInterrupt
        # and SystemExit propagate, and the real cause is reported.
        print("Error: Dataset not found.")
        print(f"   -> Reason: {exc}")
        df = pd.DataFrame()

if not df.empty:
    print(f"Analyzing {len(df):,} Stars...")

    # 2. STANDARDIZE LABELS TO ENGLISH
    # Every label spelling that can reach this cell is mapped onto the class
    # names used by `palette_dict` / `hue_order` below.
    label_translation = {
        'Deret Utama': 'Main Sequence',
        'Sub-Raksasa': 'Sub-Giant',
        'Raksasa': 'Red Giant',
        'Maharaksasa': 'Supergiant',
        'Katai Putih': 'White Dwarf',
        # The final-labeling cell writes plain 'Giant'; without this entry that
        # class has no palette colour in the panels below.
        'Giant': 'Red Giant',
    }

    target_col = 'fase_evolusi_final'
    if target_col in df.columns:
        df['evolutionary_phase'] = df[target_col].replace(label_translation)
    elif 'evolutionary_phase' in df.columns:
        # Already renamed in previous steps: still normalise the spelling.
        df['evolutionary_phase'] = df['evolutionary_phase'].replace(label_translation)
    else:
        df['evolutionary_phase'] = 'Unknown'

    # 3. SETUP PALETTE
    palette_dict = {
        'Main Sequence': 'tab:blue',
        'Sub-Giant': 'tab:orange',
        'Red Giant': 'tab:red',
        'Supergiant': 'tab:purple',
        'White Dwarf': 'tab:cyan'
    }
    hue_order = ['White Dwarf', 'Main Sequence', 'Sub-Giant', 'Red Giant', 'Supergiant']

    # 4. INITIALIZE GRID
    # One figure with a 3x3 grid; the panels below claim cells or spans of it.
    fig = plt.figure(figsize=(20, 18))
    gs = fig.add_gridspec(3, 3)

    # ------------------------------------------------------------------------------
    # PANEL 1: HERTZSPRUNG-RUSSELL (HR) DIAGRAM
    # ------------------------------------------------------------------------------
    ax1 = fig.add_subplot(gs[0, :2]) # Spans 2 columns

    # Sampling for performance: at most 10,000 rows per class. Seeded (like the
    # map panel's sampler) so the saved figure is reproducible between runs.
    sample_hr = df.groupby('evolutionary_phase').apply(
        lambda x: x.sample(min(len(x), 10000), random_state=42)
    ).reset_index(drop=True)

    sns.scatterplot(
        data=sample_hr, x='bp_rp0', y='abs_G0',
        hue='evolutionary_phase', hue_order=hue_order, palette=palette_dict,
        s=15, alpha=0.6, edgecolor='none', ax=ax1
    )
    # Smaller magnitudes are brighter, so the y axis is flipped.
    ax1.invert_yaxis()
    ax1.set_title("Hertzsprung-Russell (HR) Diagram - Master Dataset", fontweight='bold')
    ax1.set_xlabel("Color Index ($G_{BP} - G_{RP}$)")
    ax1.set_ylabel("Absolute Magnitude ($M_G$)")
    ax1.legend(loc='upper right', title="Evolutionary Phase", frameon=True)
    ax1.grid(False)

    # ------------------------------------------------------------------------------
    # PANEL 2: CLASS DISTRIBUTION (BAR CHART)
    # ------------------------------------------------------------------------------
    # Top-right grid cell: one horizontal bar per evolutionary phase.
    ax2 = fig.add_subplot(gs[0, 2])
    class_counts = df['evolutionary_phase'].value_counts()

    # Bars are coloured through palette_dict, keyed by the phase name on the y axis.
    sns.barplot(x=class_counts.values, y=class_counts.index, palette=palette_dict, ax=ax2, edgecolor='none')
    # Log x axis so sparse classes stay visible next to the dominant one.
    ax2.set_xscale('log')
    ax2.set_title("Class Imbalance (Log Scale)", fontweight='bold')
    ax2.set_xlabel("Number of Stars (Log)")
    # Annotate the bars of the first container with their counts.
    ax2.bar_label(ax2.containers[0], fmt='%.0f', padding=3)
    ax2.grid(False)

    # ------------------------------------------------------------------------------
    # PANEL 3: MASS DISTRIBUTION
    # ------------------------------------------------------------------------------
    ax3 = fig.add_subplot(gs[1, 0])
    # Strictly positive masses only (required for the log axis); NaN rows fail
    # the comparison and are excluded as well.
    df_mass = df[df['mass_final'] > 0]

    sns.histplot(
        data=df_mass, x='mass_final', hue='evolutionary_phase',
        palette=palette_dict, hue_order=hue_order,
        element="step", log_scale=True, common_norm=False, ax=ax3, legend=False
    )
    # Raw strings: "\o" is not a valid Python escape and triggers a
    # SyntaxWarning on recent interpreters; the rendered text is unchanged.
    ax3.set_title(r"Mass Distribution ($M_{\odot}$)", fontweight='bold')
    ax3.set_xlabel(r"Mass ($M_{\odot}$)")
    ax3.grid(False)

    # ------------------------------------------------------------------------------
    # PANEL 4: AGE DISTRIBUTION
    # ------------------------------------------------------------------------------
    # Middle grid cell: per-class histogram of age_final on a log axis.
    ax4 = fig.add_subplot(gs[1, 1])
    # Ages above 0.001 only; NaN rows fail the comparison and are excluded.
    df_age = df[df['age_final'] > 0.001]

    # legend=False: no legend is drawn on this panel.
    sns.histplot(
        data=df_age, x='age_final', hue='evolutionary_phase',
        palette=palette_dict, hue_order=hue_order,
        element="step", log_scale=True, common_norm=False, ax=ax4, legend=False
    )
    ax4.set_title("Age Distribution (Gyr)", fontweight='bold')
    ax4.set_xlabel("Age (Gyr)")
    ax4.grid(False)

    # ------------------------------------------------------------------------------
    # PANEL 5: GALACTIC MAP (AITOFF PROJECTION)
    # ------------------------------------------------------------------------------
    ax5 = fig.add_subplot(gs[1, 2], projection='aitoff')

    # Coordinate Conversion
    # Seeded subsample of at most 5,000 stars; ICRS (ra, dec) is converted to
    # galactic longitude/latitude in radians, longitude wrapped at 180 deg.
    sample_map = df.sample(min(len(df), 5000), random_state=42)
    c = SkyCoord(ra=sample_map['ra'].values*u.degree, dec=sample_map['dec'].values*u.degree, frame='icrs')
    l_rad = c.galactic.l.wrap_at(180*u.deg).radian
    b_rad = c.galactic.b.radian

    # Points are coloured by mass_final.
    sc = ax5.scatter(l_rad, b_rad, c=sample_map['mass_final'], cmap='magma', s=5, alpha=0.7)
    ax5.set_title("Spatial Distribution (Galactic Map)", fontweight='bold')
    ax5.grid(True, alpha=0.3) # Subtle grid for map context

    # Colorbar
    cb = plt.colorbar(sc, ax=ax5, orientation='horizontal', shrink=0.6, pad=0.1)
    # Raw string: "\o" is not a valid Python escape and triggers a
    # SyntaxWarning on recent interpreters; the rendered label is unchanged.
    cb.set_label(r"Mass ($M_{\odot}$)")
    cb.outline.set_linewidth(0)

    # ------------------------------------------------------------------------------
    # PANEL 6: FEATURE CORRELATION MATRIX
    # ------------------------------------------------------------------------------
    # Bottom row (all three grid columns): pairwise correlation heatmap.
    ax6 = fig.add_subplot(gs[2, :])
    features = ['phot_g_mean_mag', 'phot_bp_mean_mag', 'phot_rp_mean_mag',
                'bp_rp0', 'parallax', 'ruwe', 'abs_G0', 'mass_final', 'age_final']

    # Check if columns exist
    available_features = [f for f in features if f in df.columns]
    corr = df[available_features].corr()

    # Boolean upper triangle (diagonal included): those cells are hidden so each
    # pair appears once.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(
        corr, mask=mask, annot=True, fmt=".2f",
        cmap='coolwarm', vmin=-1, vmax=1, ax=ax6,
        linewidths=0.5
    )
    ax6.set_title("Physical Feature Correlation Matrix", fontweight='bold')

    # Lay out all panels, write the figure to the working directory, then show it.
    plt.tight_layout()
    plt.savefig("Master_Dataset_EDA_Scientific.png", dpi=300)
    plt.show()

# Reached when the load step produced an empty frame: nothing is plotted.
else:
    print("Error: Data Empty.")