In [None]:
import pandas as pd

def find_cluster_columns(df: pd.DataFrame, max_unique: int = 20) -> list[str]:
    """Return the text-like columns of *df* that can serve as cluster labels.

    A column qualifies when it holds text (``object`` or pandas string dtype)
    and has at least 2 but fewer than *max_unique* distinct non-missing values.
    Columns are returned in their original order.
    """
    candidates = []
    for col in df.columns:
        series = df[col]
        # Newer pandas may store text with a dedicated string dtype instead of
        # "object", so a plain `dtype == "object"` comparison would miss it.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        # A constant (or all-missing) column cannot define more than one
        # cluster, so sampling on it would be meaningless.
        if is_text and 2 <= series.nunique() < max_unique:
            candidates.append(col)
    return candidates


def sample_clusters(
    df: pd.DataFrame,
    cluster_col: str,
    n_clusters: int = 3,
    random_state: int = 42,
) -> tuple[list, pd.DataFrame]:
    """Randomly pick clusters and return every row that belongs to them.

    Args:
        df: Source data.
        cluster_col: Column whose distinct values identify the clusters.
        n_clusters: Number of clusters to draw; capped at the number available.
        random_state: Seed forwarded to ``Series.sample`` for reproducibility.

    Returns:
        ``(selected_labels, rows)`` where ``rows`` contains all rows of *df*
        whose *cluster_col* value is one of ``selected_labels``.
    """
    # Missing values are not a real cluster; dropping them also keeps the
    # cluster count consistent with `nunique()`, which ignores NaN.
    labels = df[cluster_col].dropna().unique()
    n_selected = min(n_clusters, len(labels))
    selected = (
        pd.Series(labels).sample(n=n_selected, random_state=random_state).tolist()
    )
    return selected, df[df[cluster_col].isin(selected)]


def plot_cluster_distributions(
    df: pd.DataFrame, cluster_sample: pd.DataFrame, cluster_col: str
):
    """Draw original and sampled cluster sizes as two stacked bar charts.

    Two consecutive ``Series.plot`` calls draw onto the same axes, so the
    second chart would overwrite the first. Plotting both count columns from
    one DataFrame with ``subplots=True`` gives each its own axes.
    """
    original_counts = df[cluster_col].value_counts()
    # Clusters that were not sampled are shown as zero-height bars so both
    # charts share the same x axis.
    sampled_counts = (
        cluster_sample[cluster_col]
        .value_counts()
        .reindex(original_counts.index, fill_value=0)
    )
    counts = pd.DataFrame(
        {
            "Original Cluster Distribution": original_counts,
            "Sampled Cluster Distribution": sampled_counts,
        }
    )
    return counts.plot(
        kind="bar", subplots=True, title=list(counts.columns), legend=False
    )


# The guard keeps the helpers importable without touching the file system.
# Names assigned inside it are still module-level, so they remain available
# to later notebook cells.
if __name__ == "__main__":
    # Step 1: Load the dataset
    df = pd.read_csv("manufacturing_dataset_1000.csv")

    # Step 2: Normalize column names (literal space replacement, not a regex)
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)

    # Step 3: Display column names and first few rows
    print("📋 Available columns in your dataset:")
    print(df.columns.tolist())
    print("\n🔹 Data preview:")
    print(df.head(3).to_string())

    # Step 4: Automatically detect a suitable cluster column
    # (text column with few unique values)
    categorical_cols = find_cluster_columns(df)

    if not categorical_cols:
        raise KeyError("❌ No suitable categorical column found for cluster sampling. Please choose one manually.")

    # Pick the first detected categorical column as the cluster variable
    cluster_col = categorical_cols[0]
    print(f"\n✅ Using '{cluster_col}' as the cluster column for sampling.")

    # Step 5: Identify all clusters (missing values are not counted)
    print(f"Found {df[cluster_col].nunique()} unique clusters in '{cluster_col}'.")

    # Step 6 + 7: Randomly select 3 clusters (or fewer if limited) and take
    # all rows belonging to them
    sampled_clusters, cluster_sample = sample_clusters(
        df, cluster_col, n_clusters=3, random_state=42
    )
    print(f"\n🎯 Selected clusters: {sampled_clusters}")

    # Step 8: Save sampled dataset
    cluster_sample.to_csv("cluster_sample.csv", index=False)
    print("\n✅ Cluster sample created and saved as 'cluster_sample.csv'")

    # Step 9: Visualize using pandas built-in plotting
    print("\n📊 Displaying cluster distribution plots...")
    plot_cluster_distributions(df, cluster_sample, cluster_col)


Available columns:
['timestamp', 'machine_id', 'product_code', 'operator_id', 'shift', 'material_batch', 'ambient_temp_c', 'humidity_pct', 'setpoint_temp_c', 'setpoint_pressure_bar', 'spindle_speed_rpm', 'vibration_mm_s', 'torque_nm', 'energy_kwh', 'target_thickness_mm', 'measured_thickness_mm', 'quality_score', 'defect']


KeyError: "Column 'department' not found. Please change 'cluster_col' to an existing column."