In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics import silhouette_score

In [4]:
# Step 1: Generate Random Data
def generate_data(n_samples=300, n_features=2, seed=42):
    """Generate a reproducible matrix of uniform random samples.

    Parameters
    ----------
    n_samples : int, default=300
        Number of rows to draw.
    n_features : int, default=2
        Number of columns to draw.
    seed : int or None, default=42
        Value passed to ``np.random.seed``. The default reproduces the
        previously hard-coded seed, so existing callers get the same data.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, n_features)`` produced by
        ``np.random.rand``.

    Notes
    -----
    This reseeds NumPy's *global* generator as a side effect, so any later
    draw from ``np.random`` continues the seeded stream.
    """
    np.random.seed(seed)
    return np.random.rand(n_samples, n_features)

# Step 2: Preprocessing
def preprocess_data(data):
    """Standardize ``data`` with a freshly fitted ``StandardScaler``.

    Parameters
    ----------
    data : array-like
        Samples to fit the scaler on and then transform.

    Returns
    -------
    tuple
        ``(scaled_data, scaler)``: the transformed samples and the fitted
        scaler, returned so the same transformation can be reused later.
    """
    fitted_scaler = StandardScaler().fit(data)
    return fitted_scaler.transform(data), fitted_scaler

# Step 3: Train-Test Split
def split_data(data, test_size=0.2):
    """Split ``data`` into a train part and a test part.

    Parameters
    ----------
    data : array-like
        Samples to partition.
    test_size : float or int, default=0.2
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(train_data, test_data)``.
    """
    # random_state is pinned so repeated runs produce the same partition.
    return tuple(
        train_test_split(data, test_size=test_size, random_state=42)
    )

# Step 4: Train Base Model (Mean Shift)
def train_meanshift(train_data, quantile=0.2, n_samples=100):
    """Fit a ``MeanShift`` model using a bandwidth estimated from the data.

    Parameters
    ----------
    train_data : array-like
        Samples used both to estimate the bandwidth and to fit the model.
    quantile : float, default=0.2
        Forwarded to ``estimate_bandwidth``. The default matches the value
        that was previously hard-coded.
    n_samples : int or None, default=100
        Forwarded to ``estimate_bandwidth``. The default matches the value
        that was previously hard-coded.

    Returns
    -------
    MeanShift
        The fitted model.
    """
    bandwidth = estimate_bandwidth(
        train_data, quantile=quantile, n_samples=n_samples
    )
    model = MeanShift(bandwidth=bandwidth)
    model.fit(train_data)
    return model

# Step 5: Evaluate Model (Fixed: Use Training Data)
def evaluate_model(model, train_data):
    """Return the fitted cluster labels and their silhouette score.

    Parameters
    ----------
    model : fitted clustering estimator
        Must expose ``labels_``.
    train_data : array-like
        The samples that ``model.labels_`` refer to; labels and rows are
        passed together to ``silhouette_score``.

    Returns
    -------
    tuple
        ``(labels, silhouette)``. ``silhouette`` is ``nan`` when the score is
        undefined, i.e. when there are fewer than 2 distinct labels or as
        many distinct labels as samples.
    """
    labels = model.labels_
    n_clusters = len(np.unique(labels))

    # silhouette_score only accepts 2 <= n_labels <= n_samples - 1 and raises
    # ValueError otherwise; clustering can legitimately yield a single
    # cluster, so report the score as undefined instead of crashing.
    if n_clusters < 2 or n_clusters >= len(labels):
        return labels, float("nan")

    silhouette = silhouette_score(train_data, labels)
    return labels, silhouette

# Step 6: Predict New Data
def predict_new_data(model, scaler, n_samples=5):
    """Draw random samples and assign each to its nearest cluster center.

    Parameters
    ----------
    model : fitted clustering estimator
        Must expose ``cluster_centers_`` with one row per cluster.
    scaler : fitted transformer
        Its ``transform`` is applied to the new samples before distances
        are computed.
    n_samples : int, default=5
        Number of new samples to draw.

    Returns
    -------
    tuple
        ``(new_data, predictions)``: the unscaled samples and, for each one,
        the row index of the closest center in ``model.cluster_centers_``.

    Notes
    -----
    Samples come from NumPy's global generator (``np.random.rand``), so the
    result depends on the current global RNG state.
    """
    cluster_centers = np.asarray(model.cluster_centers_)

    # Match the dimensionality the model was fitted with instead of assuming
    # two features, so this also works for data with any number of columns.
    n_features = cluster_centers.shape[1]
    new_data = np.random.rand(n_samples, n_features)
    new_data_scaled = scaler.transform(new_data)

    # Broadcast to (n_samples, n_clusters, n_features), then take the
    # Euclidean norm over the feature axis to get per-center distances.
    distances = np.linalg.norm(
        new_data_scaled[:, np.newaxis] - cluster_centers, axis=2
    )
    predictions = np.argmin(distances, axis=1)

    return new_data, predictions

In [5]:
# Run the workflow end to end: generate -> scale -> split -> fit -> score -> predict.
# generate_data() seeds NumPy's global RNG and predict_new_data() later draws
# from that same global stream, so the call order here affects the predictions.
data = generate_data()
scaled_data, scaler = preprocess_data(data)
# NOTE(review): the scaler is fitted on the full dataset before the split, and
# test_data is not used anywhere else in this cell.
train_data, test_data = split_data(scaled_data)

model = train_meanshift(train_data)
# evaluate_model pairs model.labels_ with the data passed in, so this must be
# the same data the model was fitted on.
labels, silhouette = evaluate_model(model, train_data)  # score on the fitted (training) data

new_data, predictions = predict_new_data(model, scaler)

# Print results: distinct label count, silhouette score, and the cluster index
# assigned to each newly generated sample.
print(f"Number of clusters found: {len(np.unique(labels))}")
print(f"Silhouette Score: {silhouette:.4f}")
print(f"New Data Predictions: {predictions}")

Number of clusters found: 4
Silhouette Score: 0.4127
New Data Predictions: [3 3 3 3 1]
