In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# 1. Load dataset
df = pd.read_csv('sku_dataset.csv')  # Replace with your CSV file

# 2. Select numeric features for clustering
features = ['Unitprice', 'Outbound number', 'Total outbound', 'Pal grossweight', 'Pal height', 'Units per pal']
X = df[features]

# 3. Standardize features (important for KMeans!)
#    KMeans is distance-based, so every column is rescaled to zero mean and
#    unit variance; otherwise the columns with the largest raw values would
#    dominate the distance computation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. Run KMeans clustering
#    n_init is pinned explicitly: the library default changed between
#    scikit-learn releases (10 -> 'auto'), so relying on it gives different
#    clusters depending on the installed version. random_state fixes the
#    initialisation so reruns are reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)
df['Cluster'] = cluster_labels

# 5. Show cluster centers
#    The model was fitted on X_scaled, so these coordinates are in
#    standardized units (one row per cluster, one column per entry of `features`).
print("Cluster Centers:\n", kmeans.cluster_centers_)

# 6. Visualize clusters (on first 2 features)
#    Columns 0 and 1 of X_scaled correspond to features[0] and features[1].
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=cluster_labels, cmap='viridis')
plt.xlabel('Unitprice (scaled)')
plt.ylabel('Outbound number (scaled)')
plt.title('KMeans Clustering on SKU Data')
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Load dataset
df = pd.read_csv('sku_dataset.csv')

# 2. Select numeric features
features = ['Unitprice', 'Outbound number', 'Total outbound', 'Pal grossweight', 'Pal height', 'Units per pal']
X = df[features].to_numpy(dtype=float)

# Missing values would turn a whole normalised column into NaN and make every
# distance NaN, silently producing meaningless labels. Fail loudly instead.
if np.isnan(X).any():
    raise ValueError(
        "Feature columns contain missing values; drop or impute them before clustering."
    )

# 3. Normalize features (zero mean, unit variance per column)
feature_std = X.std(axis=0)
# A constant column has std == 0; dividing by it would yield NaN/inf.
# Using 1 as the divisor leaves such a column centred at 0.
feature_std[feature_std == 0] = 1.0
X = (X - X.mean(axis=0)) / feature_std

# 4. Initialize parameters
k = 3
max_iterations = 100
np.random.seed(42)  # fixed seed so the initial centroids are reproducible
# Start from k distinct data points picked at random.
centroids = X[np.random.choice(X.shape[0], k, replace=False)]

# 5. Run KMeans loop
for iteration in range(max_iterations):
    # Assign points to nearest centroid.
    # Broadcasting gives an (n_samples, k) matrix of Euclidean distances.
    distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
    labels = np.argmin(distances, axis=1)

    # Calculate new centroids as the mean of each cluster's members.
    # A cluster that received no points keeps its previous centroid: the mean
    # of an empty selection is NaN, which would poison every later distance
    # computation and prevent the convergence check from ever succeeding.
    new_centroids = centroids.copy()
    for i in range(k):
        members = X[labels == i]
        if len(members) > 0:
            new_centroids[i] = members.mean(axis=0)

    # Check convergence: once assignments stop changing, the recomputed
    # means are identical to the previous ones.
    if np.array_equal(centroids, new_centroids):
        break

    centroids = new_centroids

# 6. Add cluster labels to DataFrame
df['Cluster'] = labels

# 7. Plot result (first 2 features)
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
plt.xlabel('Unitprice (scaled)')
plt.ylabel('Outbound number (scaled)')
plt.title('KMeans Clustering (from scratch) on SKU Data')
plt.show()
