In [34]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


In [35]:
# Load the segmentation dataset into a DataFrame. The path is relative, so the
# CSV is resolved against the notebook's current working directory.
data = pd.read_csv("segmentation data.csv")
# Show the column index so the available features are visible before selection.
data.columns


Index(['ID', 'Sex', 'Marital status', 'Age', 'Education', 'Income',
       'Occupation', 'Settlement size'],
      dtype='object')

In [36]:
# Keep only the features used for clustering.
# Selecting the wanted columns (instead of dropping the unwanted ones from
# `data` itself) makes this cell idempotent: running it a second time no longer
# raises KeyError because the dropped columns are already gone.
# `.copy()` detaches the result from the loaded frame so later assignments
# cannot trigger SettingWithCopyWarning.
data = data[['Age', 'Income', 'Settlement size']].copy()


In [37]:
# Sanity check: number of missing entries in each remaining column.
print(data.isna().sum())

Age                0
Income             0
Settlement size    0
dtype: int64


In [38]:
# Feature matrix as a plain NumPy array (one row per record in `data`).
# NOTE(review): the columns are used unscaled, so the feature with the largest
# numeric range will dominate Euclidean distances - consider standardizing
# before clustering; confirm against the intended analysis.
X = data.values
desired_number_of_clusters = 4  # You can adjust this value as per your requirements
# Pin the seed and the number of initialisations so the clustering is
# reproducible across kernel restarts and scikit-learn versions (the default
# for `n_init` differs between releases).
kmeans = KMeans(n_clusters=desired_number_of_clusters, n_init=10, random_state=42)
kmeans.fit(X)




In [39]:
# Cluster index the fitted KMeans model assigns to each row of X.
labels = kmeans.predict(X)


In [40]:
import numpy as np

def kmeans_scratch(X, n_clusters, max_iterations=100, random_state=None):
    """Cluster the rows of ``X`` with Lloyd's k-means algorithm.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to cluster. Converted to a float array internally.
    n_clusters : int
        Number of clusters; must satisfy ``1 <= n_clusters <= n_samples``.
    max_iterations : int, default 100
        Upper bound on the number of centroid updates.
    random_state : int or None, default None
        Seed for the initial centroid draw. ``None`` uses NumPy's global
        random state, so ``np.random.seed`` still controls the result.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Index of the nearest centroid for every row of ``X``.

    Raises
    ------
    ValueError
        If ``n_clusters`` is outside ``1..n_samples``.
    """
    X = np.asarray(X, dtype=float)
    n_samples = len(X)
    if not 1 <= n_clusters <= n_samples:
        raise ValueError(
            f"n_clusters must be between 1 and n_samples={n_samples}, got {n_clusters}"
        )

    # Initial centroids are distinct rows of X chosen uniformly at random.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    centroids = X[rng.choice(n_samples, size=n_clusters, replace=False)]

    def assign(centers):
        # Pairwise distances have shape (n_samples, n_clusters).
        distances = np.linalg.norm(X[:, np.newaxis] - centers, axis=-1)
        return np.argmin(distances, axis=-1)

    # Assign once up front so a result exists even when max_iterations == 0.
    labels = assign(centroids)

    for _ in range(max_iterations):
        new_centroids = centroids.copy()
        for k in range(n_clusters):
            members = X[labels == k]
            # An empty cluster keeps its previous centroid; taking the mean of
            # zero rows would give NaN and corrupt every later assignment.
            if len(members):
                new_centroids[k] = members.mean(axis=0)

        # Lloyd's algorithm reaches an exact fixed point, so strict equality
        # is a valid convergence test once NaNs can no longer appear.
        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids
        labels = assign(centroids)

    return labels


In [41]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Grid-search DBSCAN's `eps` / `min_samples` and keep the pair with the highest
# silhouette score on X.
# NOTE: `labels` is reassigned inside the loop, so after this cell it holds the
# labels of the LAST grid combination, not the KMeans labels from the earlier cell.
eps_values = [0.1, 0.2, 0.5, 1, 2]
min_samples_values = [3, 4, 5, 6]
best_eps = None
best_min_samples = None
best_score = -1  # silhouette scores lie in [-1, 1], so any valid run beats this

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan.fit(X)
        labels = dbscan.labels_
        unique_labels = np.unique(labels)
        # DBSCAN marks noise with -1; it is not a real cluster.
        n_clusters = np.count_nonzero(unique_labels != -1)

        # Skip degenerate results:
        #  - fewer than two real clusters (e.g. one cluster plus noise would
        #    otherwise be scored as if noise were a second cluster);
        #  - as many labels as samples, for which silhouette_score raises
        #    ValueError (it requires 2 <= n_labels <= n_samples - 1).
        if n_clusters < 2 or len(unique_labels) >= len(X):
            continue

        # Noise points are still scored as their own group, matching the
        # original behaviour for results that contain several clusters.
        score = silhouette_score(X, labels)

        if score > best_score:
            best_score = score
            best_eps = eps
            best_min_samples = min_samples

# Both values stay None when no grid combination produced a usable clustering.
print("Best epsilon:", best_eps)
print("Best min_samples:", best_min_samples)


Best epsilon: 2
Best min_samples: 3


In [42]:
import numpy as np

def calculate_distance(X1, X2):
    """Return the Euclidean distance between ``X1`` and ``X2`` along the last axis.

    The inputs are combined with NumPy broadcasting, so a single point of shape
    (n_features,) can be compared against a matrix of shape
    (n_samples, n_features), giving an array of shape (n_samples,).
    """
    return np.sqrt(np.sum((X1 - X2) ** 2, axis=-1))


def dbscan_scratch(X, eps, min_samples):
    """Cluster the rows of ``X`` with DBSCAN.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to cluster.
    eps : float
        Neighbourhood radius; two points are neighbours when their Euclidean
        distance is ``<= eps``.
    min_samples : int
        Minimum neighbourhood size (the point itself included) for a point to
        be a core point.

    Returns
    -------
    numpy.ndarray of shape (n_samples,), dtype int
        Cluster ids starting at 1; noise points are labelled -1.
    """
    X = np.asarray(X)
    n_samples = X.shape[0]
    unvisited, noise = 0, -1
    labels = np.full(n_samples, unvisited, dtype=int)
    cluster_id = 0

    def region_query(index):
        # Indices of every point within eps of X[index] (includes index itself).
        return np.where(calculate_distance(X[index], X) <= eps)[0]

    for i in range(n_samples):
        if labels[i] != unvisited:
            continue

        neighbors = region_query(i)

        if len(neighbors) < min_samples:
            # Provisionally noise; may be claimed later as a border point.
            labels[i] = noise
            continue

        # i is a core point: start a new cluster and grow it through every
        # density-reachable point, not only the direct neighbours of i.
        cluster_id += 1
        labels[i] = cluster_id
        pending = list(neighbors)

        while pending:
            j = pending.pop()

            if labels[j] == noise:
                # Border point: joins the cluster but is not expanded, since
                # it was already found to have too few neighbours.
                labels[j] = cluster_id
                continue
            if labels[j] != unvisited:
                continue

            labels[j] = cluster_id
            j_neighbors = region_query(j)
            if len(j_neighbors) >= min_samples:
                # j is itself a core point, so its neighbourhood belongs to
                # the same cluster; queue only points not yet in a cluster.
                still_open = (labels[j_neighbors] == unvisited) | (labels[j_neighbors] == noise)
                pending.extend(j_neighbors[still_open])

    return labels
