In [None]:
import pandas as pd
import numpy as np
import random

In [None]:
# Read the nominal weather dataset from disk and preview its first rows.
csv_path = '/content/weather.nominal.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [None]:
# Build the feature frame: remove the "play" column and recast the "windy"
# column to integers, working on an independent copy of df.
data = (
    df.drop("play", axis="columns")
    .copy()
    .assign(windy=lambda frame: frame["windy"].astype(int))
)
data.head()

Unnamed: 0,outlook,temperature,humidity,windy
0,sunny,hot,high,0
1,sunny,hot,high,1
2,overcast,hot,high,0
3,rainy,mild,high,0
4,rainy,cool,normal,0


Here, I dropped the `play` column because unsupervised learning does not need a target variable, so the labels are not used. The boolean `windy` column is converted to integers (0/1) so that it can be treated as a categorical feature like the other columns.

In [None]:
# Integer-encode the categorical variables: within each column, every
# distinct value receives a code in order of first appearance (0, 1, 2, ...).
encoded_data = data.copy()
unique_values = {}
for col in data.columns:
    unique_values[col] = data[col].unique().tolist()
    mapping = dict(zip(unique_values[col], range(len(unique_values[col]))))
    encoded_data[col] = encoded_data[col].map(mapping)

# Plain NumPy matrix of the codes, one row per observation.
X = encoded_data.values
encoded_data.head()

Unnamed: 0,outlook,temperature,humidity,windy
0,0,0,0,0
1,0,0,0,1
2,1,0,0,0
3,2,1,0,0
4,2,2,1,0


This step converts our dataset from categorical text values into numerical codes, which allows us to compute Hamming distances, run K-Modes clustering, and create plots or perform other mathematical operations.

In [None]:
def hamming_distance(row, centroid):
    """Return the number of positions at which two category vectors differ.

    Both arguments are coerced with ``np.asarray`` so the comparison is
    always element-wise. Without the coercion, two plain Python lists or
    tuples would be compared as whole objects, yielding a single boolean and
    therefore a distance of at most 1.

    Args:
        row: Encoded observation (array-like).
        centroid: Encoded cluster centre, broadcastable against ``row``.

    Returns:
        The count of mismatching positions.
    """
    return np.sum(np.asarray(row) != np.asarray(centroid))

After converting the dataset, we apply `hamming_distance`. To see how this function works, suppose we have a row and a centroid given as NumPy arrays: row = [0, 0, 1, 0] and centroid = [0, 0, 0, 0]. The expression `row != centroid` compares the two element by element (column by column) and returns a boolean array. Here only the third element is a mismatch, so the distance between the points is 1: each mismatch counts as 1 and each match counts as 0.


In [None]:
def calculate_mode(cluster_points):
    """Return the column-wise mode of the points in one cluster.

    Args:
        cluster_points: 2-D array-like of shape (n_points, n_features)
            holding category values. It must contain at least one row.

    Returns:
        1-D ``np.ndarray`` holding the most frequent value of each column.
        Ties go to the smallest value, because ``np.unique`` returns the
        values sorted and ``np.argmax`` picks the first maximum.
    """
    cluster_points = np.asarray(cluster_points)
    mode = []
    for col in range(cluster_points.shape[1]):
        values, counts = np.unique(cluster_points[:, col], return_counts=True)
        # np.argmax is a function and must be called; subscripting it with
        # square brackets raises TypeError.
        mode.append(values[np.argmax(counts)])
    return np.array(mode)

After defining the distance between two points, we define `calculate_mode`. For each column in the cluster, it finds the most frequent value; this vector of modes becomes the new centroid of that cluster. For example, a result of array([0, 1, 0, 1]) means:

For outlook -> most common value = 0 (sunny)

For temperature -> most common value = 1 (mild)

For humidity -> most common value = 0 (high)

For windy -> most common value = 1 (True)

This becomes the updated cluster centroid in the next K-Modes iteration.

In [None]:
def k_modes(X, k=2, max_iter=100, random_state=None):
    """Cluster categorical rows with the K-Modes algorithm.

    Initial centroids are ``k`` distinct rows of ``X`` drawn at random. The
    algorithm then alternates between assigning each row to the centroid
    with the smallest Hamming distance and replacing each centroid by the
    column-wise mode of its cluster, until the centroids stop changing or
    ``max_iter`` updates have been performed.

    Args:
        X: 2-D array-like of shape (n_rows, n_features) with encoded
            category values.
        k: Number of clusters; must satisfy ``1 <= k <= n_rows``.
        max_iter: Maximum number of centroid updates. Zero or a negative
            value returns the assignment to the initial centroids.
        random_state: Optional seed for a private ``random.Random``
            generator. When ``None``, the module-level ``random`` generator
            is used, as before.

    Returns:
        A ``(clusters, centroids)`` tuple: ``clusters`` is a 1-D array with
        the cluster index of every row, and ``centroids`` is a
        ``(k, n_features)`` array. The labels are always computed against
        the centroids that are returned.

    Raises:
        ValueError: If ``k`` is outside ``1..n_rows``.
    """
    X = np.asarray(X)
    n_rows = X.shape[0]
    if not 1 <= k <= n_rows:
        raise ValueError(f"k must be between 1 and {n_rows}, got {k}")

    # Initialize centroids from k distinct, randomly chosen rows.
    rng = random if random_state is None else random.Random(random_state)
    centroids = X[rng.sample(range(n_rows), k), :]

    def assign(current_centroids):
        # Label each row with the index of its nearest centroid.
        return np.array([
            np.argmin([hamming_distance(row, c) for c in current_centroids])
            for row in X
        ])

    # Assigning before the loop keeps `clusters` defined even when
    # max_iter <= 0.
    clusters = assign(centroids)

    for _ in range(max_iter):
        # Update centroids (mode of each cluster).
        new_centroids = []
        for cluster_id in range(k):
            cluster_points = X[clusters == cluster_id]
            if len(cluster_points) > 0:
                new_centroids.append(calculate_mode(cluster_points))
            else:
                # Keep the old centroid if the cluster is empty.
                new_centroids.append(centroids[cluster_id])
        new_centroids = np.array(new_centroids)

        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids
        # Re-assign right after every update so the returned labels never
        # lag behind the returned centroids.
        clusters = assign(centroids)

    return clusters, centroids

# Cluster the encoded rows into two groups, then tag every row of the
# feature frame with the index of the cluster it was assigned to.
clusters, centroids = k_modes(X, k=2)
data["cluster"] = clusters

# Show the labelled table followed by the centroids as category codes.
print(data, "\nFinal Centroids (in encoded form):", centroids, sep="\n")

     outlook temperature humidity  windy  cluster
0      sunny         hot     high  False        0
1      sunny         hot     high   True        0
2   overcast         hot     high  False        0
3      rainy        mild     high  False        0
4      rainy        cool   normal  False        1
5      rainy        cool   normal   True        1
6   overcast        cool   normal   True        1
7      sunny        mild     high  False        0
8      sunny        cool   normal  False        1
9      rainy        mild   normal  False        0
10     sunny        mild   normal   True        0
11  overcast        mild     high   True        0
12  overcast         hot   normal  False        1
13     rainy        mild     high   True        0

Final Centroids (in encoded form):
[[0 1 0 0]
 [1 2 1 0]]


The K-Modes algorithm grouped the weather data into two clusters based on categorical similarity. Each row now has a cluster label (0 or 1). The final centroids show the most common category in each column for that cluster, which acts as the cluster's 'profile'. You can decode the numbers back to their original categories to see what each cluster represents (e.g. mostly sunny/mild days vs. rainy/cool days). Cluster 0 contains rows whose most common pattern is sunny, mild, high humidity, not windy. Cluster 1 contains rows whose most common pattern is overcast/rainy, cool, normal humidity, not windy.