### Text Clustering

In [1]:

# Toy corpus: six short English sentences that the cells below tokenize,
# vectorize, and cluster. Documents are referred to by their list index (0-5).
documents = [
    "I love programming in Python",
    "Python and Java are popular programming languages",
    "I enjoy watching movies and series",
    "Cinema and film industry is booming",
    "Machine learning and AI are future tech",
    "Music concerts are fun"
]

In [2]:
import numpy as np
import re
from collections import Counter

def preprocess(text):
    """Normalize a raw string and split it into word tokens.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, and the remainder is split on whitespace.

    Args:
        text: The raw document string.

    Returns:
        A list of lowercase tokens (empty if ``text`` holds no word characters).
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return without_punctuation.split()

# Tokenize every document in the corpus
docs_tokens = list(map(preprocess, documents))

# Vocabulary: each distinct token across all documents, in sorted order
vocab = sorted({token for tokens in docs_tokens for token in tokens})

# Vectorizing documents
def vectorize(doc, vocab):
    """Turn a tokenized document into a bag-of-words count vector.

    Args:
        doc: Iterable of tokens for one document.
        vocab: Sequence of vocabulary terms; position ``i`` of the result
            counts occurrences of ``vocab[i]`` in ``doc``.

    Returns:
        A 1-D float ``numpy`` array of length ``len(vocab)``. Tokens that are
        not in ``vocab`` are ignored; vocabulary terms absent from ``doc``
        get a count of 0.
    """
    term_counts = Counter(doc)
    return np.array([term_counts[term] for term in vocab], dtype=float)

# Stack the per-document count vectors into one matrix: one row per document,
# one column per vocabulary term.
doc_vectors = np.array([vectorize(doc, vocab) for doc in docs_tokens])

#### K-means Clustering

In [3]:
def _assign_to_nearest(X, centers):
    """Group sample indices by closest center (Euclidean; ties go to the lowest center index)."""
    groups = [[] for _ in range(len(centers))]
    for idx, x in enumerate(X):
        distances = [np.linalg.norm(x - c) for c in centers]
        groups[int(np.argmin(distances))].append(idx)
    return groups


def kmeans(X, k=2, max_iters=100, seed=0):
    """Cluster the rows of ``X`` with Lloyd's k-means algorithm.

    Args:
        X: Array-like of shape (n_samples, n_features). It is converted to
            float so that centroid means are not truncated for integer input.
        k: Number of clusters; must satisfy ``1 <= k <= n_samples``.
        max_iters: Maximum number of centroid updates.
        seed: Seed for the private random generator that picks the initial
            centroids. The default of 0 reproduces the historical behavior;
            pass ``None`` for a non-deterministic initialization.

    Returns:
        ``(clusters, centroids)`` where ``clusters`` is a list of ``k`` lists
        of sample indices and ``centroids`` is a float array with ``k`` rows.
        The returned clusters are always the assignment for the returned
        centroids.

    Raises:
        ValueError: If ``k`` is not between 1 and the number of samples.
    """
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )

    # A local legacy RandomState yields the same draw as np.random.seed(seed)
    # followed by np.random.choice, but leaves the global RNG state untouched.
    rng = np.random.RandomState(seed)
    centroids = X[rng.choice(n_samples, k, replace=False)]
    clusters = _assign_to_nearest(X, centroids)

    for _ in range(max_iters):
        # Start from the current centroids so an empty cluster keeps its position.
        new_centroids = centroids.copy()
        for i, members in enumerate(clusters):
            if members:
                new_centroids[i] = X[members].mean(axis=0)
        # Checking convergence
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
        # Re-assign right after each update so the result stays consistent
        # with the centroids even when the iteration budget runs out.
        clusters = _assign_to_nearest(X, centroids)
    return clusters, centroids

# Run k-means with two clusters on the document vectors and print, for each
# cluster, the indices of the documents assigned to it.
clusters_kmeans, centroids_kmeans = kmeans(doc_vectors, k=2)
print("K-Means Clusters:", clusters_kmeans)

K-Means Clusters: [[0, 1, 3, 4, 5], [2]]


#### K-Medoids

In [4]:
def kmedoids(X, k=2, max_iters=100, seed=0):
    """Cluster the rows of ``X`` around ``k`` medoids (actual data points).

    Each iteration assigns every sample to its nearest medoid (Euclidean
    distance) and then, per cluster, picks as the new medoid the member with
    the smallest total distance to the other members.

    Args:
        X: Array-like of shape (n_samples, n_features). It is converted to
            float so that differences cannot wrap around for unsigned input.
        k: Number of clusters; must satisfy ``1 <= k <= n_samples``.
        max_iters: Maximum number of medoid updates.
        seed: Seed for the private random generator that picks the initial
            medoids. The default of 0 reproduces the historical behavior;
            pass ``None`` for a non-deterministic initialization.

    Returns:
        ``(clusters, medoid_idx)`` where ``clusters`` is a list of ``k`` lists
        of sample indices and ``medoid_idx`` is an integer array holding the
        row index of each cluster's medoid. The returned clusters are always
        the assignment for the returned medoids.

    Raises:
        ValueError: If ``k`` is not between 1 and the number of samples.
    """
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )

    # A local legacy RandomState yields the same draw as np.random.seed(seed)
    # followed by np.random.choice, but leaves the global RNG state untouched.
    rng = np.random.RandomState(seed)
    medoid_idx = rng.choice(n_samples, k, replace=False)

    def assign(current_medoids):
        # Assign clusters based on distance to medoid (ties go to the lowest index)
        groups = [[] for _ in range(k)]
        for idx, x in enumerate(X):
            distances = [np.linalg.norm(x - X[m]) for m in current_medoids]
            groups[int(np.argmin(distances))].append(idx)
        return groups

    clusters = assign(medoid_idx)
    for _ in range(max_iters):
        # Update medoids; an empty cluster keeps its previous medoid.
        new_medoids = medoid_idx.copy()
        for i, members in enumerate(clusters):
            if members:
                intra_distances = [
                    sum(np.linalg.norm(X[p] - X[q]) for q in members)
                    for p in members
                ]
                new_medoids[i] = members[int(np.argmin(intra_distances))]
        if np.array_equal(new_medoids, medoid_idx):
            break
        medoid_idx = new_medoids
        # Re-assign right after each update so the result stays consistent
        # with the medoids even when the iteration budget runs out.
        clusters = assign(medoid_idx)
    return clusters, medoid_idx

# Run k-medoids with two clusters on the document vectors and print, for each
# cluster, the indices of the documents assigned to it.
clusters_kmedoid, medoids = kmedoids(doc_vectors, k=2)
print("K-Medoids Clusters:", clusters_kmedoid)


K-Medoids Clusters: [[0, 1, 3, 4, 5], [2]]


#### Text Shingling (Jaccard Similarity)

In [5]:
def k_shingles(doc, k=2):
    """Collect the distinct k-token shingles of a tokenized document.

    Args:
        doc: Sequence of tokens.
        k: Number of consecutive tokens per shingle.

    Returns:
        A set of ``k``-tuples, one for each distinct run of ``k`` consecutive
        tokens. The set is empty when ``doc`` has fewer than ``k`` tokens.
    """
    last_start = len(doc) - k
    return {tuple(doc[start:start + k]) for start in range(last_start + 1)}

# One set of word-level 2-shingles (consecutive token pairs) per document
shingle_sets = [k_shingles(doc, k=2) for doc in docs_tokens]

def jaccard_sim(set1, set2):
    """Return the Jaccard similarity ``|A & B| / |A | B|`` of two sets.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float in ``[0, 1]``. Two empty sets are treated as identical and
        yield 1.0, which avoids a division by zero (for example when both
        documents are too short to produce any shingle).
    """
    union = set1 | set2
    if not union:
        return 1.0
    return len(set1 & set2) / len(union)

# Pairwise Jaccard similarity between the shingle sets of all documents
n = len(shingle_sets)
sim_matrix = np.zeros((n, n))
for i, row_shingles in enumerate(shingle_sets):
    for j, col_shingles in enumerate(shingle_sets):
        sim_matrix[i, j] = jaccard_sim(row_shingles, col_shingles)

print("Jaccard Similarity Matrix (2-shingles):\n", np.round(sim_matrix, 2))

Jaccard Similarity Matrix (2-shingles):
 [[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]]
