In [1]:
import re
import pandas as pd
import numpy as np
import random

#DONE
def clean_data(text):
    """Split a tweet into a list of normalized word tokens.

    The text is split on whitespace and each word is handled as follows:

    * words starting with ``@`` (mentions) or with ``http://`` / ``https://``
      (links) are discarded;
    * words starting with ``#`` lose that leading ``#`` and are lower-cased;
    * every other word is lower-cased and stripped of leading/trailing
      apostrophes.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        A list of cleaned words, in their original order.
    """
    cleaned_words = []
    for word in text.split():
        # Mentions and links carry no topical content for clustering.
        # Both URL schemes are listed so TLS links are dropped as well.
        if word.startswith(('@', 'http://', 'https://')):
            continue

        if word.startswith('#'):
            # A lone '#' has no tag text behind it and is kept unchanged.
            cleaned_words.append(word[1:].lower() if len(word) > 1 else word)
        else:
            cleaned_words.append(word.lower().strip("'"))
    return cleaned_words

#DONE
def jaccard_distance(set1, set2):
    """Return the Jaccard distance between two collections of hashable items.

    The distance is ``1 - |A & B| / |A | B|`` where ``A`` and ``B`` are the
    inputs converted to sets, so duplicates and ordering are ignored.

    Args:
        set1: First iterable of hashable items (e.g. a list of words).
        set2: Second iterable of hashable items.

    Returns:
        A number in ``[0, 1]``: 0 for identical sets, 1 for disjoint sets.
        Two empty inputs are treated as identical and yield ``0.0``.
    """
    set1 = set(set1)
    set2 = set(set2)

    union_size = len(set1 | set2)
    if union_size == 0:
        # Both inputs are empty: the ratio would be 0/0. Two empty sets are
        # equal, so report zero distance instead of raising ZeroDivisionError.
        return 0.0

    intersection_size = len(set1 & set2)
    return 1 - intersection_size / union_size

#DONE
def preprocess_data(file):
    """Load a pipe-delimited tweet file and return the tokenized tweets.

    The file is read without a header row using ``|`` as the separator and
    ISO-8859-1 encoding. Rows are shuffled, the third column is taken as the
    tweet text, rows with missing text are dropped, and the remaining texts
    are tokenized with ``clean_data``.

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A ``pandas.Series`` of word lists, one per tweet, indexed 0..n-1.
    """
    df = pd.read_csv(file, encoding="ISO-8859-1", header=None, sep="|")

    # Randomly shuffle the dataframe rows
    shuffled_data = df.sample(frac=1).reset_index(drop=True)

    # Retain only the relevant column (the third column is taken as the tweet
    # text). Missing values must be removed *before* cleaning: clean_data
    # calls str.split and would fail on NaN, and after cleaning every value
    # is a list, so a later dropna could never remove anything.
    tweets = shuffled_data.iloc[:, 2].dropna()

    # Guard against cells parsed as non-string values (e.g. purely numeric
    # text), which would also lack a .split method.
    tweets = tweets.astype(str)

    # Re-number the rows so positional access (0..n-1) stays valid even when
    # rows were dropped above.
    return tweets.apply(clean_data).reset_index(drop=True)


def perform_kmeans(tweet_ds, K, centroids, max_iterations=1000):
    """Cluster tweets around K centroid tweets using the Jaccard distance.

    Each round assigns every tweet to its closest centroid, prints the
    cluster sizes, and asks ``update_centroid`` for new centroids. The loop
    stops when the centroids no longer change; the sum of squared errors is
    then printed and returned.

    The rounds run in a plain loop rather than by recursion, so a slow
    convergence cannot exhaust the interpreter's recursion limit, and the
    data is shuffled only once per call.

    Args:
        tweet_ds: ``pandas.Series`` of tokenized tweets (lists of words).
        K: Number of clusters.
        centroids: Mapping of cluster id -> centroid tweet, or ``None`` to
            pick the first K distinct tweets of the shuffled data.
        max_iterations: Upper bound on assignment/update rounds.

    Returns:
        The SSE of the final clustering, as computed by ``compute_ss_error``.

    Raises:
        ValueError: If ``max_iterations`` is smaller than 1, or if the data
            holds fewer than K distinct tweets to use as initial centroids.
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    # Shuffle once so the initial centroids differ from call to call.
    tweet_ds = tweet_ds.sample(frac=1).reset_index(drop=True)

    if centroids is None:
        # Scan until K *distinct* tweets are found. Keys stay contiguous
        # (0..K-1) even when duplicates are skipped along the way.
        centroids = {}
        for tweet in tweet_ds:
            if tweet not in centroids.values():
                centroids[len(centroids)] = tweet
                if len(centroids) == K:
                    break
        if len(centroids) < K:
            raise ValueError(
                f"Need at least {K} distinct tweets to initialise {K} centroids, "
                f"found {len(centroids)}"
            )
    else:
        # Work on a copy so the caller's mapping is never modified.
        centroids = dict(centroids)

    for _ in range(max_iterations):
        # Create clusters and assign each tweet to its closest centroid.
        # Ties go to the centroid that comes first in the mapping.
        tweet_cluster = {cluster_id: [] for cluster_id in centroids}
        for tweet in tweet_ds:
            distances = {
                cluster_id: jaccard_distance(tweet, centroid)
                for cluster_id, centroid in centroids.items()
            }
            closest = min(distances, key=distances.get)
            tweet_cluster[closest].append(tweet)

        for cluster_id, members in tweet_cluster.items():
            print(f"Cluster {cluster_id} size: {len(members)}")

        # update_centroid omits clusters that received no tweets; those keep
        # their previous centroid so every cluster id stays defined.
        updated = update_centroid(tweet_cluster, K)
        new_centroids = {
            cluster_id: updated.get(cluster_id, centroid)
            for cluster_id, centroid in centroids.items()
        }

        # Check if centroids have changed (compared per cluster id).
        convergence = all(
            centroids[cluster_id] == new_centroids[cluster_id]
            for cluster_id in centroids
        )

        print("\nK = ", K)
        if convergence:
            print("Converged")
            sse_total = compute_ss_error(tweet_cluster, centroids)
            print("\nSSE = ", sse_total)
            return sse_total

        print("Processing.. ")
        centroids = new_centroids

    # Iteration budget exhausted: report the error of the last assignment,
    # which was made against the centroids in use before the final update.
    print(f"Stopped after {max_iterations} iterations without converging")
    sse_total = compute_ss_error(tweet_cluster, tweet_cluster and {
        cluster_id: tweet_cluster_centroid
        for cluster_id, tweet_cluster_centroid in centroids.items()
    })
    print("\nSSE = ", sse_total)
    return sse_total



# DONE
def update_centroid(tweet_cluster, K):
    """Pick, for every non-empty cluster, its most central tweet.

    The most central tweet is the member whose summed Jaccard distance to
    all members of the same cluster (itself included) is smallest. When
    several members tie, the one appearing first in the cluster wins.

    Args:
        tweet_cluster: Mapping of cluster id -> list of tweets in that cluster.
        K: Number of clusters (not used by the computation).

    Returns:
        A dict mapping cluster id -> chosen centroid tweet. Clusters without
        any tweets get no entry.
    """
    medoids = {}

    for cluster_id, members in tweet_cluster.items():
        # Nothing to choose from in an empty cluster.
        if not members:
            continue

        # Total distance from each candidate to every member of the cluster.
        totals = [
            sum([jaccard_distance(candidate, other) for other in members])
            for candidate in members
        ]

        # min() over positions returns the earliest position on ties.
        best_position = min(range(len(members)), key=totals.__getitem__)
        medoids[cluster_id] = members[best_position]

    return medoids


#DONE
def compute_ss_error(tweet_cluster, centroids):
    """Return the sum of squared Jaccard distances of tweets to their centroid.

    Args:
        tweet_cluster: Mapping of cluster id -> list of tweets in that cluster.
        centroids: Mapping of cluster id -> centroid tweet.

    Returns:
        The total of ``jaccard_distance(centroid, tweet) ** 2`` over every
        tweet of every cluster (0 when there are no tweets at all).
    """
    squared_distances = []
    for cluster_id, members in tweet_cluster.items():
        for tweet in members:
            # The centroid is looked up per tweet, so a cluster without
            # tweets never needs an entry in `centroids`.
            distance = jaccard_distance(centroids[cluster_id], tweet)
            squared_distances.append(distance ** 2)
    return sum(squared_distances)

#INPUTS
dataset = "/content/sample_data/foxnewshealth.txt"
# Numbers of clusters (K) to try, one k-means run per entry.
# NOTE(review): 7 is listed twice - confirm whether the repeated run is intended.
K_List = [3,4,5,6,7,7,8,9,10,15, 20, 25, 30, 70]

# Normalise the dataset file in place: strip leading/trailing whitespace from
# every line and end each line with exactly one newline. The file is read
# completely first and then rewritten; both handles are managed by `with`,
# so they are closed even if reading or writing fails.
with open(dataset, "r", encoding="ISO-8859-1") as f:
    lines = f.readlines()

#EXAMPLE
#585942799561928704|Wed Apr 08 23:10:24 +0000 2015|Injury prevention programs unpopular with high school coaches http://ow.ly/Lma9z

with open(dataset, "w", encoding="ISO-8859-1") as f:
    f.writelines(line.strip() + '\n' for line in lines)

cleaned_data = preprocess_data(dataset)

# Run the clustering once for every requested number of clusters.
for K in K_List:
    perform_kmeans(cleaned_data, K, centroids=None)

Cluster 0 size: 1553
Cluster 1 size: 354
Cluster 2 size: 93

K =  3
Processing.. 
Cluster 0 size: 1152
Cluster 1 size: 464
Cluster 2 size: 384

K =  3
Converged

SSE =  1745.2606050503643
Cluster 0 size: 970
Cluster 1 size: 59
Cluster 2 size: 392
Cluster 3 size: 579

K =  4
Processing.. 
Cluster 0 size: 784
Cluster 1 size: 477
Cluster 2 size: 256
Cluster 3 size: 483

K =  4
Processing.. 
Cluster 0 size: 804
Cluster 1 size: 405
Cluster 2 size: 251
Cluster 3 size: 540

K =  4
Converged

SSE =  1741.46669819799
Cluster 0 size: 886
Cluster 1 size: 85
Cluster 2 size: 291
Cluster 3 size: 598
Cluster 4 size: 140

K =  5
Processing.. 
Cluster 0 size: 566
Cluster 1 size: 581
Cluster 2 size: 489
Cluster 3 size: 223
Cluster 4 size: 141

K =  5
Processing.. 
Cluster 0 size: 660
Cluster 1 size: 598
Cluster 2 size: 375
Cluster 3 size: 221
Cluster 4 size: 146

K =  5
Converged

SSE =  1713.4183722584212
Cluster 0 size: 804
Cluster 1 size: 231
Cluster 2 size: 358
Cluster 3 size: 379
Cluster 4 size: 14

# Twitter K-Means Clustering

This project implements the K-means clustering algorithm to analyze and cluster tweets based on their textual content using the Jaccard distance metric.

## Prerequisites

Before you run this code, make sure you have Python installed on your machine. This code has been tested on Python 3.8 and above.

### Dependencies

The code depends on the following packages:
- `pandas`
- `numpy`
- `re` (part of the Python standard library, so no installation is necessary)

You can install the necessary packages using pip:

```bash
pip install pandas numpy
```

### How to run

```bash
python kmeans_clustering.py
```
