In [1]:
import re
import pandas as pd
import numpy as np
import random


#DONE
def data_cleaning(text):
    # Split the text into words and process each word
    words = text.split()
    cleaned_words = [
        re.sub(r"^#(.+)", lambda m: m.group(1).lower(), word) if word.startswith('#')
        else word.lower().strip("'")
        for word in words if not word.startswith('@') and not word.startswith('http://')
    ]
    return cleaned_words

#DONE
def jaccard_distance(set1,set2):
    set1 = set(set1)
    set2 = set(set2)
    # intersection of two sets
    intersection = len(set1.intersection(set2))
    # Unions of two sets
    union = len(set1.union(set2))
    return 1 - intersection / union

#DONE
def preprocess_data(source_file):
    data_frame = pd.read_csv(source_file, encoding="ISO-8859-1", header=None, sep="|")

    # Randomly shuffle the dataframe rows
    shuffled_data = data_frame.sample(frac=1).reset_index(drop=True)

    # Retain only the relevant column (assuming the third column contains the tweet text)
    tweets = shuffled_data.iloc[:, 2]

    cleaned_tweets = tweets.apply(data_cleaning)
    final_data = cleaned_tweets.dropna()

    return final_data


def perform_kmeans(tweet_ds,K,centroids=None):
    tweet_ds = tweet_ds.sample(frac=1).reset_index(drop=True)
    #Initializing the Centroids for the the first Iteration
    if centroids==None:
        centroids = {}
        for i in range(K):
            if(tweet_ds[i] not in list(centroids.keys())):
                centroids[i] = tweet_ds[i]
                #print("centroid {} is {}".format(i,centroids[i]))
    tweet_cluster = {i:[] for i in range(K)}
    #Assignment step : Clustering the tweets to the centroids
    for tweet in tweet_ds:
        tweet_distance = [jaccard_distance(tweet,centroids[c]) for c in centroids]
        min_distance = tweet_distance.index(min(tweet_distance))
        tweet_cluster[min_distance].append(tweet)
    new_centroid = update_centroid(tweet_cluster,K)
    converge = False
    centroids_tweet = list(centroids.values())
    new_centroids_tweet = list(new_centroid.values())
    #Converging check - Check if the old_centroid and updated new centroids are equal
    for i in range(K):
        if(centroids_tweet[i] != new_centroids_tweet[i]):
            converge = False
            break
        else:
            converge = True
    if converge == False:
        print("Not Converged...Recomputing the Centroid")
        centroids = new_centroid.copy()
        perform_kmeans(tweet_ds,K,centroids)
    else:
        print("Converge Succeed")
        sse_total = compute_ss_error(tweet_cluster,centroids)
        print("\nThe Sum of Squared Error is ",sse_total)
        #for i in range(K):
            #print("\nThe number of tweets in the cluster {0} is {1} ".format(i+1,len(tweet_cluster[i])))

            #print("\n{0} : {1} ".format(i+1,len(tweet_cluster[i])))

#DONE
def update_centroid(tweet_cluster, K):
    updated_centroid = {}

    for cluster_id, cluster_tweets in tweet_cluster.items():
        if not cluster_tweets:  # Skip empty clusters
            continue

        inter_cluster_dist = []

        for tweet in cluster_tweets:
            tweet_distance = [jaccard_distance(tweet, c) for c in cluster_tweets]
            inter_total_dist = sum(tweet_distance)
            inter_cluster_dist.append(inter_total_dist)

        cluster_tweet_index = inter_cluster_dist.index(min(inter_cluster_dist))
        updated_centroid[cluster_id] = cluster_tweets[cluster_tweet_index]

    return updated_centroid

#DONE
def compute_ss_error(tweet_cluster, centroids):
    return sum(jaccard_distance(centroids[centroid_id], tweet)**2
               for centroid_id, tweets in tweet_cluster.items()
               for tweet in tweets)

#INPUTS
dataset = "/content/sample_data/foxnewshealth.txt"
K_List = [15, 20, 25, 30, 70] #cluster sizes

#Processing the dataset and removing the delimiters
f = open(dataset,"r+", encoding="ISO-8859-1")
line =f.readlines()
f.truncate(0)
f.seek(0)
for i in range (0,len(line)):
    if(line[i].count('|') != 2):
        line_split = line[i].split('|')
        new_line = '|'.join(line_split[:3]) + ' ' + ' '.join(line_split[3:])
        line[i] = new_line
for j in range(0,len(line)):
    l = line[j].strip()
    f.write(line[j])
f.close()
cleaned_data = preprocess_data(dataset)

for K in K_List:
  perform_kmeans(cleaned_data,K,centroids = None)

KeyError: 1