In [None]:
import re
import random
import nltk
import requests
from nltk.corpus import stopwords
import pandas as pd


# Make sure the NLTK stopword corpus is available locally, then keep the
# English entries in a set so the membership test in preprocess_tweet is cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Preprocessing Function
def preprocess_tweet(tweet):
    """Extract and normalise the text of one pipe-delimited tweet record.

    Args:
        tweet: One raw line of the input file. The tweet text is expected to
            be the third '|'-separated field.

    Returns:
        The tweet text with @mentions and URLs removed, '#' symbols stripped
        (the hashtag word itself is kept), lowercased, and with English
        stopwords dropped. Tokens are separated by single spaces; the result
        may be an empty string.
    """
    # Split on the first two pipes only, so a '|' inside the tweet text does
    # not truncate the content. Taking the last piece also means a malformed
    # line with fewer than three fields is processed instead of raising
    # IndexError.
    tweet_content = tweet.split('|', 2)[-1]

    # Remove @mentions, hashtag symbols, URLs, and convert to lowercase
    tweet_content = re.sub(r'@\w+', '', tweet_content)
    tweet_content = tweet_content.replace('#', '')
    tweet_content = re.sub(r'http\S+|www\S+|https\S+', '', tweet_content, flags=re.MULTILINE)
    tweet_content = tweet_content.lower()

    # Remove stopwords (split() with no argument also collapses repeated whitespace)
    tweet_content = ' '.join(word for word in tweet_content.split() if word not in stop_words)

    return tweet_content

# Jaccard Distance Function
def jaccard_distance(set1, set2):
    """Return the Jaccard distance 1 - |A & B| / |A | B| between two sets.

    Args:
        set1: First set of tokens.
        set2: Second set of tokens.

    Returns:
        A float in [0, 1]: 0.0 for identical sets, 1.0 for disjoint sets.
        Two empty sets are treated as identical (distance 0.0).
    """
    union = len(set1.union(set2))
    if union == 0:
        # Both sets are empty (e.g. a tweet that consisted only of stopwords
        # or a URL); without this guard the division below would raise
        # ZeroDivisionError.
        return 0.0
    intersection = len(set1.intersection(set2))
    return 1 - intersection / union

# K-means Clustering Function
def kmeans_clustering(tweets, k, max_iterations=100):
    """Cluster preprocessed tweets with a K-means style loop on Jaccard distance.

    Each centroid is a space-separated string of words. Initial centroids are
    k tweets drawn at random; after every assignment pass a cluster's centroid
    becomes the union of all words appearing in its tweets. An empty cluster
    is re-seeded with a randomly chosen tweet.

    Args:
        tweets: List of preprocessed tweet strings (whitespace-separated tokens).
        k: Number of clusters. random.sample raises ValueError if k is larger
            than len(tweets).
        max_iterations: Upper bound on assignment/update passes. Random
            re-seeding of empty clusters can keep the centroids changing
            forever, so the loop is capped to guarantee termination.

    Returns:
        A tuple (clusters, centroids): clusters is a list of k lists of tweet
        strings, centroids is the list of k centroid strings.
    """
    # Initialize centroids randomly
    centroids = random.sample(tweets, k)

    # Tokenise every tweet once instead of on every distance computation.
    tweet_tokens = [set(tweet.split()) for tweet in tweets]
    clusters = [[] for _ in range(k)]

    for _ in range(max_iterations):
        centroid_tokens = [set(centroid.split()) for centroid in centroids]

        # Assignment step: each tweet joins the cluster of its nearest
        # centroid (ties go to the lowest cluster index).
        clusters = [[] for _ in range(k)]
        for tweet, tokens in zip(tweets, tweet_tokens):
            distances = [jaccard_distance(tokens, centroid) for centroid in centroid_tokens]
            clusters[distances.index(min(distances))].append(tweet)

        # Update step
        new_centroids = []
        for cluster in clusters:
            if not cluster:
                # Reinitialize centroid if cluster is empty
                new_centroids.append(random.choice(tweets))
            else:
                words = {word for tweet in cluster for word in tweet.split()}
                # Sort the words so the same word set always yields the same
                # string; otherwise the convergence check below would depend
                # on set iteration order.
                new_centroids.append(' '.join(sorted(words)))

        if new_centroids == centroids:
            break
        centroids = new_centroids

    # `clusters` always holds the most recent assignment, so it is never None,
    # even when the centroids converge on the very first pass.
    return clusters, centroids

# Sum of Squared Errors Calculation
def calculate_sse(clusters, centroids):
    """Return the sum of squared Jaccard distances of tweets to their centroid.

    Args:
        clusters: List of clusters, each a list of tweet strings.
        centroids: List of centroid strings; centroids[i] belongs to clusters[i].

    Returns:
        The accumulated squared distance over every tweet in every cluster
        (0 when there are no tweets).
    """
    total = 0
    for index, members in enumerate(clusters):
        reference_words = set(centroids[index].split())
        squared_distances = [
            jaccard_distance(set(member.split()), reference_words) ** 2
            for member in members
        ]
        # Accumulate one term at a time, in tweet order.
        for value in squared_distances:
            total += value
    return total

# Load and Preprocess Tweets
file_url = 'https://raw.githubusercontent.com/sreeharsha5219/ML_Assignment/main/usnewshealth.txt'
# Without a timeout, requests waits indefinitely on an unresponsive host.
response = requests.get(file_url, timeout=30)

if response.status_code == 200:
    # One tweet record per non-blank line.
    tweets = [preprocess_tweet(line.strip()) for line in response.text.split('\n') if line.strip()]
else:
    print("Failed to retrieve the file. Status code:", response.status_code)
    tweets = []

sse_values = []
cluster_sizes_all_k = []

# Clustering and Results
requested_k_values = [5, 10, 15, 20, 25, 30]  # Example K values
# kmeans_clustering seeds its centroids with random.sample(tweets, k), which
# raises ValueError when k exceeds the number of tweets (always the case after
# a failed download). Keep only the usable values so k_values stays aligned
# with sse_values and cluster_sizes_all_k.
k_values = [k for k in requested_k_values if k <= len(tweets)]
skipped_k_values = [k for k in requested_k_values if k > len(tweets)]
if skipped_k_values:
    print(f"Skipping K values {skipped_k_values}: only {len(tweets)} tweets available")

results = []
for k in k_values:
    clusters, centroids = kmeans_clustering(tweets, k)
    sse = calculate_sse(clusters, centroids)
    cluster_sizes = [len(cluster) for cluster in clusters]
    sse_values.append(sse)
    cluster_sizes_all_k.append(cluster_sizes)
    results.append({
        'Value of K': k,
        'SSE': sse,
        'Size of each cluster': ', '.join(f'{i+1}: {size} tweets' for i, size in enumerate(cluster_sizes)),
    })

# One row per K: the SSE and a human-readable list of cluster sizes.
df = pd.DataFrame(results)
print(df.to_string(index=False))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
ow

In [None]:
# Persist the per-K summary. index=False leaves out the auto-generated row
# index, which would otherwise be written as an unnamed first column; this
# matches how the table is printed.
df.to_csv('results.csv', index=False)


In [None]:
import matplotlib.pyplot as plt


# Elbow plot: SSE against the number of clusters
_, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(k_values, sse_values, marker='o')
elbow_ax.set_title('Elbow Plot for K-Means Clustering')
elbow_ax.set_xlabel('Number of clusters (k)')
elbow_ax.set_ylabel('Sum of Squared Errors (SSE)')
elbow_ax.grid(True)
plt.show()

# One bar chart per k showing how many tweets landed in each cluster
for position, k in enumerate(k_values):
    cluster_numbers = range(1, k + 1)  # clusters are labelled 1..k on the x axis
    _, sizes_ax = plt.subplots(figsize=(20, 20))
    sizes_ax.bar(cluster_numbers, cluster_sizes_all_k[position])
    sizes_ax.set_title(f'Cluster Size Distribution for K = {k}')
    sizes_ax.set_xlabel('Cluster Number')
    sizes_ax.set_ylabel('Number of Tweets in Cluster')
    sizes_ax.set_xticks(cluster_numbers)
    sizes_ax.grid(True)
    plt.show()