### 🚀 Task 1: Load the Dataset

In [5]:
import pandas as pd

# Load the dataset
file_path = "NewsCategorizer.xlsx"

# Read the required columns from the Excel file
df = pd.read_excel(file_path, usecols=["category", "short_description"])

# Display first few rows to verify
df.head()


Unnamed: 0,category,short_description
0,WELLNESS,Resting is part of training. I've confirmed wh...
1,WELLNESS,Think of talking to yourself as a tool to coac...
2,WELLNESS,The clock is ticking for the United States to ...
3,WELLNESS,"If you want to be busy, keep trying to be perf..."
4,WELLNESS,"First, the bad news: Soda bread, corned beef a..."


### 🚀 Task 2: Preprocessing for Clustering

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("wordnet")

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    if pd.isnull(text):  # Handle missing values
        return ""
    
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    words = text.split()  # Tokenization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization & Stopword removal
    
    return " ".join(words)  # Return cleaned text as a string

# Apply preprocessing to the short descriptions
df["processed_description"] = df["short_description"].apply(preprocess_text)

# Display the processed dataset
df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rajubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rajubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,category,short_description,processed_description
0,WELLNESS,Resting is part of training. I've confirmed wh...,resting part training ive confirmed sort alrea...
1,WELLNESS,Think of talking to yourself as a tool to coac...,think talking tool coach challenge narrate exp...
2,WELLNESS,The clock is ticking for the United States to ...,clock ticking united state find cure team work...
3,WELLNESS,"If you want to be busy, keep trying to be perf...",want busy keep trying perfect want happy focus...
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...",first bad news soda bread corned beef beer hig...


### 🚀 Task 3: Train LDA Topic Model

In [9]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from tqdm import tqdm

# Tokenize the preprocessed text
df["tokenized_text"] = df["processed_description"].apply(lambda x: x.split())

# Create dictionary and corpus
dictionary = Dictionary(df["tokenized_text"])
corpus = [dictionary.doc2bow(text) for text in df["tokenized_text"]]

# Train LDA model with K=10 topics, 200 iterations
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, iterations=200, passes=10)

# Display the top words for each topic
for i, topic in lda_model.show_topics(num_topics=10, formatted=False):
    print(f"Topic {i}: {[word for word, prob in topic]}")


Topic 0: ['he', 'twitter', 'want', 'check', 'baseball', 'win', 'video', 'guy', 'pretty', 'sure']
Topic 1: ['like', 'one', 'get', 'time', 'make', 'thing', 'way', 'dont', 'know', 'u']
Topic 2: ['game', 'former', 'team', 'final', 'police', 'kid', 'field', 'north', 'quarterback', 'corporate']
Topic 3: ['sport', 'player', 'nfl', 'state', 'fan', 'leader', 'president', 'said', 'country', 'government']
Topic 4: ['star', 'customer', 'sunday', 'baby', 'board', 'party', 'score', 'round', 'paid', 'weekend']
Topic 5: ['state', 'u', 'economy', 'american', 'people', 'bank', 'woman', 'world', 'organization', 'new']
Topic 6: ['football', 'business', 'need', 'bowl', 'play', 'super', 'worker', 'street', 'opportunity', 'role']
Topic 7: ['year', 'time', 'first', 'last', 'week', 'day', 'two', 'month', 'one', 'three']
Topic 8: ['new', 'city', 'york', 'parent', 'company', 'gold', 'third', 'region', 'facebook', 'model']
Topic 9: ['game', 'coach', 'night', 'olympic', 'nba', 'first', 'girl', 'hit', 'series', 'ce

### 🚀 Task 4: K-Means Clustering with TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Convert text into TF-IDF vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["processed_description"])

# Apply K-Means clustering (K=10)
kmeans = KMeans(n_clusters=10, random_state=42, n_init=10)
df["kmeans_cluster"] = kmeans.fit_predict(tfidf_matrix)

# Display clustered data
df.head()


Unnamed: 0,category,short_description,processed_description,tokenized_text,kmeans_cluster
0,WELLNESS,Resting is part of training. I've confirmed wh...,resting part training ive confirmed sort alrea...,"[resting, part, training, ive, confirmed, sort...",3
1,WELLNESS,Think of talking to yourself as a tool to coac...,think talking tool coach challenge narrate exp...,"[think, talking, tool, coach, challenge, narra...",7
2,WELLNESS,The clock is ticking for the United States to ...,clock ticking united state find cure team work...,"[clock, ticking, united, state, find, cure, te...",7
3,WELLNESS,"If you want to be busy, keep trying to be perf...",want busy keep trying perfect want happy focus...,"[want, busy, keep, trying, perfect, want, happ...",7
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...",first bad news soda bread corned beef beer hig...,"[first, bad, news, soda, bread, corned, beef, ...",7


### 🚀 Task 5: Compare Clustering Results

In [11]:
from sklearn.metrics import adjusted_rand_score

# Convert category labels into numerical form
df["category_encoded"] = pd.factorize(df["category"])[0]

# Compute similarity score between actual categories and K-Means clusters
kmeans_similarity = adjusted_rand_score(df["category_encoded"], df["kmeans_cluster"])
print(f"K-Means vs True Categories Similarity Score: {kmeans_similarity:.4f}")

# Display data to analyze LDA topics vs categories
df[["category", "kmeans_cluster"]]


K-Means vs True Categories Similarity Score: 0.0066


Unnamed: 0,category,kmeans_cluster
0,WELLNESS,3
1,WELLNESS,7
2,WELLNESS,7
3,WELLNESS,7
4,WELLNESS,7
...,...,...
49995,SPORTS,7
49996,SPORTS,7
49997,SPORTS,7
49998,SPORTS,4
