In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from scipy.sparse import hstack, csr_matrix
from sklearn.cluster import HDBSCAN

from sklearn.cluster import KMeans
from sklearn.metrics import make_scorer, silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import pos_tag
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOP_WORDS

from mpl_toolkits.mplot3d import Axes3D # Required for 3D projection

import gensim
from gensim.models import Word2Vec
from gensim import corpora
from gensim.models import LsiModel, LdaModel
from gensim.models.coherencemodel import CoherenceModel

import warnings
# Suppress every library warning for the remainder of the session.
warnings.filterwarnings('ignore')

# NLTK resources live in the project-local ./data folder: register that folder
# on NLTK's search path, then download each required resource into it.
nltk.data.path.append("./data")
for nltk_resource in (
    'punkt',
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    'stopwords',
    'wordnet',
):
    nltk.download(nltk_resource, download_dir="./data")

import text_mining_utils as tmu

In [None]:
# --- Load and prepare the scraped articles ---
# Produces `df` with one row per usable article; `df['Content']` holds the
# title followed by the article body.
DATA_PATH = "./data/spunout_data.csv"

print("Loading data...")
try:
    # Only the read itself can raise FileNotFoundError, so only it is guarded.
    df = pd.read_csv(DATA_PATH, encoding='utf-8-sig')
except FileNotFoundError:
    print(f"Error: File not found at '{DATA_PATH}'. Please run the scraper first.")
    # Re-raise instead of calling exit(): a visible traceback stops "Run All"
    # at this cell without tearing down the notebook session.
    raise

# Ensure titles are strings; a missing title becomes an empty string.
df['Title'] = df['Title'].fillna('').astype(str)

# Drop rows where content extraction failed: either the literal 'N/A' marker
# written by the scraper, or a genuinely missing value.
df = df[df['Content'] != 'N/A']
df = df.dropna(subset=['Content'])

# Prepend the title to the body. The separating space stops the last word of
# the title from merging with the first word of the body. Content is coerced
# to str so a non-text cell cannot break the concatenation.
df['Content'] = df['Title'] + " " + df['Content'].astype(str)

print(f"Successfully loaded {len(df)} articles.")

In [None]:
# Base Standard English Stopwords
stop_words = set(stopwords.words('english'))

# clean_text() deletes apostrophes before tokenising, so a stopword written
# with an apostrophe (e.g. "you're", "doesn't") can never match the token it
# produces ("youre", "doesnt"). Add the apostrophe-free spelling of every such
# entry. A few stripped forms collide with ordinary words that matter in
# health content (e.g. "i'll" -> "ill"); those are deliberately NOT added.
_AMBIGUOUS_CONTRACTIONS = {'ill', 'hell', 'shed', 'shell', 'wed'}
stop_words |= {
    word.replace("'", "") for word in stop_words if "'" in word
} - _AMBIGUOUS_CONTRACTIONS

# Your Domain-Specific Stopwords (Crucial for filtering "Common Words")
domain_stopwords = [
    # Site & Web specific
    'spunout', 'spun', 'out', 'ie', 'ireland', 'irish', 'www', 'http', 'https', 'com',
    'copyright', 'privacy', 'policy', 'terms', 'conditions', 'login', 'sign', 'register',

    # Scraping / HTML Artifacts
    'page', 'section', 'footer', 'header', 'sidebar', 'widget', 'nav', 'advertisement', 'ad',
    'promo', 'cookie', 'script', 'javascript', 'css', 'html', 'body', 'main', 'published', 'updated',
    'author', 'post', 'article', 'url', 'permalink',

    # Generic Advice / High Frequency Verbs (Noise for LDA)
    'day', 'new', 'good', 'bad',
    'check', 'try', 'keep',
    'like', 'just', 'get', 'also', 'would', 'could', 'one', 'make', 'use', 'way', 'well',
    'time', 'know', 'need', 'really', 'thing', 'think', 'much', 'even', 'still', 'another',
    'every', 'go', 'want', 'take', 'find', 'look', 'come', 'year', 'old', 'may', 'might',

    # Interaction / Navigation
    'click', 'read', 'link', 'menu', 'comment', 'reply',

    # Text Slang / Filler
    'u', 'ur', 'im', 'dont', 'cant', 'wont', 'oh', 'ok', 'please', 'thanks', 'thank', 'yes', 'no'
]

# Merge them into the single set used for filtering
all_stopwords = stop_words.union(domain_stopwords)

# --- 2. SETUP LEMMATIZER & TAGGING ---

# One shared lemmatizer instance, reused for every token
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """
    Translate a Treebank-style POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # Unrecognised tag: treat the word as a noun.
    return wordnet.NOUN

# ADVANCED CLEANING FUNCTION 

def clean_text(text):
    """
    Turn one raw document into a list of lemmatised content tokens.

    Pipeline: lowercase -> strip URLs and HTML tags -> turn separators and all
    remaining non-letters into spaces -> whitespace-tokenise -> POS-tag ->
    lemmatise -> drop stopwords and tokens of two characters or fewer.

    Args:
        text: The raw document. It is coerced with str(), so any value is accepted.

    Returns:
        list[str]: The surviving lemmas, in document order (possibly empty).
    """
    # 1. Lowercase
    text = str(text).lower()

    # 2. Remove URLs and HTML tags
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'<.*?>', ' ', text)

    # 3. Turn separator runs (underscores, hyphens, dots) and slashes into spaces
    text = re.sub(r'[_\-.]+', ' ', text)
    text = re.sub(r'[/\\]', ' ', text)

    # 4. Delete apostrophes so contractions stay one token (don't -> dont).
    #    Typographic apostrophes and backticks are included so that "don’t"
    #    is treated exactly like "don't".
    text = re.sub(r"['’‘`]", "", text)

    # 5. Replace every remaining non-letter with a SPACE. Deleting it outright
    #    would fuse neighbouring words ("health,wellbeing" -> "healthwellbeing").
    text = re.sub(r'[^a-z\s]', ' ', text)

    # 6. Blank out single-letter tokens. Word boundaries also catch runs of
    #    single letters and ones at the very start or end of the text.
    text = re.sub(r'\b[a-z]\b', ' ', text)

    # 7. Tokenize on whitespace
    words = text.split()

    # 8. POS tagging + lemmatization
    # (The POS tag lets "running" become "run" while "runner" stays "runner")
    tagged_words = pos_tag(words)

    # 9. Remove stopwords and short tokens.
    # A token is dropped when EITHER its surface form or its lemma is a
    # stopword: the stopword list holds inflected entries such as 'terms' and
    # 'published', which a lemma-only check ('term', 'publish') lets through.
    filtered_words = []
    for word, tag in tagged_words:
        if word in all_stopwords:
            continue
        lemma = lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        if len(lemma) > 2 and lemma not in all_stopwords:
            filtered_words.append(lemma)

    return filtered_words

# --- Tokenise every article ---

print(f"Cleaning {len(df)} documents (this may take a moment due to POS tagging)...")

# clean_text yields one token list per article, giving a list of lists such as
# [['word', 'noun'], ['another', 'list']]. Articles that end up with no tokens
# at all are left out.
texts = [tokens for tokens in df['Content'].apply(clean_text) if tokens]
print(f"Valid documents after cleaning: {len(texts)}")


# --- Build the vocabulary and prune it ---

dictionary = corpora.Dictionary(texts)

# Frequency-based pruning of the vocabulary:
#   no_below=5   -> keep a token only if it occurs in at least 5 documents
#   no_above=0.4 -> drop a token that occurs in more than 40% of documents
dictionary.filter_extremes(no_below=5, no_above=0.4)

print(f"Dictionary size after filtering: {len(dictionary)} unique tokens.")

# --- Bag-of-words corpus: one doc2bow vector per cleaned document ---
doc_term_matrix = list(map(dictionary.doc2bow, texts))

print("Preprocessing complete. Ready for LDA.")

In [None]:
# Sweep over candidate topic counts: train one LDA model per count and record
# its c_v coherence, so the best count can be picked afterwards.
coher_vals2 = []
lda_models = []

# Candidate topic counts are start, start+step, ..., up to and including limit.
start = 2
limit = 15  # Using 15 instead of 25 for speed in this demo
step = 1

# Settings shared by every model in the sweep; only num_topics varies.
lda_params = dict(
    corpus=doc_term_matrix,
    id2word=dictionary,
    chunksize=2000,
    passes=30,
    iterations=500,
    random_state=43,
    update_every=1,
    alpha='auto',
    eta='auto',
)

for num_topics in range(start, limit + 1, step):
    # Train LDA for this topic count and keep the fitted model
    lda_model = LdaModel(num_topics=num_topics, **lda_params)
    lda_models.append(lda_model)

    # Score the model with c_v coherence against the tokenised texts
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    score = coherence_model.get_coherence()
    coher_vals2.append(score)
    print(f"Topics: {num_topics} | Coherence Score: {score:.4f}")

In [None]:
import os

# Plot coherence score against the number of topics and save the figure.
x = range(start, limit+1, step)
plt.figure(figsize=(8,5))
plt.plot(x, coher_vals2, marker='o')
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score (c_v)")
# plt.title("Optimal Number of Topics")
plt.grid(True)

# Output location for the saved figure; change folder_name to save elsewhere.
folder_name = "./practical_assessment_adsah_6014_2_web_content_mining/images/"
file_name = "optimal_number_of_topics.png"

# Create the folder (and any missing parents). exist_ok=True makes this a
# no-op when the folder is already there, replacing the separate
# exists-then-create check.
os.makedirs(folder_name, exist_ok=True)

# Combine them into a full path (handles Windows/Mac differences automatically)
full_path = os.path.join(folder_name, file_name)

# Save before plt.show() so the figure written to disk is the one drawn above.
plt.savefig(full_path, dpi=300, bbox_inches='tight')

print(f"\nPlot successfully saved to: {full_path}")
plt.show()

In [None]:
# Pick the topic count with the highest coherence score.
print("\nApplying Best Model ---")

if not coher_vals2:
    raise ValueError("No coherence scores available - run the topic sweep cell first.")

# Position of the (first) maximum coherence score in the sweep results
optimal_num_topics_index = coher_vals2.index(max(coher_vals2))

# The sweep visited start, start+step, ..., so the topic count at a given
# position follows directly from the sweep parameters. Computing it here means
# this cell does not rely on `x`, which only exists once the plotting cell has
# been run.
optimal_num_topics = start + optimal_num_topics_index * step
print(f"Optimal Number of Topics: {optimal_num_topics}")

In [None]:
# The winning model sits at the same position in lda_models as its score
# does in the coherence list.
best_lda_model = lda_models[optimal_num_topics_index]

# Show each topic's ten highest-weighted words.
# print_topics returns (topic id, formatted "weight*word + ..." string) pairs.
print(f"\nTop 10 words for each of the {optimal_num_topics} topics:")
topics = best_lda_model.print_topics(num_words=10)
for topic_id, topic_content in topics:
    topic_line = f"Topic {topic_id}: {topic_content}"
    print(topic_line)